diff options
author | jwansek <eddie.atten.ea29@gmail.com> | 2023-05-16 14:11:51 +0100 |
---|---|---|
committer | jwansek <eddie.atten.ea29@gmail.com> | 2023-05-16 14:11:51 +0100 |
commit | 953dc5ef8652a0b7e8ae2c7db3535f1bd157cdd4 (patch) | |
tree | 08a4c9cd4f10a12927f7c57056617672369ed9b6 /src/insinuations.py | |
parent | 0faf95d56815d310290d3533d81d888deb7731f0 (diff) | |
download | UKGenderPayGap-953dc5ef8652a0b7e8ae2c7db3535f1bd157cdd4.tar.gz UKGenderPayGap-953dc5ef8652a0b7e8ae2c7db3535f1bd157cdd4.zip |
Added alt text, docker
Diffstat (limited to 'src/insinuations.py')
-rw-r--r-- | src/insinuations.py | 86 |
1 files changed, 86 insertions, 0 deletions
"""Scrape Companies House pages for SIC codes and basic company details."""

import datetime
import os

import requests
from lxml import html

import database

# Seconds to wait for Companies House to answer before giving up; without a
# timeout ``requests`` blocks indefinitely on a stalled connection.
REQUEST_TIMEOUT_SECONDS = 30

# The company-information request is sent with a browser-like user agent.
_USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_5_8) AppleWebKit/534.50.2 (KHTML, like Gecko) Version/5.0.6 Safari/533.22.3'


def get_sics(db: database.PayGapDatabase, url = "https://resources.companieshouse.gov.uk/sic/"):
    """Download the SIC code table and store its sections and codes in ``db``.

    Each row of the table found at ``/html/body/main/table/tbody`` has two
    cells.  A row whose first cell contains a child element is treated as a
    section heading and passed to ``db.append_sic_sections``; any other row
    is treated as a SIC code belonging to the most recent section and passed
    to ``db.append_sic``.

    Args:
        db: Database wrapper providing ``append_sic_sections`` and
            ``append_sic``.
        url: Page holding the SIC code table.

    Raises:
        ValueError: If a SIC code row appears before any section heading.
    """
    req = requests.get(url, timeout = REQUEST_TIMEOUT_SECONDS)
    tree = html.fromstring(req.content.decode())
    bigtable = tree.xpath("/html/body/main/table/tbody")[0]

    current_section_code = None
    # Iterating an lxml element yields its children (``getchildren()`` is
    # deprecated).
    for tr_elem in bigtable:
        td_code, td_description = tr_elem

        if len(td_code) > 0:
            # if contains a <strong> element which indicates a section
            current_section_code = td_code[0].text.replace("Section ", "").strip()
            current_section_description = td_description[0].text.strip()

            db.append_sic_sections(current_section_code, current_section_description)

        else:
            sic_code = int(td_code.text)
            sic_desc = td_description.text.rstrip()
            if current_section_code is None:
                raise ValueError("SIC code %d appears before any section heading" % sic_code)
            db.append_sic(sic_code, sic_desc, current_section_code)


def get_companyinfo_url(company_number, url = "https://find-and-update.company-information.service.gov.uk/company/%s"):
    """Return the company-information URL for ``company_number``.

    Company numbers made up only of digits are zero-padded to eight
    characters; any other value is substituted into ``url`` unchanged.

    Args:
        company_number: Company number as a string.
        url: ``%``-style template with one ``%s`` placeholder.
    """
    if company_number.isdigit():
        company_number = "%08d" % int(company_number)

    return url % company_number


def _single_text(tree, element_id):
    """Return the stripped text of the only element with ``element_id``, else ``None``."""
    matches = tree.xpath('//*[@id="%s"]' % element_id)
    if len(matches) != 1:
        return None
    return matches[0].text.strip()


def lookup_company(company_number):
    """Scrape the company-information page for ``company_number``.

    Args:
        company_number: Company number as a string.

    Returns:
        A dict with the keys ``"status"`` and ``"type_"`` (strings or
        ``None``), ``"incorporated"`` (a ``datetime.datetime`` or ``None``)
        and ``"sics"`` (a set of integer SIC codes, possibly empty).  A field
        is ``None`` when the page does not contain exactly one matching
        element, which is also what happens for a 404 response.

    Raises:
        ConnectionError: If the response status is neither 200 nor 404.
    """
    req = requests.get(
        get_companyinfo_url(company_number),
        headers = {'User-Agent': _USER_AGENT},
        timeout = REQUEST_TIMEOUT_SECONDS,
    )

    if req.status_code not in [200, 404]:
        raise ConnectionError("Couldn't connect- it %d'd. \nWas looking for company %s" % (req.status_code, company_number))

    tree = html.fromstring(req.content.decode())

    company = {}
    company["status"] = _single_text(tree, "company-status")

    incorporated = _single_text(tree, "company-creation-date")
    if incorporated is None:
        company["incorporated"] = None
    else:
        company["incorporated"] = datetime.datetime.strptime(incorporated, "%d %B %Y")

    company["type_"] = _single_text(tree, "company-type")

    # SIC entries carry the ids sic0, sic1, ...; stop at the first gap.
    company["sics"] = set()
    for i in range(9):
        sic_text = _single_text(tree, "sic%d" % i)
        if sic_text is None:
            break
        # Entry text looks like "<code> - <description>"; keep the code only.
        company["sics"].add(int(sic_text.split(" - ")[0]))

    return company


if __name__ == "__main__":
    # Without a ".docker" marker file, load settings from db.env and use a
    # local database host; otherwise use the "db" host name.
    if not os.path.exists(".docker"):
        import dotenv
        dotenv.load_dotenv(dotenv_path = "db.env")
        host = "localhost"
    else:
        host = "db"

    # with database.PayGapDatabase(host = host) as db:
    #     print(db.search_company("University"))

    import app
    counties = [feature["properties"]["name"] for feature in app.UK_GEOJSON["features"]]
    print(counties)
    print(len(counties))
\ No newline at end of file |