diff options
-rw-r--r-- | .gitignore | 2 | ||||
-rw-r--r-- | app.py | 14 | ||||
-rw-r--r-- | database.py | 206 | ||||
-rw-r--r-- | db.env.example | 1 | ||||
-rw-r--r-- | insinuations.py | 82 | ||||
-rw-r--r-- | parser.py | 81 | ||||
-rw-r--r-- | requirements.txt | 4 | ||||
-rw-r--r-- | static/scripts.js | 1 | ||||
-rw-r--r-- | static/style.css | 42 | ||||
-rw-r--r-- | templates/index.html.j2 | 4 | ||||
-rw-r--r-- | templates/template.html.j2 | 44 |
11 files changed, 481 insertions, 0 deletions
@@ -1,3 +1,5 @@ +db.env + # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] @@ -0,0 +1,14 @@ +import database +import flask + +app = flask.Flask(__name__) + +@app.route("/") +def serve_index(): + return flask.render_template( + "index.html.j2", + title = "UK Gender Pay Gap" + ) + +if __name__ == "__main__": + app.run("0.0.0.0", port = 5005, debug = True)
\ No newline at end of file diff --git a/database.py b/database.py new file mode 100644 index 0000000..26d2683 --- /dev/null +++ b/database.py @@ -0,0 +1,206 @@ +from dataclasses import dataclass +import datetime +import pymysql +import os + +@dataclass +class PayGapDatabase: + + host: str = "db" + user: str = "root" + passwd: str = None + db: str = "paygap" + port: int = 3306 + + def __enter__(self): + if self.passwd is None: + self.passwd = os.environ["MYSQL_ROOT_PASSWORD"] + + try: + self.__connection = self.__get_connection() + except Exception as e: + print(e) + if e.args[0] == 1049: + self.__connection = self.__build_db() + return self + + def __exit__(self, type, value, traceback): + self.__connection.close() + + def __get_connection(self): + return pymysql.connect( + host = self.host, + port = self.port, + user = self.user, + passwd = self.passwd, + charset = "utf8mb4", + database = self.db + ) + + def __build_db(self): + print("Building database...") + self.__connection = pymysql.connect( + host = self.host, + port = self.port, + user = self.user, + passwd = self.passwd, + charset = "utf8mb4", + ) + with self.__connection.cursor() as cursor: + # unsafe: + cursor.execute("CREATE DATABASE %s" % self.db) + cursor.execute("USE %s" % self.db) + + cursor.execute(""" + CREATE TABLE sic_sections( + sic_section_id CHAR(1) NOT NULL PRIMARY KEY, + sic_section_name VARCHAR(128) NOT NULL + ); + """) + + cursor.execute(""" + CREATE TABLE sic( + sic_code INT UNSIGNED NOT NULL PRIMARY KEY, + sic_description VARCHAR(512) NOT NULL, + sic_section CHAR(1) NOT NULL, + FOREIGN KEY (sic_section) REFERENCES sic_sections(sic_section_id) + ); + """) + + cursor.execute(""" + CREATE TABLE employer( + company_number CHAR(8) NOT NULL PRIMARY KEY, + name VARCHAR(512) NOT NULL, + address TEXT NOT NULL, + postcode VARCHAR(8) NOT NULL, + policy_link VARCHAR(256) NULL, + responsible_person VARCHAR(128) NOT NULL, + size VARCHAR(20) NOT NULL, + current_name VARCHAR(512) NULL, + status VARCHAR(32) NULL, + type_ VARCHAR(128) NULL, + incorporated DATETIME NULL + ) + """) + + cursor.execute(""" + CREATE TABLE employer_sic( + company_number CHAR(8) NOT NULL, + sic_code INT UNSIGNED NOT NULL, + PRIMARY KEY (company_number, sic_code), + FOREIGN KEY (company_number) REFERENCES employer(company_number), + FOREIGN KEY (sic_code) REFERENCES sic(sic_code) + ); + """) + + cursor.execute(""" + CREATE TABLE pay( + pay_id INT UNSIGNED NOT NULL AUTO_INCREMENT PRIMARY KEY, + company_number CHAR(8) NOT NULL, + source VARCHAR(64) NOT NULL, + date_submitted DATETIME NOT NULL, + DiffMeanHourlyPercent DECIMAL(8,3) NOT NULL, + DiffMedianHourlyPercent DECIMAL(8,3) NOT NULL, + DiffMeanBonusPercent DECIMAL(8,3) NOT NULL, + DiffMedianBonusPercent DECIMAL(8,3) NOT NULL, + MaleBonusPercent DECIMAL(8,3) NOT NULL, + FemaleBonusPercent DECIMAL(8,3) NOT NULL, + MaleLowerQuartile DECIMAL(8,3) NOT NULL, + FemaleLowerQuartile DECIMAL(8,3) NOT NULL, + MaleLowerMiddleQuartile DECIMAL(8,3) NOT NULL, + FemaleLowerMiddleQuartile DECIMAL(8,3) NOT NULL, + MaleUpperMiddleQuartile DECIMAL(8,3) NOT NULL, + FemaleUpperMiddleQuartile DECIMAL(8,3) NOT NULL, + MaleTopQuartile DECIMAL(8,3) NOT NULL, + FemaleTopQuartile DECIMAL(8,3) NOT NULL, + FOREIGN KEY (company_number) REFERENCES employer(company_number) + ); + """) + + self.__connection.commit() + return self.__connection + + def append_sic_sections(self, section_id, description): + # print("Section ID: '%s', Description: '%s'" % (section_id, description)) + with self.__connection.cursor() as cursor: + cursor.execute(""" + INSERT INTO sic_sections VALUES (%s, %s) ON DUPLICATE KEY UPDATE sic_section_name = %s; + """, (section_id, description, description)) + self.__connection.commit() + + def append_sic(self, code, description, section_id): + print("Appended code %d (%s) under section %s" % (code, description, section_id)) + with self.__connection.cursor() as cursor: + cursor.execute(""" + INSERT INTO sic VALUES (%s, %s, %s) ON DUPLICATE KEY UPDATE sic_description = %s, sic_section = %s; + """, (code, description, section_id, description, section_id)) + self.__connection.commit() + + def append_employer(self, company_number, name, address, postcode, policy_link, responsible_person, size, current_name, \ + status, type_, incorporated, sics): + + # print("incorporated: %s" % str(incorporated)) + # print("sics", sics) + # print("name: %s" % name) + with self.__connection.cursor() as cursor: + cursor.execute(""" + INSERT INTO employer (company_number, name, address, postcode, policy_link, responsible_person, size, current_name, status, type_, incorporated) + VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s) + ON DUPLICATE KEY UPDATE + name = %s, address = %s, postcode = %s, policy_link = %s, responsible_person = %s, size = %s, + current_name = %s, status = %s, type_ = %s, incorporated = %s; + """, ( + company_number, name, address, postcode, policy_link, responsible_person, size, current_name, status, type_, incorporated, + name, address, postcode, policy_link, responsible_person, size, current_name, status, type_, incorporated + )) + # sql = """INSERT INTO employer (company_number, name, address, postcode, policy_link, responsible_person, size, current_name, status, type_, incorporated) + # VALUES ('%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s');""" % ( + # company_number, name, address, postcode, policy_link, responsible_person, size, current_name, status, type_, incorporated + # ) + # print(sql) + + self.append_employer_sics(company_number, sics) + self.__connection.commit() + + def append_pay_info(self, company_number, source, date_submitted, diff_mean_hourly_percent, diff_median_hourly_percent, \ + diff_mean_bonus_percent, diff_median_bonus_percent, male_bonus_percent, female_bonus_percent, male_lower_quartile, \ + female_lower_quartile, male_lower_middle_quartile, female_lower_middle_quartile, male_upper_middle_quartile, \ + female_upper_middle_quartile, male_top_quartile, female_top_quartile): + + try: + float(diff_mean_hourly_percent) + except ValueError: + diff_mean_hourly_percent = None + + + with self.__connection.cursor() as cursor: + cursor.execute("DELETE FROM pay WHERE company_number = %s AND source = %s;", (company_number, source)) + + try: + cursor.execute(""" + INSERT INTO pay (company_number, source, date_submitted, DiffMeanHourlyPercent, DiffMedianHourlyPercent, + DiffMeanBonusPercent, DiffMedianBonusPercent, MaleBonusPercent, FemaleBonusPercent, MaleLowerQuartile, + FemaleLowerQuartile, MaleLowerMiddleQuartile, FemaleLowerMiddleQuartile, MaleUpperMiddleQuartile, + FemaleUpperMiddleQuartile, MaleTopQuartile, FemaleTopQuartile) VALUES ( + %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s + ); + """, ( + company_number, source, date_submitted, diff_mean_hourly_percent, diff_median_hourly_percent, + diff_mean_bonus_percent, diff_median_bonus_percent, male_bonus_percent, female_bonus_percent, male_lower_quartile, + female_lower_quartile, male_lower_middle_quartile, female_lower_middle_quartile, male_upper_middle_quartile, + female_upper_middle_quartile, male_top_quartile, female_top_quartile + )) + except pymysql.err.DataError: + return + + self.__connection.commit() + + + def append_employer_sics(self, company_number, sics): + with self.__connection.cursor() as cursor: + cursor.execute("DELETE FROM employer_sic WHERE company_number = %s", (company_number, )) + + for sic in sics: + cursor.execute("SELECT * FROM sic WHERE sic_code = %s", (sic, )) + if cursor.fetchone() != None: + cursor.execute("INSERT INTO employer_sic VALUES (%s, %s);", (company_number, sic)) diff --git a/db.env.example b/db.env.example new file mode 100644 index 0000000..ea8bd2d --- /dev/null +++ b/db.env.example @@ -0,0 +1 @@ +MYSQL_ROOT_PASSWORD=************ diff --git a/insinuations.py b/insinuations.py new file mode 100644 index 0000000..a94607b --- /dev/null +++ b/insinuations.py @@ -0,0 +1,82 @@ +from lxml import html +import database +import datetime +import requests +import os + +def get_sics(db: database.PayGapDatabase, url = "https://resources.companieshouse.gov.uk/sic/"): + req = requests.get(url) + tree = html.fromstring(req.content.decode()) + bigtable = tree.xpath("/html/body/main/table/tbody")[0] + for tr_elem in bigtable.getchildren(): + td_code, td_description = tr_elem + + if td_code.getchildren() != []: + # if contains a <strong> element which indicates a section + current_section_code = td_code.getchildren()[0].text.replace("Section ", "").strip() + current_section_description = td_description.getchildren()[0].text.strip() + + db.append_sic_sections(current_section_code, current_section_description) + + else: + sic_code = int(td_code.text) + sic_desc = td_description.text.rstrip() + db.append_sic(sic_code, sic_desc, current_section_code) + +def get_companyinfo_url(company_number, url = "https://find-and-update.company-information.service.gov.uk/company/%s"): + if company_number.isdigit(): + company_number = "%08d" % int(company_number) + + return url % company_number + +def lookup_company(company_number): + company = {} + req = requests.get( + get_companyinfo_url(company_number), + headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_5_8) AppleWebKit/534.50.2 (KHTML, like Gecko) Version/5.0.6 Safari/533.22.3'} + ) + + if req.status_code not in [200, 404]: + raise ConnectionError("Couldn't connect- it %d'd. Was looking for company %s" % (req.status_code, company_number)) + + tree = html.fromstring(req.content.decode()) + + status_elem = tree.xpath('//*[@id="company-status"]') + if len(status_elem) == 1: + company["status"] = status_elem[0].text.strip() + else: + company["status"] = None + + incorp_elem = tree.xpath('//*[@id="company-creation-date"]') + if len(incorp_elem) == 1: + company["incorporated"] = datetime.datetime.strptime(incorp_elem[0].text.strip(), "%d %B %Y") + else: + company["incorporated"] = None + + type_elem = tree.xpath('//*[@id="company-type"]') + if len(type_elem) == 1: + company["type_"] = type_elem[0].text.strip() + else: + company["type_"] = None + + company["sics"] = set() + for i in range(9): + sic_elem = tree.xpath('//*[@id="sic%d"]' % i) + if len(sic_elem) == 1: + company["sics"].add(int(sic_elem[0].text.strip().split(" - ")[0])) + else: + break + + return company + +if __name__ == "__main__": + # if not os.path.exists(".docker"): + # import dotenv + # dotenv.load_dotenv(dotenv_path = "db.env") + # host = "srv.home" + # else: + # host = "db" + + # with database.PayGapDatabase(host = host) as db: + # get_sics(db) + print(lookup_company("02838054"))
\ No newline at end of file diff --git a/parser.py b/parser.py new file mode 100644 index 0000000..2918d73 --- /dev/null +++ b/parser.py @@ -0,0 +1,81 @@ +import insinuations +import database +import datetime +import time +import json +import csv +import sys +import os + +def parse_csv(db, csv_path): + insinuations.get_sics(db) + + with open(csv_path, "r") as f: + num_lines = len(f.readlines()) + i = 0 + + with open(csv_path, "r") as f: + reader = csv.reader(f) + headers = next(reader) + + for name, id_, address, postcode, company_id, sic_codes, diff_mean_hourly_percent, diff_median_hourly_percent, \ + diff_mean_bonus_percent, diff_median_bonus_percent, male_bonus_percent, female_bonus_percent, male_lower_quartile, \ + female_lower_quartile, male_lower_middle_quartile, female_lower_middle_quartile, male_upper_middle_quartile, \ + female_upper_middle_quartile, male_top_quartile, female_top_quartile, policy_link, responsible_person, size, \ + current_name, submitted_after_deadline, duedate, submitted_date in reader: + + if company_id.strip() != "": + try: + sic_codes = {int(i.strip()) for i in sic_codes.split(",")} + except ValueError: + sic_codes = set() + + while True: + try: + company = insinuations.lookup_company(company_id) + except ConnectionError as e: + print("Couldn't connect... Error: '%s'... Waiting 20 seconds..." % str(e)) + time.sleep(20) + else: + break + + company["sics"] = company["sics"].union(sic_codes) + company["company_number"] = company_id + company["name"] = name + company["address"] = address + company["postcode"] = postcode + company["policy_link"] = policy_link + company["responsible_person"] = responsible_person + company["size"] = size + company["current_name"] = current_name + + print("%.2f%%" % ((i / num_lines) * 100), company) + print( + company_id, os.path.basename(csv_path), datetime.datetime.strptime(submitted_date, "%Y/%m/%d %H:%M:%S"), + diff_mean_hourly_percent, diff_median_hourly_percent, diff_mean_bonus_percent, diff_median_bonus_percent, + male_bonus_percent, female_bonus_percent, male_lower_quartile, female_lower_quartile, + male_lower_middle_quartile, female_lower_middle_quartile, male_upper_middle_quartile, + female_upper_middle_quartile, male_top_quartile, female_top_quartile + ) + db.append_employer(**company) + db.append_pay_info( + company_id, os.path.basename(csv_path), datetime.datetime.strptime(submitted_date, "%Y/%m/%d %H:%M:%S"), + diff_mean_hourly_percent, diff_median_hourly_percent, diff_mean_bonus_percent, diff_median_bonus_percent, + male_bonus_percent, female_bonus_percent, male_lower_quartile, female_lower_quartile, + male_lower_middle_quartile, female_lower_middle_quartile, male_upper_middle_quartile, + female_upper_middle_quartile, male_top_quartile, female_top_quartile + ) + i += 1 + + # break + +if __name__ == "__main__": + if not os.path.exists(".docker"): + import dotenv + dotenv.load_dotenv(dotenv_path = "db.env") + host = "srv.home" + else: + host = "db" + + with database.PayGapDatabase(host = host) as db: + parse_csv(db, sys.argv[1])
\ No newline at end of file diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..2eff776 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,4 @@ +pymysql +lxml +python-dotenv +flask diff --git a/static/scripts.js b/static/scripts.js new file mode 100644 index 0000000..c6c3320 --- /dev/null +++ b/static/scripts.js @@ -0,0 +1 @@ +console.log("foo!")
\ No newline at end of file diff --git a/static/style.css b/static/style.css new file mode 100644 index 0000000..be2f299 --- /dev/null +++ b/static/style.css @@ -0,0 +1,42 @@ +body { + font-family: Helvetica, sans-serif; +} + +header { + font-size: large; + padding-left: 1%; + /* font-weight: bold; */ +} + +header p { + font-size: small; +} + +a { + color: black; + font-weight: bold; + padding-top: 1px; +} + +aside { + width: 15%; + padding-left: 15px; + margin-left: 15px; + float: right; + /* border-left-color: rgb(189, 189, 189); + border-left-width: 2px; + border-left-style: groove; */ +} + +#main_content { + padding-left: 2.5%; + padding-right: 2.5; +} + +footer { + padding-left: 10%; + padding-right: 10%; + padding-top: 50px; + font-size: xx-small; + justify-content: center; +}
\ No newline at end of file diff --git a/templates/index.html.j2 b/templates/index.html.j2 new file mode 100644 index 0000000..91f314e --- /dev/null +++ b/templates/index.html.j2 @@ -0,0 +1,4 @@ +{% extends "template.html.j2" %} +{% block content %} + <p>foo</p> +{% endblock %}
\ No newline at end of file diff --git a/templates/template.html.j2 b/templates/template.html.j2 new file mode 100644 index 0000000..1fb238d --- /dev/null +++ b/templates/template.html.j2 @@ -0,0 +1,44 @@ +<!DOCTYPE HTML> +<html> +<head> + <meta http-equiv="Content-Type" content="text/html; charset=utf-8"> + <title>UK Gender Pay Gap :: {{ title }}</title> + <!-- JQuery and JQurty UI current version --> + <script src='https://code.jquery.com/jquery-3.6.0.js'></script> + <script src="https://code.jquery.com/ui/1.13.1/jquery-ui.js"></script> + + <!-- Highcharts libraries --> + <script type="text/javascript" src="https://code.highcharts.com/highcharts.js"></script> + <script type="text/javascript" src="https://code.highcharts.com/highcharts-more.js"></script> + <script type="text/javascript" src="https://code.highcharts.com/modules/exporting.js"></script> + <script type="text/javascript" src="https://code.highcharts.com/modules/export-data.js"></script> + <script type="text/javascript" src="https://code.highcharts.com/modules/accessibility.js"></script> + + <script type="text/javascript" src="https://code.highcharts.com/maps/modules/map.js"></script> + <script type="text/javascript" src="https://code.highcharts.com/mapdata/custom/world-robinson.js"></script> + + <!-- local Javascript files --> + <script type="text/javascript" src="{{ url_for('static', filename='scripts.js') }}"></script> + <!-- remote and local CSS stylesheet --> + <link rel="stylesheet" href="{{ url_for('static', filename='style.css') }}"> +</head> + +<body> + <header> + <h1>{{ title }}</h1> + <p>Data provided by the <a href="https://gender-pay-gap.service.gov.uk/">UK Government</a></p> + </header> + + <aside> + <h4>Filter...</h4> + </aside> + + <div id="main_content"> + {% block content %} + {% endblock %} + </div> + + <footer> + <p>Source code released under GPLv3</p> + </footer> +</body>
\ No newline at end of file |