aboutsummaryrefslogtreecommitdiffstats
path: root/parser.py
diff options
context:
space:
mode:
authorjwansek <eddie.atten.ea29@gmail.com>2023-04-28 12:03:19 +0100
committerjwansek <eddie.atten.ea29@gmail.com>2023-04-28 12:03:19 +0100
commit43ba613b9e1ea9dbf6f94361d43418c91c8c0785 (patch)
tree4c327162e72e49ff549bfb77e77f047f99e005b9 /parser.py
parenta90ca1f2b50d343d0d2d0d3bda2ecf7712726419 (diff)
downloadUKGenderPayGap-43ba613b9e1ea9dbf6f94361d43418c91c8c0785.tar.gz
UKGenderPayGap-43ba613b9e1ea9dbf6f94361d43418c91c8c0785.zip
Finished insinuations, started on HTML and CSS
Diffstat (limited to 'parser.py')
-rw-r--r--parser.py81
1 files changed, 81 insertions, 0 deletions
diff --git a/parser.py b/parser.py
new file mode 100644
index 0000000..2918d73
--- /dev/null
+++ b/parser.py
@@ -0,0 +1,81 @@
+import insinuations
+import database
+import datetime
+import time
+import json
+import csv
+import sys
+import os
+
# Number of columns parse_csv() expects in every data row.
_EXPECTED_COLUMNS = 27

# strptime format of the submission-date column (the last column of a row).
_SUBMITTED_DATE_FORMAT = "%Y/%m/%d %H:%M:%S"

# Seconds to wait before retrying a company lookup after a ConnectionError.
_RETRY_DELAY_SECONDS = 20


def _parse_sic_codes(raw):
    """Return the set of integer SIC codes in the comma-separated string *raw*.

    Tokens are stripped of surrounding whitespace before conversion. Tokens
    that are empty or not valid integers are ignored individually, so a
    trailing comma or one bad token does not discard the valid codes.
    """
    codes = set()
    for token in raw.split(","):
        try:
            codes.add(int(token.strip()))
        except ValueError:
            continue
    return codes


def _lookup_company_with_retry(company_id):
    """Return ``insinuations.lookup_company(company_id)``, retrying on failure.

    A ``ConnectionError`` is reported on stdout and the lookup is retried
    after a fixed delay; there is no retry limit. Any other exception
    propagates to the caller.
    """
    while True:
        try:
            return insinuations.lookup_company(company_id)
        except ConnectionError as e:
            print("Couldn't connect... Error: '%s'... Waiting 20 seconds..." % str(e))
            time.sleep(_RETRY_DELAY_SECONDS)


def _count_data_rows(csv_path):
    """Return the number of CSV records in *csv_path*, excluding the header.

    Records are counted with ``csv.reader`` rather than by physical line so
    that quoted fields containing newlines are counted once.
    """
    with open(csv_path, "r", newline="") as f:
        records = sum(1 for _ in csv.reader(f))
    return max(records - 1, 0)


def parse_csv(db, csv_path):
    """Load one gender-pay-gap CSV file into the database.

    The first record of the file is treated as a header and skipped. For
    every following record that has a non-blank company id, the company is
    looked up through ``insinuations.lookup_company``, the lookup result is
    extended with fields taken from the row, and the result is stored with
    ``db.append_employer`` followed by ``db.append_pay_info``. Progress and
    the stored values are printed to stdout.

    Args:
        db: Object providing ``append_employer(**fields)`` and
            ``append_pay_info(*values)``; it is also passed to
            ``insinuations.get_sics``.
        csv_path: Path of the CSV file. Its base name is stored with each
            pay-info record.

    Raises:
        ValueError: If a non-empty record does not have exactly 27 columns,
            or if a submission date does not match ``%Y/%m/%d %H:%M:%S``.
    """
    # NOTE(review): presumably prepares SIC reference data in the database;
    # confirm against the insinuations module.
    insinuations.get_sics(db)

    num_rows = _count_data_rows(csv_path)
    source_name = os.path.basename(csv_path)

    # newline="" lets the csv module handle line endings itself, which is
    # required for quoted fields that contain embedded newlines.
    with open(csv_path, "r", newline="") as f:
        reader = csv.reader(f)

        # Skip the header record; an empty file has nothing to import.
        if next(reader, None) is None:
            return

        for i, row in enumerate(reader):
            # csv.reader yields an empty list for a blank line.
            if not row:
                continue

            if len(row) != _EXPECTED_COLUMNS:
                raise ValueError(
                    "%s, line %d: expected %d columns, got %d"
                    % (csv_path, reader.line_num, _EXPECTED_COLUMNS, len(row))
                )

            (
                name, _id, address, postcode, company_id, sic_codes,
                diff_mean_hourly_percent, diff_median_hourly_percent,
                diff_mean_bonus_percent, diff_median_bonus_percent,
                male_bonus_percent, female_bonus_percent,
                male_lower_quartile, female_lower_quartile,
                male_lower_middle_quartile, female_lower_middle_quartile,
                male_upper_middle_quartile, female_upper_middle_quartile,
                male_top_quartile, female_top_quartile,
                policy_link, responsible_person, size, current_name,
                _submitted_after_deadline, _duedate, submitted_date,
            ) = row

            # Rows without a company id cannot be looked up; skip them.
            if not company_id.strip():
                continue

            company = _lookup_company_with_retry(company_id)

            # Merge the codes listed in the CSV with those from the lookup.
            company["sics"] = company["sics"].union(_parse_sic_codes(sic_codes))
            company["company_number"] = company_id
            company["name"] = name
            company["address"] = address
            company["postcode"] = postcode
            company["policy_link"] = policy_link
            company["responsible_person"] = responsible_person
            company["size"] = size
            company["current_name"] = current_name

            # Built once so the values printed are exactly the values stored.
            pay_info = (
                company_id, source_name,
                datetime.datetime.strptime(submitted_date, _SUBMITTED_DATE_FORMAT),
                diff_mean_hourly_percent, diff_median_hourly_percent,
                diff_mean_bonus_percent, diff_median_bonus_percent,
                male_bonus_percent, female_bonus_percent,
                male_lower_quartile, female_lower_quartile,
                male_lower_middle_quartile, female_lower_middle_quartile,
                male_upper_middle_quartile, female_upper_middle_quartile,
                male_top_quartile, female_top_quartile,
            )

            print("%.2f%%" % ((i / num_rows) * 100), company)
            print(*pay_info)
            db.append_employer(**company)
            db.append_pay_info(*pay_info)
+
if __name__ == "__main__":
    # Exit with a usage message rather than an IndexError traceback when the
    # CSV path argument is missing.
    if len(sys.argv) < 2:
        sys.exit("usage: %s <path-to-csv>" % os.path.basename(sys.argv[0]))

    # When a ".docker" entry exists in the working directory the database
    # host is "db"; otherwise settings are loaded from db.env and the host is
    # "srv.home".
    # NOTE(review): ".docker" is presumably a marker present only inside the
    # container image — confirm against the Dockerfile.
    if os.path.exists(".docker"):
        host = "db"
    else:
        # Imported here so python-dotenv is only needed outside Docker.
        import dotenv
        dotenv.load_dotenv(dotenv_path="db.env")
        host = "srv.home"

    with database.PayGapDatabase(host=host) as db:
        parse_csv(db, sys.argv[1])