aboutsummaryrefslogtreecommitdiffstats
path: root/src/parser.py
diff options
context:
space:
mode:
author jwansek <eddie.atten.ea29@gmail.com> 2023-05-16 14:11:51 +0100
committer jwansek <eddie.atten.ea29@gmail.com> 2023-05-16 14:11:51 +0100
commit 953dc5ef8652a0b7e8ae2c7db3535f1bd157cdd4 (patch)
tree 08a4c9cd4f10a12927f7c57056617672369ed9b6 /src/parser.py
parent 0faf95d56815d310290d3533d81d888deb7731f0 (diff)
download UKGenderPayGap-953dc5ef8652a0b7e8ae2c7db3535f1bd157cdd4.tar.gz
UKGenderPayGap-953dc5ef8652a0b7e8ae2c7db3535f1bd157cdd4.zip
Added alt text, docker
Diffstat (limited to 'src/parser.py')
-rw-r--r-- src/parser.py 85
1 files changed, 85 insertions, 0 deletions
diff --git a/src/parser.py b/src/parser.py
new file mode 100644
index 0000000..f9e3e81
--- /dev/null
+++ b/src/parser.py
@@ -0,0 +1,85 @@
+import insinuations
+import database
+import datetime
+import time
+import json
+import csv
+import sys
+import os
+
def parse_csv(db, csv_path):
    """Import every employer row of one pay gap CSV file into the database.

    The file must have a header row followed by rows of exactly 27 columns
    in the order unpacked below. Rows whose company number column is blank
    are ignored. For every other row the company is looked up through
    ``insinuations.lookup_company``, the looked-up record is overlaid with
    the values from the CSV, and the result is written with
    ``db.append_employer`` followed by ``db.append_pay_info``.

    Progress and the values being written are printed to stdout.

    Args:
        db: Database object providing ``append_employer(**fields)`` and
            ``append_pay_info(...)``; it is also handed to
            ``insinuations.get_sics``.
        csv_path: Path of the CSV file to read. Its base name is stored
            with every pay info record as the source of the data.

    Raises:
        ValueError: If a non-blank row does not have exactly 27 columns, or
            if a submission date does not match ``%Y/%m/%d %H:%M:%S``.
    """
    # NOTE(review): presumably this loads/refreshes the SIC code table before
    # any employer is written -- confirm against insinuations.get_sics.
    insinuations.get_sics(db)

    # First pass: count physical lines (header included) purely so that a
    # progress percentage can be printed. Counting lazily avoids holding the
    # whole file in memory. The figure is approximate when quoted fields
    # contain embedded newlines.
    with open(csv_path, "r", newline="") as f:
        num_lines = sum(1 for _ in f)

    # newline="" is what the csv module requires so that newlines embedded
    # in quoted fields are parsed correctly.
    with open(csv_path, "r", newline="") as f:
        reader = csv.reader(f)
        # Skip the header row; an empty file simply yields nothing to import.
        next(reader, None)

        # NOTE(review): the progress counter is advanced once per data row;
        # the original indentation was lost, so confirm it was not meant to
        # count only rows that have a company number.
        for rows_done, row in enumerate(reader):
            if not row:
                # csv.reader yields [] for a blank line; there is nothing to
                # unpack, so skip it instead of failing.
                continue

            (
                name, id_, address, postcode, company_id, sic_codes,
                diff_mean_hourly_percent, diff_median_hourly_percent,
                diff_mean_bonus_percent, diff_median_bonus_percent,
                male_bonus_percent, female_bonus_percent,
                male_lower_quartile, female_lower_quartile,
                male_lower_middle_quartile, female_lower_middle_quartile,
                male_upper_middle_quartile, female_upper_middle_quartile,
                male_top_quartile, female_top_quartile,
                policy_link, responsible_person, size, current_name,
                submitted_after_deadline, duedate, submitted_date,
            ) = row

            if company_id.strip() == "":
                continue

            # All-or-nothing: a single non-numeric token discards every SIC
            # code supplied by the CSV for this row.
            try:
                sic_codes = {int(code.strip()) for code in sic_codes.split(",")}
            except ValueError:
                sic_codes = set()

            # Retry indefinitely on connection problems, pausing between
            # attempts.
            while True:
                try:
                    company = insinuations.lookup_company(company_id)
                except ConnectionError as e:
                    print("Couldn't connect... Error: '%s'... Waiting 20 seconds..." % str(e))
                    time.sleep(20)
                else:
                    break

            # Merge the SIC codes from the lookup with those in the CSV, then
            # let the CSV values override the remaining fields.
            company["sics"] = company["sics"].union(sic_codes)
            company["company_number"] = company_id
            company["name"] = name
            company["address"] = address
            company["postcode"] = postcode
            company["policy_link"] = policy_link
            company["responsible_person"] = responsible_person
            company["size"] = size
            company["current_name"] = current_name

            print("%.2f%%" % ((rows_done / num_lines) * 100), company)

            # Built once so the debug print and the database write cannot
            # drift apart, and the date is parsed a single time.
            pay_info = (
                company_id,
                os.path.basename(csv_path),
                datetime.datetime.strptime(submitted_date, "%Y/%m/%d %H:%M:%S"),
                diff_mean_hourly_percent, diff_median_hourly_percent,
                diff_mean_bonus_percent, diff_median_bonus_percent,
                male_bonus_percent, female_bonus_percent,
                male_lower_quartile, female_lower_quartile,
                male_lower_middle_quartile, female_lower_middle_quartile,
                male_upper_middle_quartile, female_upper_middle_quartile,
                male_top_quartile, female_top_quartile,
            )
            print(*pay_info)

            db.append_employer(**company)
            db.append_pay_info(*pay_info)
+
if __name__ == "__main__":
    # Validate the command line before touching the database, so a missing
    # argument produces a usage message rather than an IndexError raised
    # while a connection is already open.
    if len(sys.argv) < 2:
        sys.exit("Usage: %s <path-to-csv>" % sys.argv[0])
    p = sys.argv[1]

    # Without a ".docker" marker in the working directory, settings are
    # loaded from db.env and the "srv.home" host is used; with it, the "db"
    # host is used (presumably the container's database service -- confirm).
    if not os.path.exists(".docker"):
        import dotenv
        dotenv.load_dotenv(dotenv_path = "db.env")
        host = "srv.home"
    else:
        host = "db"

    with database.PayGapDatabase(host = host) as db:
        # The postcode lookup file is loaded by a dedicated database method;
        # every other file is treated as a pay gap CSV.
        if os.path.basename(p) == "National_Statistics_Postcode_Lookup_UK.csv":
            db.append_counties(p)
        else:
            parse_csv(db, p)