about summary refs log tree commit diff stats
path: root/parser.py
diff options
context:
space:
mode:
author jwansek <eddie.atten.ea29@gmail.com> 2023-05-16 14:11:51 +0100
committer jwansek <eddie.atten.ea29@gmail.com> 2023-05-16 14:11:51 +0100
commit 953dc5ef8652a0b7e8ae2c7db3535f1bd157cdd4 (patch)
tree 08a4c9cd4f10a12927f7c57056617672369ed9b6 /parser.py
parent 0faf95d56815d310290d3533d81d888deb7731f0 (diff)
download UKGenderPayGap-953dc5ef8652a0b7e8ae2c7db3535f1bd157cdd4.tar.gz
UKGenderPayGap-953dc5ef8652a0b7e8ae2c7db3535f1bd157cdd4.zip
Added alt text, docker
Diffstat (limited to 'parser.py')
-rw-r--r-- parser.py 85
1 files changed, 0 insertions, 85 deletions
diff --git a/parser.py b/parser.py
deleted file mode 100644
index f9e3e81..0000000
--- a/parser.py
+++ /dev/null
@@ -1,85 +0,0 @@
-import insinuations
-import database
-import datetime
-import time
-import json
-import csv
-import sys
-import os
-
-def parse_csv(db, csv_path):
- insinuations.get_sics(db)
-
- with open(csv_path, "r") as f:
- num_lines = len(f.readlines())
- i = 0
-
- with open(csv_path, "r") as f:
- reader = csv.reader(f)
- headers = next(reader)
-
- for name, id_, address, postcode, company_id, sic_codes, diff_mean_hourly_percent, diff_median_hourly_percent, \
- diff_mean_bonus_percent, diff_median_bonus_percent, male_bonus_percent, female_bonus_percent, male_lower_quartile, \
- female_lower_quartile, male_lower_middle_quartile, female_lower_middle_quartile, male_upper_middle_quartile, \
- female_upper_middle_quartile, male_top_quartile, female_top_quartile, policy_link, responsible_person, size, \
- current_name, submitted_after_deadline, duedate, submitted_date in reader:
-
- if company_id.strip() != "":
- try:
- sic_codes = {int(i.strip()) for i in sic_codes.split(",")}
- except ValueError:
- sic_codes = set()
-
- while True:
- try:
- company = insinuations.lookup_company(company_id)
- except ConnectionError as e:
- print("Couldn't connect... Error: '%s'... Waiting 20 seconds..." % str(e))
- time.sleep(20)
- else:
- break
-
- company["sics"] = company["sics"].union(sic_codes)
- company["company_number"] = company_id
- company["name"] = name
- company["address"] = address
- company["postcode"] = postcode
- company["policy_link"] = policy_link
- company["responsible_person"] = responsible_person
- company["size"] = size
- company["current_name"] = current_name
-
- print("%.2f%%" % ((i / num_lines) * 100), company)
- print(
- company_id, os.path.basename(csv_path), datetime.datetime.strptime(submitted_date, "%Y/%m/%d %H:%M:%S"),
- diff_mean_hourly_percent, diff_median_hourly_percent, diff_mean_bonus_percent, diff_median_bonus_percent,
- male_bonus_percent, female_bonus_percent, male_lower_quartile, female_lower_quartile,
- male_lower_middle_quartile, female_lower_middle_quartile, male_upper_middle_quartile,
- female_upper_middle_quartile, male_top_quartile, female_top_quartile
- )
- db.append_employer(**company)
- db.append_pay_info(
- company_id, os.path.basename(csv_path), datetime.datetime.strptime(submitted_date, "%Y/%m/%d %H:%M:%S"),
- diff_mean_hourly_percent, diff_median_hourly_percent, diff_mean_bonus_percent, diff_median_bonus_percent,
- male_bonus_percent, female_bonus_percent, male_lower_quartile, female_lower_quartile,
- male_lower_middle_quartile, female_lower_middle_quartile, male_upper_middle_quartile,
- female_upper_middle_quartile, male_top_quartile, female_top_quartile
- )
- i += 1
-
- # break
-
-if __name__ == "__main__":
- if not os.path.exists(".docker"):
- import dotenv
- dotenv.load_dotenv(dotenv_path = "db.env")
- host = "srv.home"
- else:
- host = "db"
-
- with database.PayGapDatabase(host = host) as db:
- p = sys.argv[1]
- if os.path.basename(p) == "National_Statistics_Postcode_Lookup_UK.csv":
- db.append_counties(p)
- else:
- parse_csv(db, p) \ No newline at end of file