aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorjwansek <eddie.atten.ea29@gmail.com>2023-05-16 00:05:27 +0100
committerjwansek <eddie.atten.ea29@gmail.com>2023-05-16 00:05:27 +0100
commit0faf95d56815d310290d3533d81d888deb7731f0 (patch)
treeef9fd281f8b85aab569944eb712235aa321ba768
parent469de094098c5cf149b9aeab9c2a7d23aa22c11d (diff)
downloadUKGenderPayGap-0faf95d56815d310290d3533d81d888deb7731f0.tar.gz
UKGenderPayGap-0faf95d56815d310290d3533d81d888deb7731f0.zip
Added heatmap
-rw-r--r--README.md6
-rw-r--r--app.py25
-rw-r--r--charts.json11
-rw-r--r--database.py86
-rw-r--r--parser.py6
-rw-r--r--requirements.txt1
-rw-r--r--static/scripts.js74
7 files changed, 201 insertions, 8 deletions
diff --git a/README.md b/README.md
index ec07041..ecd44d4 100644
--- a/README.md
+++ b/README.md
@@ -9,4 +9,8 @@ Visualisations of the UK gender pay gap data. Publically avaliable, not often se
python3 parser.py ../RawData/UK\ Gender\ Pay\ Gap\ Data\ -\ 2019\ to\ 2020.csv
```
- - [ONS data](https://www.ons.gov.uk/aboutus/transparencyandgovernance/freedomofinformationfoi/ukpostcodestownsandcounties) is required to turn the address postcode to a county or unitary authority for the map. This data is rather complicated, thankfully the [Borough of Camden](https://www.data.gov.uk/dataset/7ec10db7-c8f4-4a40-8d82-8921935b4865/national-statistics-postcode-lookup-uk) has processed this data nicely for us.
+ - [ONS data](https://www.ons.gov.uk/aboutus/transparencyandgovernance/freedomofinformationfoi/ukpostcodestownsandcounties) is required to turn the address postcode to a county or unitary authority for the map. This data is rather complicated, thankfully the [Borough of Camden](https://www.data.gov.uk/dataset/7ec10db7-c8f4-4a40-8d82-8921935b4865/national-statistics-postcode-lookup-uk) has processed this data nicely for us. To append the county or local authority data, give `parser.py` a file downloaded from the above with the exact name:
+
+```
+python3 parser.py ../RawData/National_Statistics_Postcode_Lookup_UK.csv
+```
diff --git a/app.py b/app.py
index 4f354f7..d5d6ef1 100644
--- a/app.py
+++ b/app.py
@@ -86,6 +86,20 @@ def api_get_sic_section_pay():
return flask.jsonify(db.get_pay_by_sic_section(pay_type, year))
+@app.route("/api/heatmap")
+def api_get_heatmap_data():
+ # pay_type = flask.request.args.get("Pay Type")
+ year = flask.request.args.get("year")
+ # print("year: '%s'" % year)
+ # if pay_type is None or pay_type.lower() not in {'hourly', 'bonuses'}:
+ # return flask.abort(400, "The key `pay type` must be equal to 'hourly' or 'bonuses'")
+ with database.PayGapDatabase(host = host) as db:
+ if year is not None:
+ if year not in db.get_years():
+ return flask.abort(400, "Unrecognised year '%s'. The year option must be in %s" % (year, ", ".join(db.get_years())))
+
+ return flask.jsonify(db.get_heatmap_data("hourly", year))
+
@app.route("/api/getyears")
def api_get_year_options():
with database.PayGapDatabase(host = host) as db:
@@ -107,17 +121,25 @@ def search():
def get_chart_elem(url):
for i in get_charts()["index"]:
- print(i["url"], url)
+ # print(i["url"], url)
# if i["url"] == url:
# return i
if url.startswith(i["url"]):
return i
+def get_chart_elem_strict(url):
+ for i in get_charts()["index"]:
+ print(urllib.parse.urlsplit(i["url"]).path, urllib.parse.urlsplit(url).path)
+ if urllib.parse.urlsplit(i["url"]).path == urllib.parse.urlsplit(url).path:
+ return i
+
@app.route("/plot/<name>")
def serve_large_plot(name):
with database.PayGapDatabase(host = host) as db:
# print(flask.request.full_path)
elem = get_chart_elem(flask.request.full_path)
+ # if elem is None:
+ # elem = get_chart_elem_strict(flask.request.full_path)
filters = elem["filters"]
for k, v in filters.items():
if v == "<SICType>":
@@ -132,6 +154,7 @@ def serve_large_plot(name):
elem["url"] = flask.request.full_path
# print("elem", elem)
current_filters = dict(flask.request.args)
+ print("filters", filters)
print("current_filters", current_filters)
return flask.render_template(
"plot.html.j2",
diff --git a/charts.json b/charts.json
index 91f2ab7..4b266aa 100644
--- a/charts.json
+++ b/charts.json
@@ -1,6 +1,13 @@
{
"index": [
{
+ "title": "Median Hourly Pay by County or Local Authority: Choropleth Map",
+ "url": "/plot/heatmap",
+ "filters": {
+ "Year": "<Years>"
+ }
+ },
+ {
"title": "Median Hourly Pay Difference by Year",
"url": "/plot/years?Pay+Type=Hourly",
"filters": {
@@ -33,7 +40,7 @@
}
},
{
- "title": "Median Hourly pay Difference by SIC Section",
+ "title": "Median Hourly Pay Difference by SIC Section",
"url": "/plot/sic_sec?Pay+Type=Hourly",
"filters": {
"Pay Type": {
@@ -47,7 +54,7 @@
}
},
{
- "title": "Median Bonus pay Difference by SIC Section",
+ "title": "Median Bonus Pay Difference by SIC Section",
"url": "/plot/sic_sec?Pay+Type=Bonuses",
"filters": {
"Pay Type": {
diff --git a/database.py b/database.py
index 80edd3c..d67b96c 100644
--- a/database.py
+++ b/database.py
@@ -2,11 +2,14 @@ from dataclasses import dataclass
import operator
import datetime
import pymysql
+import pandas
+import app
import os
@dataclass
class PayGapDatabase:
+ postcode_lookup_obj = None
host: str = "db"
user: str = "root"
passwd: str = None
@@ -336,6 +339,83 @@ class PayGapDatabase:
return sorted(pay, key = operator.itemgetter(1), reverse = True)
+ def get_heatmap_data(self, pay_type, year = None):
+ sql = "SELECT insinuated_loc, COUNT(insinuated_loc), -AVG("
+ if pay_type.lower() == "hourly":
+ sql += "DiffMedianHourlyPercent"
+ elif pay_type.lower() == "bonuses":
+ sql += "DiffMedianBonusPercent"
+ sql += """
+ ) FROM employer INNER JOIN pay ON pay.company_number = employer.company_number
+ WHERE insinuated_loc_type IS NOT NULL
+ """
+ if year is not None:
+ sql += " AND source LIKE %s"
+
+ sql += " GROUP BY insinuated_loc;"
+
+ with self.__connection.cursor() as cursor:
+ if year is None:
+ cursor.execute(sql)
+ else:
+ cursor.execute(sql, ("%" + year.replace("-", "to") + "%", ))
+
+ return [[i[0], i[1], float(i[2])] for i in cursor.fetchall()]
+
+ def _get_postcode_lookup_obj(self, path_):
+ return pandas.read_csv(path_)
+
+ def _get_counties(self):
+ return {feature["properties"]["name"] for feature in app.UK_GEOJSON["features"]}
+
+ def append_counties(self, path_):
+ if self.postcode_lookup_obj is None:
+ self.postcode_lookup_obj = self._get_postcode_lookup_obj(path_)
+
+ counties = self._get_counties()
+ postcodes = self._get_postcodes()
+
+ with self.__connection.cursor() as cursor:
+
+ cursor.execute("ALTER TABLE employer ADD COLUMN IF NOT EXISTS insinuated_loc VARCHAR(69) DEFAULT NULL;")
+ cursor.execute("ALTER TABLE employer ADD COLUMN IF NOT EXISTS insinuated_loc_type VARCHAR(25) DEFAULT NULL;")
+
+ for i, j in enumerate(postcodes, 1):
+ id_, postcode = j
+ found_locations = self.postcode_lookup_obj[
+ (self.postcode_lookup_obj["Postcode 1"] == postcode) |
+ (self.postcode_lookup_obj["Postcode 2"] == postcode) |
+ (self.postcode_lookup_obj["Postcode 3"] == postcode)
+ ]
+ if len(found_locations) == 1:
+ county, la = found_locations[["County Name", "Local Authority Name"]].values[0]
+ if la in counties:
+ cursor.execute("UPDATE employer SET insinuated_loc = %s, insinuated_loc_type = 'Local Authority' WHERE company_number = %s", (la, id_))
+
+ print("[%d/%d] Using local authority '%s' for postcode '%s'" % (i, len(postcodes), la, postcode))
+ elif county in counties:
+ cursor.execute("UPDATE employer SET insinuated_loc = %s, insinuated_loc_type = 'County' WHERE company_number = %s", (county, id_))
+
+ print("[%d/%d] Using county '%s' for postcode '%s'" % (i, len(postcodes), county, postcode))
+ elif "Northamptonshire" in la:
+ print("Manually fixing Northamptonshire...")
+ cursor.execute("UPDATE employer SET insinuated_loc = %s, insinuated_loc_type = 'County' WHERE company_number = %s", ("Northamptonshire", id_))
+ elif "Bournemouth" in la:
+ print("Manually fixing Bournemouth...")
+ cursor.execute("UPDATE employer SET insinuated_loc = %s, insinuated_loc_type = 'County' WHERE company_number = %s", ("Bournemouth", id_))
+ else:
+ print("[%d/%d] Didn't recoginse the local authority '%s' or the county '%s'" % (i, len(postcodes), la, county))
+ else:
+ print("[%d/%d] Couldn't find a county for postcode '%s' (company id '%s')" % (i, len(postcodes), postcode, id_))
+
+ # break
+ self.__connection.commit()
+
+ def _get_postcodes(self):
+ with self.__connection.cursor() as cursor:
+ cursor.execute("SELECT company_number, TRIM(SUBSTRING_INDEX(address, ',', -1)) FROM employer;")
+ return cursor.fetchall()
+
if __name__ == "__main__":
@@ -347,6 +427,8 @@ if __name__ == "__main__":
host = "db"
with PayGapDatabase(host = host) as db:
- print(db.get_years())
- print(db.get_pay_by_sic_section("bonuses", None))
+ # print(db.get_years())
+ # print(db.get_pay_by_sic_section("bonuses", None))
+ print(db.get_heatmap_data("hourly", db.get_years()[0]))
+ # print(db.append_counties())
diff --git a/parser.py b/parser.py
index 2918d73..f9e3e81 100644
--- a/parser.py
+++ b/parser.py
@@ -78,4 +78,8 @@ if __name__ == "__main__":
host = "db"
with database.PayGapDatabase(host = host) as db:
- parse_csv(db, sys.argv[1]) \ No newline at end of file
+ p = sys.argv[1]
+ if os.path.basename(p) == "National_Statistics_Postcode_Lookup_UK.csv":
+ db.append_counties(p)
+ else:
+ parse_csv(db, p) \ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
index 5e5339e..0dad989 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -3,3 +3,4 @@ lxml
python-dotenv
flask
requests
+pandas
diff --git a/static/scripts.js b/static/scripts.js
index 2b1e47e..9b52215 100644
--- a/static/scripts.js
+++ b/static/scripts.js
@@ -11,6 +11,7 @@ function collapseTogglePress(elem, a_elem, num_hidden) {
const PLOT_FUNC_MAPPINGS = {
"years": draw_plot_years,
"sic_sec": draw_plot_sic_sections,
+ "heatmap": draw_heatmap
}
$(document).ready(function() {
@@ -83,7 +84,7 @@ function form_api_url(containerName, filters) {
// }
// }
// }
- // console.log("fetching ", url.toString());
+ console.log("fetching ", url.toString());
return url.toString();
}
@@ -244,4 +245,75 @@ function draw_plot_sic_sections(containerName, filters) {
})
})
})
+}
+
+function draw_heatmap(containerName, filters) {
+ fetch(form_api_url(containerName, filters)).then(resp => {
+
+ const isPreview = (containerName.substring(1, 6) === "chart");
+
+ resp.json().then(data => {
+
+ var data2 = [];
+ data.forEach(row => {
+ data2.push([row[0], row[2]]);
+ });
+ console.log(data2);
+
+ $.getJSON("/static/ukcounties.json", function(geojson) {
+
+ console.log(geojson);
+
+ Highcharts.mapChart(containerName, {
+ chart: {
+ map: geojson
+ },
+
+ title: {
+ text: null
+ },
+
+ accessibility: {
+ typeDescription: 'Map of the United Kingdom.'
+ },
+
+ mapNavigation: {
+ enabled: isPreview,
+ buttonOptions: {
+ verticalAlign: 'bottom'
+ }
+ },
+
+ legend: {
+ enabled: isPreview,
+ layout: 'vertical',
+ align: 'right',
+ verticalAlign: 'middle',
+ itemMarginTop: 10,
+ itemMarginBottom: 10
+ },
+
+ colorAxis: {
+ stops: [
+ [0, '#c4463a'],
+ [0.5, '#e6ffee'],
+ [0.9, '#009933']
+ ],
+ min: -15
+ },
+
+ series: [{
+ data: data2,
+ keys: ['name', 'value'],
+ joinBy: 'name',
+ name: 'Pay Gap',
+ color: 'Green',
+ tooltip: {
+ valueSuffix: '%'
+ }
+ }]
+ });
+ });
+ });
+ })
} \ No newline at end of file