From 0faf95d56815d310290d3533d81d888deb7731f0 Mon Sep 17 00:00:00 2001 From: jwansek Date: Tue, 16 May 2023 00:05:27 +0100 Subject: Added heatmap --- README.md | 6 +++- app.py | 25 +++++++++++++++- charts.json | 11 +++++-- database.py | 86 +++++++++++++++++++++++++++++++++++++++++++++++++++++-- parser.py | 6 +++- requirements.txt | 1 + static/scripts.js | 74 ++++++++++++++++++++++++++++++++++++++++++++++- 7 files changed, 201 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index ec07041..ecd44d4 100644 --- a/README.md +++ b/README.md @@ -9,4 +9,8 @@ Visualisations of the UK gender pay gap data. Publically avaliable, not often se python3 parser.py ../RawData/UK\ Gender\ Pay\ Gap\ Data\ -\ 2019\ to\ 2020.csv ``` - - [ONS data](https://www.ons.gov.uk/aboutus/transparencyandgovernance/freedomofinformationfoi/ukpostcodestownsandcounties) is required to turn the address postcode to a county or unitary authority for the map. This data is rather complicated, thankfully the [Borough of Camden](https://www.data.gov.uk/dataset/7ec10db7-c8f4-4a40-8d82-8921935b4865/national-statistics-postcode-lookup-uk) has processed this data nicely for us. + - [ONS data](https://www.ons.gov.uk/aboutus/transparencyandgovernance/freedomofinformationfoi/ukpostcodestownsandcounties) is required to turn the address postcode to a county or unitary authority for the map. This data is rather complicated, thankfully the [Borough of Camden](https://www.data.gov.uk/dataset/7ec10db7-c8f4-4a40-8d82-8921935b4865/national-statistics-postcode-lookup-uk) has processed this data nicely for us. To append the county or local authority data, give `parser.py` a file downloaded from the above with the exact name: + +``` +python3 parser.py ../RawData/National_Statistics_Postcode_Lookup_UK.csv +``` diff --git a/app.py b/app.py index 4f354f7..d5d6ef1 100644 --- a/app.py +++ b/app.py @@ -86,6 +86,20 @@ def api_get_sic_section_pay(): return flask.jsonify(db.get_pay_by_sic_section(pay_type, year)) +@app.route("/api/heatmap") +def api_get_heatmap_data(): + # pay_type = flask.request.args.get("Pay Type") + year = flask.request.args.get("year") + # print("year: '%s'" % year) + # if pay_type is None or pay_type.lower() not in {'hourly', 'bonuses'}: + # return flask.abort(400, "The key `pay type` must be equal to 'hourly' or 'bonuses'") + with database.PayGapDatabase(host = host) as db: + if year is not None: + if year not in db.get_years(): + return flask.abort(400, "Unrecognised year '%s'. The year option must be in %s" % (year, ", ".join(db.get_years()))) + + return flask.jsonify(db.get_heatmap_data("hourly", year)) + @app.route("/api/getyears") def api_get_year_options(): with database.PayGapDatabase(host = host) as db: @@ -107,17 +121,25 @@ def search(): def get_chart_elem(url): for i in get_charts()["index"]: - print(i["url"], url) + # print(i["url"], url) # if i["url"] == url: # return i if url.startswith(i["url"]): return i +def get_chart_elem_strict(url): + for i in get_charts()["index"]: + print(urllib.parse.urlsplit(i["url"]).path, urllib.parse.urlsplit(url).path) + if urllib.parse.urlsplit(i["url"]).path == urllib.parse.urlsplit(url).path: + return i + @app.route("/plot/") def serve_large_plot(name): with database.PayGapDatabase(host = host) as db: # print(flask.request.full_path) elem = get_chart_elem(flask.request.full_path) + # if elem is None: + # elem = get_chart_elem_strict(flask.request.full_path) filters = elem["filters"] for k, v in filters.items(): if v == "": @@ -132,6 +154,7 @@ def serve_large_plot(name): elem["url"] = flask.request.full_path # print("elem", elem) current_filters = dict(flask.request.args) + print("filters", filters) print("current_filters", current_filters) return flask.render_template( "plot.html.j2", diff --git a/charts.json b/charts.json index 91f2ab7..4b266aa 100644 --- a/charts.json +++ b/charts.json @@ -1,5 +1,12 @@ { "index": [ + { + "title": "Median Hourly Pay by County or Local Authority: Choropleth Map", + "url": "/plot/heatmap", + "filters": { + "Year": "" + } + }, { "title": "Median Hourly Pay Difference by Year", "url": "/plot/years?Pay+Type=Hourly", @@ -33,7 +40,7 @@ } }, { - "title": "Median Hourly pay Difference by SIC Section", + "title": "Median Hourly Pay Difference by SIC Section", "url": "/plot/sic_sec?Pay+Type=Hourly", "filters": { "Pay Type": { @@ -47,7 +54,7 @@ } }, { - "title": "Median Bonus pay Difference by SIC Section", + "title": "Median Bonus Pay Difference by SIC Section", "url": "/plot/sic_sec?Pay+Type=Bonuses", "filters": { "Pay Type": { diff --git a/database.py b/database.py index 80edd3c..d67b96c 100644 --- a/database.py +++ b/database.py @@ -2,11 +2,14 @@ from dataclasses import dataclass import operator import datetime import pymysql +import pandas +import app import os @dataclass class PayGapDatabase: + postcode_lookup_obj = None host: str = "db" user: str = "root" passwd: str = None @@ -336,6 +339,83 @@ class PayGapDatabase: return sorted(pay, key = operator.itemgetter(1), reverse = True) + def get_heatmap_data(self, pay_type, year = None): + sql = "SELECT insinuated_loc, COUNT(insinuated_loc), -AVG(" + if pay_type.lower() == "hourly": + sql += "DiffMedianHourlyPercent" + elif pay_type.lower() == "bonuses": + sql += "DiffMedianBonusPercent" + sql += """ + ) FROM employer INNER JOIN pay ON pay.company_number = employer.company_number + WHERE insinuated_loc_type IS NOT NULL + """ + if year is not None: + sql += " AND source LIKE %s" + + sql += " GROUP BY insinuated_loc;" + + with self.__connection.cursor() as cursor: + if year is None: + cursor.execute(sql) + else: + cursor.execute(sql, ("%" + year.replace("-", "to") + "%", )) + + return [[i[0], i[1], float(i[2])] for i in cursor.fetchall()] + + def _get_postcode_lookup_obj(self, path_): + return pandas.read_csv(path_) + + def _get_counties(self): + return {feature["properties"]["name"] for feature in app.UK_GEOJSON["features"]} + + def append_counties(self, path_): + if self.postcode_lookup_obj is None: + self.postcode_lookup_obj = self._get_postcode_lookup_obj(path_) + + counties = self._get_counties() + postcodes = self._get_postcodes() + + with self.__connection.cursor() as cursor: + + cursor.execute("ALTER TABLE employer ADD COLUMN IF NOT EXISTS insinuated_loc VARCHAR(69) DEFAULT NULL;") + cursor.execute("ALTER TABLE employer ADD COLUMN IF NOT EXISTS insinuated_loc_type VARCHAR(25) DEFAULT NULL;") + + for i, j in enumerate(postcodes, 1): + id_, postcode = j + found_locations = self.postcode_lookup_obj[ + (self.postcode_lookup_obj["Postcode 1"] == postcode) | + (self.postcode_lookup_obj["Postcode 2"] == postcode) | + (self.postcode_lookup_obj["Postcode 3"] == postcode) + ] + if len(found_locations) == 1: + county, la = found_locations[["County Name", "Local Authority Name"]].values[0] + if la in counties: + cursor.execute("UPDATE employer SET insinuated_loc = %s, insinuated_loc_type = 'Local Authority' WHERE company_number = %s", (la, id_)) + + print("[%d/%d] Using local authority '%s' for postcode '%s'" % (i, len(postcodes), la, postcode)) + elif county in counties: + cursor.execute("UPDATE employer SET insinuated_loc = %s, insinuated_loc_type = 'County' WHERE company_number = %s", (county, id_)) + + print("[%d/%d] Using county '%s' for postcode '%s'" % (i, len(postcodes), county, postcode)) + elif "Northamptonshire" in la: + print("Manually fixing Northamptonshire...") + cursor.execute("UPDATE employer SET insinuated_loc = %s, insinuated_loc_type = 'County' WHERE company_number = %s", ("Northamptonshire", id_)) + elif "Bournemouth" in la: + print("Manually fixing Bournemouth...") + cursor.execute("UPDATE employer SET insinuated_loc = %s, insinuated_loc_type = 'County' WHERE company_number = %s", ("Bournemouth", id_)) + else: + print("[%d/%d] Didn't recoginse the local authority '%s' or the county '%s'" % (i, len(postcodes), la, county)) + else: + print("[%d/%d] Couldn't find a county for postcode '%s' (company id '%s')" % (i, len(postcodes), postcode, id_)) + + # break + self.__connection.commit() + + def _get_postcodes(self): + with self.__connection.cursor() as cursor: + cursor.execute("SELECT company_number, TRIM(SUBSTRING_INDEX(address, ',', -1)) FROM employer;") + return cursor.fetchall() + if __name__ == "__main__": @@ -347,6 +427,8 @@ if __name__ == "__main__": host = "db" with PayGapDatabase(host = host) as db: - print(db.get_years()) - print(db.get_pay_by_sic_section("bonuses", None)) + # print(db.get_years()) + # print(db.get_pay_by_sic_section("bonuses", None)) + print(db.get_heatmap_data("hourly", db.get_years()[0])) + # print(db.append_counties()) diff --git a/parser.py b/parser.py index 2918d73..f9e3e81 100644 --- a/parser.py +++ b/parser.py @@ -78,4 +78,8 @@ if __name__ == "__main__": host = "db" with database.PayGapDatabase(host = host) as db: - parse_csv(db, sys.argv[1]) \ No newline at end of file + p = sys.argv[1] + if os.path.basename(p) == "National_Statistics_Postcode_Lookup_UK.csv": + db.append_counties(p) + else: + parse_csv(db, p) \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 5e5339e..0dad989 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,3 +3,4 @@ lxml python-dotenv flask requests +pandas diff --git a/static/scripts.js b/static/scripts.js index 2b1e47e..9b52215 100644 --- a/static/scripts.js +++ b/static/scripts.js @@ -11,6 +11,7 @@ function collapseTogglePress(elem, a_elem, num_hidden) { const PLOT_FUNC_MAPPINGS = { "years": draw_plot_years, "sic_sec": draw_plot_sic_sections, + "heatmap": draw_heatmap } $(document).ready(function() { @@ -83,7 +84,7 @@ function form_api_url(containerName, filters) { // } // } // } - // console.log("fetching ", url.toString()); + console.log("fetching ", url.toString()); return url.toString(); } @@ -244,4 +245,75 @@ function draw_plot_sic_sections(containerName, filters) { }) }) }) +} + +function draw_heatmap(containerName, filters) { + fetch(form_api_url(containerName, filters)).then(resp => { + + const isPreview = (containerName.substring(1, 6) === "chart"); + + resp.json().then(data => { + + var data2 = []; + data.forEach(row => { + data2.push([row[0], row[2]]); + }); + console.log(data2); + + $.getJSON("/static/ukcounties.json", function(geojson) { + + console.log(geojson); + + Highcharts.mapChart(containerName, { + chart: { + map: geojson + }, + + title: { + text: null + }, + + accessibility: { + typeDescription: 'Map of the United Kingdom.' + }, + + mapNavigation: { + enabled: isPreview, + buttonOptions: { + verticalAlign: 'bottom' + } + }, + + legend: { + enabled: isPreview, + layout: 'vertical', + align: 'right', + verticalAlign: 'middle', + itemMarginTop: 10, + itemMarginBottom: 10 + }, + + colorAxis: { + stops: [ + [0, '#c4463a'], + [0.5, '#e6ffee'], + [0.9, '#009933'] + ], + min: -15 + }, + + series: [{ + data: data2, + keys: ['name', 'value'], + joinBy: 'name', + name: 'Pay Gap', + color: 'Green', + tooltip: { + valueSuffix: '%' + } + }] + }); + }); + }); + }) } \ No newline at end of file -- cgit v1.2.3