From 0faf95d56815d310290d3533d81d888deb7731f0 Mon Sep 17 00:00:00 2001
From: jwansek <eddie.atten.ea29@gmail.com>
Date: Tue, 16 May 2023 00:05:27 +0100
Subject: Added heatmap

---
 README.md         |  6 +++-
 app.py            | 25 +++++++++++++++-
 charts.json       | 11 +++++--
 database.py       | 86 +++++++++++++++++++++++++++++++++++++++++++++++++++++--
 parser.py         |  6 +++-
 requirements.txt  |  1 +
 static/scripts.js | 74 ++++++++++++++++++++++++++++++++++++++++++++++-
 7 files changed, 201 insertions(+), 8 deletions(-)
diff --git a/README.md b/README.md
index ec07041..ecd44d4 100644
--- a/README.md
+++ b/README.md
@@ -9,4 +9,8 @@ Visualisations of the UK gender pay gap data. Publically avaliable, not often se
 python3 parser.py ../RawData/UK\ Gender\ Pay\ Gap\ Data\ -\ 2019\ to\ 2020.csv
 ```
 
- - [ONS data](https://www.ons.gov.uk/aboutus/transparencyandgovernance/freedomofinformationfoi/ukpostcodestownsandcounties) is required to turn the address postcode to a county or unitary authority for the map. This data is rather complicated, thankfully the [Borough of Camden](https://www.data.gov.uk/dataset/7ec10db7-c8f4-4a40-8d82-8921935b4865/national-statistics-postcode-lookup-uk) has processed this data nicely for us.
+ - [ONS data](https://www.ons.gov.uk/aboutus/transparencyandgovernance/freedomofinformationfoi/ukpostcodestownsandcounties) is required to turn the address postcode to a county or unitary authority for the map. This data is rather complicated, thankfully the [Borough of Camden](https://www.data.gov.uk/dataset/7ec10db7-c8f4-4a40-8d82-8921935b4865/national-statistics-postcode-lookup-uk) has processed this data nicely for us. To append the county or local authority data, give `parser.py` a file downloaded from the above with the exact name:
+
+```
+python3 parser.py ../RawData/National_Statistics_Postcode_Lookup_UK.csv
+```
diff --git a/app.py b/app.py
index 4f354f7..d5d6ef1 100644
--- a/app.py
+++ b/app.py
@@ -86,6 +86,20 @@ def api_get_sic_section_pay():
         
         return flask.jsonify(db.get_pay_by_sic_section(pay_type, year))
 
+@app.route("/api/heatmap")
+def api_get_heatmap_data():
+    # pay_type = flask.request.args.get("Pay Type")
+    year = flask.request.args.get("year")
+    # print("year: '%s'" % year)
+    # if pay_type is None or pay_type.lower() not in {'hourly', 'bonuses'}:
+    #     return flask.abort(400, "The key `pay type` must be equal to 'hourly' or 'bonuses'")
+    with database.PayGapDatabase(host = host) as db:
+        if year is not None:
+            if year not in db.get_years():
+                return flask.abort(400, "Unrecognised year '%s'. The year option must be in %s" % (year, ", ".join(db.get_years())))
+        
+        return flask.jsonify(db.get_heatmap_data("hourly", year))
+
 @app.route("/api/getyears")
 def api_get_year_options():
     with database.PayGapDatabase(host = host) as db:
@@ -107,17 +121,25 @@ def search():
 
 def get_chart_elem(url):
     for i in get_charts()["index"]:
-        print(i["url"], url)
+        # print(i["url"], url)
         # if i["url"] == url:
         #     return i
         if url.startswith(i["url"]):
             return i
 
+def get_chart_elem_strict(url):
+    for i in get_charts()["index"]:
+        print(urllib.parse.urlsplit(i["url"]).path, urllib.parse.urlsplit(url).path)
+        if urllib.parse.urlsplit(i["url"]).path == urllib.parse.urlsplit(url).path:
+            return i
+
 @app.route("/plot/<name>")
 def serve_large_plot(name):
     with database.PayGapDatabase(host = host) as db:
         # print(flask.request.full_path)
         elem = get_chart_elem(flask.request.full_path)
+        # if elem is None:
+        #     elem = get_chart_elem_strict(flask.request.full_path)
         filters = elem["filters"]
         for k, v in filters.items():
             if v == "<SICType>":
@@ -132,6 +154,7 @@ def serve_large_plot(name):
     elem["url"] = flask.request.full_path
     # print("elem", elem)
     current_filters = dict(flask.request.args)
+    print("filters", filters)
     print("current_filters", current_filters)
     return flask.render_template(
         "plot.html.j2",
diff --git a/charts.json b/charts.json
index 91f2ab7..4b266aa 100644
--- a/charts.json
+++ b/charts.json
@@ -1,5 +1,12 @@
 {
     "index": [
+        {
+            "title": "Median Hourly Pay by County or Local Authority: Choropleth Map",
+            "url": "/plot/heatmap",
+            "filters": {
+                "Year": "<Years>"
+            }
+        },
         {
             "title": "Median Hourly Pay Difference by Year",
             "url": "/plot/years?Pay+Type=Hourly",
@@ -33,7 +40,7 @@
             }
         },
         {
-            "title": "Median Hourly pay Difference by SIC Section",
+            "title": "Median Hourly Pay Difference by SIC Section",
             "url": "/plot/sic_sec?Pay+Type=Hourly",
             "filters": {
                 "Pay Type": {
@@ -47,7 +54,7 @@
             }
         },
         {
-            "title": "Median Bonus pay Difference by SIC Section",
+            "title": "Median Bonus Pay Difference by SIC Section",
             "url": "/plot/sic_sec?Pay+Type=Bonuses",
             "filters": {
                 "Pay Type": {
diff --git a/database.py b/database.py
index 80edd3c..d67b96c 100644
--- a/database.py
+++ b/database.py
@@ -2,11 +2,14 @@ from dataclasses import dataclass
 import operator
 import datetime
 import pymysql
+import pandas
+import app
 import os
 
 @dataclass
 class PayGapDatabase:
     
+    postcode_lookup_obj = None
     host: str = "db"
     user: str = "root"
     passwd: str = None
@@ -336,6 +339,83 @@ class PayGapDatabase:
         
         return sorted(pay, key = operator.itemgetter(1), reverse = True)
 
+    def get_heatmap_data(self, pay_type, year = None):
+        sql = "SELECT  insinuated_loc, COUNT(insinuated_loc), -AVG("
+        if pay_type.lower() == "hourly":
+            sql += "DiffMedianHourlyPercent"
+        elif pay_type.lower() == "bonuses":
+            sql += "DiffMedianBonusPercent"
+        sql += """
+        ) FROM employer INNER JOIN pay ON pay.company_number = employer.company_number
+        WHERE insinuated_loc_type IS NOT NULL
+        """
+        if year is not None:
+            sql += " AND source LIKE %s"
+
+        sql += " GROUP BY insinuated_loc;"
+
+        with self.__connection.cursor() as cursor:
+            if year is None:
+                cursor.execute(sql)
+            else:
+                cursor.execute(sql, ("%" + year.replace("-", "to") + "%", ))
+
+            return [[i[0], i[1], float(i[2])] for i in cursor.fetchall()]
+
+    def _get_postcode_lookup_obj(self, path_):
+        return pandas.read_csv(path_)
+
+    def _get_counties(self):
+        return {feature["properties"]["name"] for feature in app.UK_GEOJSON["features"]}
+
+    def append_counties(self, path_):
+        if self.postcode_lookup_obj is None:
+            self.postcode_lookup_obj = self._get_postcode_lookup_obj(path_)
+
+        counties = self._get_counties()
+        postcodes = self._get_postcodes()
+
+        with self.__connection.cursor() as cursor:
+
+            cursor.execute("ALTER TABLE employer ADD COLUMN IF NOT EXISTS insinuated_loc VARCHAR(69) DEFAULT NULL;")
+            cursor.execute("ALTER TABLE employer ADD COLUMN IF NOT EXISTS insinuated_loc_type VARCHAR(25) DEFAULT NULL;")
+        
+            for i, j in enumerate(postcodes, 1):
+                id_, postcode = j
+                found_locations = self.postcode_lookup_obj[
+                    (self.postcode_lookup_obj["Postcode 1"] == postcode) | 
+                    (self.postcode_lookup_obj["Postcode 2"] == postcode) | 
+                    (self.postcode_lookup_obj["Postcode 3"] == postcode)
+                ]
+                if len(found_locations) == 1:
+                    county, la = found_locations[["County Name", "Local Authority Name"]].values[0]
+                    if la in counties:
+                        cursor.execute("UPDATE employer SET insinuated_loc = %s, insinuated_loc_type = 'Local Authority' WHERE company_number = %s", (la, id_))
+
+                        print("[%d/%d] Using local authority '%s' for postcode '%s'" % (i, len(postcodes), la, postcode))
+                    elif county in counties:
+                        cursor.execute("UPDATE employer SET insinuated_loc = %s, insinuated_loc_type = 'County' WHERE company_number = %s", (county, id_))
+
+                        print("[%d/%d] Using county '%s' for postcode '%s'" % (i, len(postcodes), county, postcode))
+                    elif "Northamptonshire" in la:
+                        print("Manually fixing Northamptonshire...")
+                        cursor.execute("UPDATE employer SET insinuated_loc = %s, insinuated_loc_type = 'County' WHERE company_number = %s", ("Northamptonshire", id_))
+                    elif "Bournemouth" in la:
+                        print("Manually fixing Bournemouth...")
+                        cursor.execute("UPDATE employer SET insinuated_loc = %s, insinuated_loc_type = 'County' WHERE company_number = %s", ("Bournemouth", id_))
+                    else:
+                        print("[%d/%d] Didn't recoginse the local authority '%s' or the county '%s'" % (i, len(postcodes), la, county))
+                else:
+                    print("[%d/%d] Couldn't find a county for postcode '%s' (company id '%s')" % (i, len(postcodes), postcode, id_))
+
+                # break
+        self.__connection.commit()
+
+    def _get_postcodes(self):
+        with self.__connection.cursor() as cursor:
+            cursor.execute("SELECT company_number, TRIM(SUBSTRING_INDEX(address, ',', -1)) FROM employer;")
+            return cursor.fetchall()
+
 
 
 if __name__ == "__main__":
@@ -347,6 +427,8 @@ if __name__ == "__main__":
         host = "db"
 
     with PayGapDatabase(host = host) as db:
-        print(db.get_years())
-        print(db.get_pay_by_sic_section("bonuses", None))
+        # print(db.get_years())
+        # print(db.get_pay_by_sic_section("bonuses", None))
+        print(db.get_heatmap_data("hourly", db.get_years()[0]))
+        # print(db.append_counties())
 
diff --git a/parser.py b/parser.py
index 2918d73..f9e3e81 100644
--- a/parser.py
+++ b/parser.py
@@ -78,4 +78,8 @@ if __name__ == "__main__":
         host = "db"
 
     with database.PayGapDatabase(host = host) as db:
-        parse_csv(db, sys.argv[1])
\ No newline at end of file
+        p = sys.argv[1]
+        if os.path.basename(p) == "National_Statistics_Postcode_Lookup_UK.csv":
+            db.append_counties(p)
+        else:
+            parse_csv(db, p)
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
index 5e5339e..0dad989 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -3,3 +3,4 @@ lxml
 python-dotenv
 flask
 requests
+pandas
diff --git a/static/scripts.js b/static/scripts.js
index 2b1e47e..9b52215 100644
--- a/static/scripts.js
+++ b/static/scripts.js
@@ -11,6 +11,7 @@ function collapseTogglePress(elem, a_elem, num_hidden) {
 const PLOT_FUNC_MAPPINGS = {
     "years": draw_plot_years,
     "sic_sec": draw_plot_sic_sections,
+    "heatmap": draw_heatmap
 }
 
 $(document).ready(function() {
@@ -83,7 +84,7 @@ function form_api_url(containerName, filters) {
     //         }
     //     }       
     // }
-    // console.log("fetching ", url.toString());
+    console.log("fetching ", url.toString());
     return url.toString();
 }
 
@@ -244,4 +245,75 @@ function draw_plot_sic_sections(containerName, filters) {
             })
         })
     })
+}
+
+function draw_heatmap(containerName, filters) {
+    fetch(form_api_url(containerName, filters)).then(resp => {
+
+        const isPreview = (containerName.substring(1, 6) === "chart");
+
+        resp.json().then(data => {
+
+            var data2 = [];
+            data.forEach(row => {
+                data2.push([row[0], row[2]]);
+            });
+            console.log(data2);
+
+            $.getJSON("/static/ukcounties.json", function(geojson) {
+
+                console.log(geojson);
+
+                Highcharts.mapChart(containerName, {
+                    chart: {
+                        map: geojson
+                    },
+            
+                    title: {
+                        text: null
+                    },
+            
+                    accessibility: {
+                        typeDescription: 'Map of the United Kingdom.'
+                    },
+            
+                    mapNavigation: {
+                        enabled: isPreview,
+                        buttonOptions: {
+                            verticalAlign: 'bottom'
+                        }
+                    },
+
+                    legend: {
+                        enabled: isPreview,
+                        layout: 'vertical',
+                        align: 'right',
+                        verticalAlign: 'middle',
+                        itemMarginTop: 10,
+                        itemMarginBottom: 10
+                    },
+            
+                    colorAxis: {
+                        stops: [
+                            [0, '#c4463a'],
+                            [0.5, '#e6ffee'],
+                            [0.9, '#009933']
+                        ],
+                        min: -15
+                    },
+            
+                    series: [{
+                        data: data2,
+                        keys: ['name', 'value'],
+                        joinBy: 'name',
+                        name: 'Pay Gap',
+                        color: 'Green',
+                        tooltip: {
+                            valueSuffix: '%'
+                        }
+                    }]
+                });
+            });       
+        });
+    })
 }
\ No newline at end of file
-- 
cgit v1.2.3