1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
|
from dataclasses import dataclass
import operator
import datetime
import pymysql
import os
@dataclass
class PayGapDatabase:
host: str = "db"
user: str = "root"
passwd: str = None
db: str = "paygap"
port: int = 3306
def __enter__(self):
if self.passwd is None:
self.passwd = os.environ["MYSQL_ROOT_PASSWORD"]
try:
self.__connection = self.__get_connection()
except Exception as e:
print(e)
if e.args[0] == 1049:
self.__connection = self.__build_db()
return self
def __exit__(self, type, value, traceback):
self.__connection.close()
def __get_connection(self):
return pymysql.connect(
host = self.host,
port = self.port,
user = self.user,
passwd = self.passwd,
charset = "utf8mb4",
database = self.db
)
def __build_db(self):
print("Building database...")
self.__connection = pymysql.connect(
host = self.host,
port = self.port,
user = self.user,
passwd = self.passwd,
charset = "utf8mb4",
)
with self.__connection.cursor() as cursor:
# unsafe:
cursor.execute("CREATE DATABASE %s" % self.db)
cursor.execute("USE %s" % self.db)
cursor.execute("""
CREATE TABLE sic_sections(
sic_section_id CHAR(1) NOT NULL PRIMARY KEY,
sic_section_name VARCHAR(128) NOT NULL
);
""")
cursor.execute("""
CREATE TABLE sic(
sic_code INT UNSIGNED NOT NULL PRIMARY KEY,
sic_description VARCHAR(512) NOT NULL,
sic_section CHAR(1) NOT NULL,
FOREIGN KEY (sic_section) REFERENCES sic_sections(sic_section_id)
);
""")
cursor.execute("""
CREATE TABLE employer(
company_number CHAR(8) NOT NULL PRIMARY KEY,
name VARCHAR(512) NOT NULL,
address TEXT NOT NULL,
postcode VARCHAR(8) NOT NULL,
policy_link VARCHAR(256) NULL,
responsible_person VARCHAR(128) NOT NULL,
size VARCHAR(20) NOT NULL,
current_name VARCHAR(512) NULL,
status VARCHAR(32) NULL,
type_ VARCHAR(128) NULL,
incorporated DATETIME NULL
)
""")
cursor.execute("""
CREATE TABLE employer_sic(
company_number CHAR(8) NOT NULL,
sic_code INT UNSIGNED NOT NULL,
PRIMARY KEY (company_number, sic_code),
FOREIGN KEY (company_number) REFERENCES employer(company_number),
FOREIGN KEY (sic_code) REFERENCES sic(sic_code)
);
""")
cursor.execute("""
CREATE TABLE pay(
pay_id INT UNSIGNED NOT NULL AUTO_INCREMENT PRIMARY KEY,
company_number CHAR(8) NOT NULL,
source VARCHAR(64) NOT NULL,
date_submitted DATETIME NOT NULL,
DiffMeanHourlyPercent DECIMAL(8,3) NOT NULL,
DiffMedianHourlyPercent DECIMAL(8,3) NOT NULL,
DiffMeanBonusPercent DECIMAL(8,3) NOT NULL,
DiffMedianBonusPercent DECIMAL(8,3) NOT NULL,
MaleBonusPercent DECIMAL(8,3) NOT NULL,
FemaleBonusPercent DECIMAL(8,3) NOT NULL,
MaleLowerQuartile DECIMAL(8,3) NOT NULL,
FemaleLowerQuartile DECIMAL(8,3) NOT NULL,
MaleLowerMiddleQuartile DECIMAL(8,3) NOT NULL,
FemaleLowerMiddleQuartile DECIMAL(8,3) NOT NULL,
MaleUpperMiddleQuartile DECIMAL(8,3) NOT NULL,
FemaleUpperMiddleQuartile DECIMAL(8,3) NOT NULL,
MaleTopQuartile DECIMAL(8,3) NOT NULL,
FemaleTopQuartile DECIMAL(8,3) NOT NULL,
FOREIGN KEY (company_number) REFERENCES employer(company_number)
);
""")
self.__connection.commit()
return self.__connection
def _wrap_percent(self, word):
return "%%%s%%" % (word)
def append_sic_sections(self, section_id, description):
# print("Section ID: '%s', Description: '%s'" % (section_id, description))
with self.__connection.cursor() as cursor:
cursor.execute("""
INSERT INTO sic_sections VALUES (%s, %s) ON DUPLICATE KEY UPDATE sic_section_name = %s;
""", (section_id, description, description))
self.__connection.commit()
def append_sic(self, code, description, section_id):
print("Appended code %d (%s) under section %s" % (code, description, section_id))
with self.__connection.cursor() as cursor:
cursor.execute("""
INSERT INTO sic VALUES (%s, %s, %s) ON DUPLICATE KEY UPDATE sic_description = %s, sic_section = %s;
""", (code, description, section_id, description, section_id))
self.__connection.commit()
def append_employer(self, company_number, name, address, postcode, policy_link, responsible_person, size, current_name, \
status, type_, incorporated, sics):
# print("incorporated: %s" % str(incorporated))
# print("sics", sics)
# print("name: %s" % name)
with self.__connection.cursor() as cursor:
cursor.execute("""
INSERT INTO employer (company_number, name, address, postcode, policy_link, responsible_person, size, current_name, status, type_, incorporated)
VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
ON DUPLICATE KEY UPDATE
name = %s, address = %s, postcode = %s, policy_link = %s, responsible_person = %s, size = %s,
current_name = %s, status = %s, type_ = %s, incorporated = %s;
""", (
company_number, name, address, postcode, policy_link, responsible_person, size, current_name, status, type_, incorporated,
name, address, postcode, policy_link, responsible_person, size, current_name, status, type_, incorporated
))
# sql = """INSERT INTO employer (company_number, name, address, postcode, policy_link, responsible_person, size, current_name, status, type_, incorporated)
# VALUES ('%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s');""" % (
# company_number, name, address, postcode, policy_link, responsible_person, size, current_name, status, type_, incorporated
# )
# print(sql)
self.append_employer_sics(company_number, sics)
self.__connection.commit()
def append_pay_info(self, company_number, source, date_submitted, diff_mean_hourly_percent, diff_median_hourly_percent, \
diff_mean_bonus_percent, diff_median_bonus_percent, male_bonus_percent, female_bonus_percent, male_lower_quartile, \
female_lower_quartile, male_lower_middle_quartile, female_lower_middle_quartile, male_upper_middle_quartile, \
female_upper_middle_quartile, male_top_quartile, female_top_quartile):
try:
float(diff_mean_hourly_percent)
except ValueError:
diff_mean_hourly_percent = None
with self.__connection.cursor() as cursor:
cursor.execute("DELETE FROM pay WHERE company_number = %s AND source = %s;", (company_number, source))
try:
cursor.execute("""
INSERT INTO pay (company_number, source, date_submitted, DiffMeanHourlyPercent, DiffMedianHourlyPercent,
DiffMeanBonusPercent, DiffMedianBonusPercent, MaleBonusPercent, FemaleBonusPercent, MaleLowerQuartile,
FemaleLowerQuartile, MaleLowerMiddleQuartile, FemaleLowerMiddleQuartile, MaleUpperMiddleQuartile,
FemaleUpperMiddleQuartile, MaleTopQuartile, FemaleTopQuartile) VALUES (
%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s
);
""", (
company_number, source, date_submitted, diff_mean_hourly_percent, diff_median_hourly_percent,
diff_mean_bonus_percent, diff_median_bonus_percent, male_bonus_percent, female_bonus_percent, male_lower_quartile,
female_lower_quartile, male_lower_middle_quartile, female_lower_middle_quartile, male_upper_middle_quartile,
female_upper_middle_quartile, male_top_quartile, female_top_quartile
))
except pymysql.err.DataError:
return
self.__connection.commit()
def append_employer_sics(self, company_number, sics):
with self.__connection.cursor() as cursor:
cursor.execute("DELETE FROM employer_sic WHERE company_number = %s", (company_number, ))
for sic in sics:
cursor.execute("SELECT * FROM sic WHERE sic_code = %s", (sic, ))
if cursor.fetchone() != None:
cursor.execute("INSERT INTO employer_sic VALUES (%s, %s);", (company_number, sic))
def search_company(self, company_prefix):
with self.__connection.cursor() as cursor:
cursor.execute("""
SELECT name, company_number FROM employer
WHERE name LIKE '%s' OR current_name LIKE '%s';
""" % (
self._wrap_percent(company_prefix),
self._wrap_percent(company_prefix)
))
return [(i[0].title(), i[1]) for i in cursor.fetchall()]
def get_company_types(self):
with self.__connection.cursor() as cursor:
cursor.execute("SELECT DISTINCT type_ FROM employer WHERE type_ IS NOT NULL;")
return [i[0] for i in cursor.fetchall()]
def get_company_sizes(self):
return [
"Not Provided",
"Less than 250",
"250 to 499",
"500 to 999",
"1000 to 4999",
"5000 to 19,999",
"20,000 or more"
]
def get_sic_sections(self):
with self.__connection.cursor() as cursor:
cursor.execute("SELECT sic_section_name FROM sic_sections")
return [i[0] for i in cursor.fetchall()]
def _source_name_to_year(self, source):
return os.path.splitext(source)[0].split("-")[-1].strip().replace("to", "-")
def get_pay_by_year(self, pay_type, sic_section_name = None, employer_type = None, employer_size = None):
sql = "SELECT source, -AVG("
if pay_type.lower() == "hourly":
sql += "DiffMedianHourlyPercent"
elif pay_type.lower() == "bonuses":
sql += "DiffMedianBonusPercent"
sql += ") FROM pay"
subqueries = []
args = []
if sic_section_name is not None:
subqueries.append("""
company_number IN (
SELECT DISTINCT company_number FROM employer_sic WHERE sic_code IN (
SELECT DISTINCT sic_code FROM sic WHERE sic_section = (
SELECT sic_section_id FROM sic_sections WHERE sic_section_name = %s
)
)
)""")
args.append(sic_section_name)
if employer_type is not None:
subqueries.append("""
company_number IN (
SELECT company_number FROM employer WHERE type_ = %s
)
""")
args.append(employer_type)
if employer_size is not None:
subqueries.append("""
company_number IN (
SELECT company_number FROM employer WHERE size = %s
)
""")
args.append(employer_size)
with self.__connection.cursor() as cursor:
if sic_section_name is not None or employer_type is not None or employer_size is not None:
sql += " WHERE {}".format(" OR ".join(subqueries))
sql += " GROUP BY source ORDER BY source;"
cursor.execute(sql, tuple(args))
else:
sql += " GROUP BY source ORDER BY source;"
cursor.execute(sql)
# print(sql)
# print(tuple(args))
return [(self._source_name_to_year(i[0]), float(i[1])) for i in cursor.fetchall()]
def get_years(self):
with self.__connection.cursor() as cursor:
cursor.execute("SELECT DISTINCT source FROM pay;")
return [self._source_name_to_year(i[0]) for i in cursor.fetchall()]
def get_pay_by_sic_section(self, pay_type, year = None):
pay = []
for section_name in self.get_sic_sections():
sql = "SELECT -AVG("
if pay_type.lower() == "hourly":
sql += "DiffMedianHourlyPercent"
elif pay_type.lower() == "bonuses":
sql += "DiffMedianBonusPercent"
sql += """
) FROM pay WHERE company_number IN (
SELECT DISTINCT company_number FROM employer_sic WHERE sic_code IN (
SELECT DISTINCT sic_code FROM sic WHERE sic_section = (
SELECT sic_section_id FROM sic_sections WHERE sic_section_name = %s
)
)
)
"""
if year is not None:
sql += " AND source LIKE %s"
sql += ";"
with self.__connection.cursor() as cursor:
# print(sql, (section_name, "%" + year.replace("to", "-") + "%"))
if year is None:
cursor.execute(sql, (section_name, ))
else:
cursor.execute(sql, (section_name, "%" + year.replace("-", "to") + "%"))
f = cursor.fetchone()[0]
if f is not None:
pay.append((section_name, float(f)))
return sorted(pay, key = operator.itemgetter(1), reverse = True)
if __name__ == "__main__":
if not os.path.exists(".docker"):
import dotenv
dotenv.load_dotenv(dotenv_path = "db.env")
host = "srv.home"
else:
host = "db"
with PayGapDatabase(host = host) as db:
print(db.get_years())
print(db.get_pay_by_sic_section("bonuses", None))
|