diff options
author | jwansek <eddie.atten.ea29@gmail.com> | 2022-05-01 16:03:24 +0100 |
---|---|---|
committer | jwansek <eddie.atten.ea29@gmail.com> | 2022-05-01 16:03:24 +0100 |
commit | abc7f067ff20bc2bd07d9236c30055549481547c (patch) | |
tree | d486cb4efda107633bcf243e5f60fdebd7094e5f /Smarker | |
parent | ee1b57ec6197c554f3c011f9a648e2222d845994 (diff) | |
download | Smarker-abc7f067ff20bc2bd07d9236c30055549481547c.tar.gz Smarker-abc7f067ff20bc2bd07d9236c30055549481547c.zip |
Finished plagiarism detector, added docs
Diffstat (limited to 'Smarker')
-rw-r--r-- | Smarker/assessments.py | 101 | ||||
-rw-r--r-- | Smarker/requirements.txt | 3 |
2 files changed, 85 insertions, 19 deletions
diff --git a/Smarker/assessments.py b/Smarker/assessments.py index b8eb6f0..cdcdcad 100644 --- a/Smarker/assessments.py +++ b/Smarker/assessments.py @@ -1,8 +1,12 @@ from dataclasses import dataclass +from matplotlib import pyplot as plt +import numpy as np import misc_classes import configparser import jinja_helpers import pycode_similar +import pandas as pd +import pickle import subprocess import operator import database @@ -15,6 +19,11 @@ import re @dataclass class SimilarityMetric: + """Abstract class for getting a metric of similariry between two python objects. + By default it uses pycode_similar as a metric, but this can be changed by overriding + ``get_similarity()``. There is also the additional attribute ``details`` for getting + a breakdown of similarity. + """ code_text_1:str code_text_2:str id_1:int @@ -37,10 +46,60 @@ class SimilarityMetric: self.details += line.decode() def get_similarity(self): + """Gets the similarity between the two codes. + + Returns: + float: A percentage similarity metric + """ return float(re.findall(r"\d+\.\d+\s", self.details)[0]) +def visualise_matrix(dataframe:pd.DataFrame, file_name): + """Visualize and draw a similarity matrix. Simply shows the figure, + therefore this doesn't work in docker. + + Args: + dataframe (pandas.DataFrame): Pandas dataframe representing the similarity + file_name (str): The file name that corrisponds to the dataframe. 
Used as the title + """ + print(file_name) + print(dataframe) + + values = dataframe.values + + fig, ax = plt.subplots() + ax.matshow(values, alpha = 0.3, cmap = plt.cm.Reds) + + # axes labels + xaxis = np.arange(len(dataframe.columns)) + ax.set_xticks(xaxis) + ax.set_yticks(xaxis) + ax.set_xticklabels(dataframe.columns) + ax.set_yticklabels(dataframe.index) + + # labelling each point + for i in range(values.shape[0]): + for j in range(values.shape[1]): + if i == j: + ax.text(x = j, y = i, s = "N/A", va = 'center', ha = 'center') + else: + ax.text(x = j, y = i, s = values[i, j], va = 'center', ha = 'center') + + plt.title(file_name) + plt.show() + -def generate_plagarism_report(assessment_name, db): +def generate_plagarism_report(assessment_name, db:database.SmarkerDatabase): + """Generates a plagarism report for the given ``assessment_name``. Only + fetches submissions with present files and without any exceptions. + + Args: + assessment_name (str): The name of the assessment to fetch submissions from + db (database.SmarkerDatabase): An open database object is required + + Returns: + dict: dict of ``pandas.core.frame.DataFrame`` objects indexed by the required file name + """ + # get submissions with files and no exception required_files = db.get_assessments_required_files(assessment_name) submission_ids_to_get = set() assessments = db.get_submissions(assessment_name) @@ -56,27 +115,31 @@ def generate_plagarism_report(assessment_name, db): un_added_student_nos.remove(id_) + # get similarity matrix + report = {} codes = db.get_submission_codes(submission_ids_to_get) for file_name, submissions in codes.items(): + d = {} + d_details = {} with tempfile.TemporaryDirectory() as td: - print(file_name, len(submissions)) for student_id, code in submissions: - with open(os.path.join(td, "%i.py" % student_id), "w") as f: - f.write(code) - - cmd = ["pycode_similar"] + [os.path.join(td, f) for f in os.listdir(td)] - print(" ".join(cmd)) - proc = subprocess.Popen(cmd, stdout = 
subprocess.PIPE) - stdout = "" - while True: - line = proc.stdout.readline() - if not line: - break - stdout += line.decode() - - print(stdout) - input("skfhsk") - + d[student_id] = [] + d_details[student_id] = [] + for student_id_2, code_2 in submissions: + sm = SimilarityMetric(code, code_2, student_id, student_id_2) + # print("%i and %i = %.3f" % (student_id, student_id_2, SimilarityMetric(code, code_2, student_id, student_id_2).get_similarity())) + d[student_id].append(sm.get_similarity()) + d_details[student_id].append(sm) + index = [i[0] for i in submissions] + visualise_matrix(pd.DataFrame(d, index = index), file_name) + report[file_name] = pd.DataFrame(d_details, index = index) + + out_path = os.path.realpath("plagarism_report_details.pickle") + with open(out_path, "wb") as f: + pickle.dump(report, f) + print("Written report to %s" % out_path) + + return report def getparser(): config = configparser.ConfigParser() @@ -116,7 +179,7 @@ def getparser(): "-s", "--create_student", action = misc_classes.EnvDefault, envvar = "create_student", - help = "Add a student in the form e.g. 123456789,Eden,Attenborough,E.Attenborough@uea.ac.uk", + help = "Add a student in the form e.g. 123456789,Eden Attenborough,E.Attenborough@uea.ac.uk", required = False ) parser.add_argument( diff --git a/Smarker/requirements.txt b/Smarker/requirements.txt index a8fef17..3be9c36 100644 --- a/Smarker/requirements.txt +++ b/Smarker/requirements.txt @@ -10,3 +10,6 @@ pdfkit lxml
pymysql
pycode_similar
+pandas
+matplotlib
+numpy
|