summaryrefslogtreecommitdiffstats
path: root/Smarker
diff options
context:
space:
mode:
authorjwansek <eddie.atten.ea29@gmail.com>2022-05-01 16:03:24 +0100
committerjwansek <eddie.atten.ea29@gmail.com>2022-05-01 16:03:24 +0100
commitabc7f067ff20bc2bd07d9236c30055549481547c (patch)
treed486cb4efda107633bcf243e5f60fdebd7094e5f /Smarker
parentee1b57ec6197c554f3c011f9a648e2222d845994 (diff)
downloadSmarker-abc7f067ff20bc2bd07d9236c30055549481547c.tar.gz
Smarker-abc7f067ff20bc2bd07d9236c30055549481547c.zip
Finished plagiarism detector, added docs
Diffstat (limited to 'Smarker')
-rw-r--r--Smarker/assessments.py101
-rw-r--r--Smarker/requirements.txt3
2 files changed, 85 insertions, 19 deletions
diff --git a/Smarker/assessments.py b/Smarker/assessments.py
index b8eb6f0..cdcdcad 100644
--- a/Smarker/assessments.py
+++ b/Smarker/assessments.py
@@ -1,8 +1,12 @@
from dataclasses import dataclass
+from matplotlib import pyplot as plt
+import numpy as np
import misc_classes
import configparser
import jinja_helpers
import pycode_similar
+import pandas as pd
+import pickle
import subprocess
import operator
import database
@@ -15,6 +19,11 @@ import re
@dataclass
class SimilarityMetric:
+ """Abstract class for getting a metric of similarity between two Python objects.
+ By default it uses pycode_similar as a metric, but this can be changed by overriding
+ ``get_similarity()``. There is also the additional attribute ``details`` for getting
+ a breakdown of similarity.
+ """
code_text_1:str
code_text_2:str
id_1:int
@@ -37,10 +46,60 @@ class SimilarityMetric:
self.details += line.decode()
def get_similarity(self):
+ """Gets the similarity between the two codes.
+
+ Returns:
+ float: A percentage similarity metric
+ """
return float(re.findall(r"\d+\.\d+\s", self.details)[0])
+def visualise_matrix(dataframe:pd.DataFrame, file_name):
+ """Visualize and draw a similarity matrix. Simply shows the figure,
+ therefore this doesn't work in docker.
+
+ Args:
+ dataframe (pandas.DataFrame): Pandas dataframe representing the similarity matrix
+ file_name (str): The file name that corresponds to the dataframe. Used as the title
+ """
+ print(file_name)
+ print(dataframe)
+
+ values = dataframe.values
+
+ fig, ax = plt.subplots()
+ ax.matshow(values, alpha = 0.3, cmap = plt.cm.Reds)
+
+ # axes labels
+ xaxis = np.arange(len(dataframe.columns))
+ ax.set_xticks(xaxis)
+ ax.set_yticks(xaxis)
+ ax.set_xticklabels(dataframe.columns)
+ ax.set_yticklabels(dataframe.index)
+
+ # labelling each point
+ for i in range(values.shape[0]):
+ for j in range(values.shape[1]):
+ if i == j:
+ ax.text(x = j, y = i, s = "N/A", va = 'center', ha = 'center')
+ else:
+ ax.text(x = j, y = i, s = values[i, j], va = 'center', ha = 'center')
+
+ plt.title(file_name)
+ plt.show()
+
-def generate_plagarism_report(assessment_name, db):
+def generate_plagarism_report(assessment_name, db:database.SmarkerDatabase):
+ """Generates a plagiarism report for the given ``assessment_name``. Only
+ fetches submissions with present files and without any exceptions.
+
+ Args:
+ assessment_name (str): The name of the assessment to fetch submissions from
+ db (database.SmarkerDatabase): An open database object is required
+
+ Returns:
+ dict: dict of ``pandas.core.frame.DataFrame`` objects indexed by the required file name
+ """
+ # get submissions with files and no exception
required_files = db.get_assessments_required_files(assessment_name)
submission_ids_to_get = set()
assessments = db.get_submissions(assessment_name)
@@ -56,27 +115,31 @@ def generate_plagarism_report(assessment_name, db):
un_added_student_nos.remove(id_)
+ # get similarity matrix
+ report = {}
codes = db.get_submission_codes(submission_ids_to_get)
for file_name, submissions in codes.items():
+ d = {}
+ d_details = {}
with tempfile.TemporaryDirectory() as td:
- print(file_name, len(submissions))
for student_id, code in submissions:
- with open(os.path.join(td, "%i.py" % student_id), "w") as f:
- f.write(code)
-
- cmd = ["pycode_similar"] + [os.path.join(td, f) for f in os.listdir(td)]
- print(" ".join(cmd))
- proc = subprocess.Popen(cmd, stdout = subprocess.PIPE)
- stdout = ""
- while True:
- line = proc.stdout.readline()
- if not line:
- break
- stdout += line.decode()
-
- print(stdout)
- input("skfhsk")
-
+ d[student_id] = []
+ d_details[student_id] = []
+ for student_id_2, code_2 in submissions:
+ sm = SimilarityMetric(code, code_2, student_id, student_id_2)
+ # print("%i and %i = %.3f" % (student_id, student_id_2, SimilarityMetric(code, code_2, student_id, student_id_2).get_similarity()))
+ d[student_id].append(sm.get_similarity())
+ d_details[student_id].append(sm)
+ index = [i[0] for i in submissions]
+ visualise_matrix(pd.DataFrame(d, index = index), file_name)
+ report[file_name] = pd.DataFrame(d_details, index = index)
+
+ out_path = os.path.realpath("plagarism_report_details.pickle")
+ with open(out_path, "wb") as f:
+ pickle.dump(report, f)
+ print("Written report to %s" % out_path)
+
+ return report
def getparser():
config = configparser.ConfigParser()
@@ -116,7 +179,7 @@ def getparser():
"-s", "--create_student",
action = misc_classes.EnvDefault,
envvar = "create_student",
- help = "Add a student in the form e.g. 123456789,Eden,Attenborough,E.Attenborough@uea.ac.uk",
+ help = "Add a student in the form e.g. 123456789,Eden Attenborough,E.Attenborough@uea.ac.uk",
required = False
)
parser.add_argument(
diff --git a/Smarker/requirements.txt b/Smarker/requirements.txt
index a8fef17..3be9c36 100644
--- a/Smarker/requirements.txt
+++ b/Smarker/requirements.txt
@@ -10,3 +10,6 @@ pdfkit
lxml
pymysql
pycode_similar
+pandas
+matplotlib
+numpy