Finished plagiarism detector, added docs

author: jwansek <eddie.atten.ea29@gmail.com> 2022-05-01 16:03:24 +0100
committer: jwansek <eddie.atten.ea29@gmail.com> 2022-05-01 16:03:24 +0100
commit: abc7f067ff20bc2bd07d9236c30055549481547c (patch)
tree: d486cb4efda107633bcf243e5f60fdebd7094e5f /Smarker
parent: ee1b57ec6197c554f3c011f9a648e2222d845994 (diff)
download: Smarker-abc7f067ff20bc2bd07d9236c30055549481547c.tar.gz
Smarker-abc7f067ff20bc2bd07d9236c30055549481547c.zip
2 files changed, 85 insertions, 19 deletions
diff --git a/Smarker/assessments.py b/Smarker/assessments.py
index b8eb6f0..cdcdcad 100644
--- a/Smarker/assessments.py
+++ b/Smarker/assessments.py
@@ -1,8 +1,12 @@
 from dataclasses import dataclass
+from matplotlib import pyplot as plt
+import numpy as np
 import misc_classes
 import configparser
 import jinja_helpers
 import pycode_similar
+import pandas as pd
+import pickle
 import subprocess
 import operator
 import database
@@ -15,6 +19,11 @@ import re
 
 @dataclass
 class SimilarityMetric:
+    """Abstract class for getting a metric of similarity between two Python objects.
+    By default it uses pycode_similar as a metric, but this can be changed by overriding
+    ``get_similarity()``. There is also the additional attribute ``details`` for getting
+    a breakdown of similarity.
+    """
     code_text_1:str
     code_text_2:str
     id_1:int
@@ -37,10 +46,60 @@ class SimilarityMetric:
                 self.details += line.decode()
 
     def get_similarity(self):
+        """Gets the similarity between the two codes.
+
+        Returns:
+            float: A percentage similarity metric
+        """
         return float(re.findall(r"\d+\.\d+\s", self.details)[0])
 
+def visualise_matrix(dataframe:pd.DataFrame, file_name):
+    """Visualize and draw a similarity matrix. Simply shows the figure,
+    therefore this doesn't work in docker.
+
+    Args:
+        dataframe (pandas.DataFrame): Pandas dataframe representing the similarity
+        file_name (str): The file name that corresponds to the dataframe. Used as the title
+    """
+    print(file_name)
+    print(dataframe)
+
+    values = dataframe.values
+
+    fig, ax = plt.subplots()
+    ax.matshow(values, alpha = 0.3, cmap = plt.cm.Reds)
+
+    # axes labels
+    xaxis = np.arange(len(dataframe.columns))
+    ax.set_xticks(xaxis)
+    ax.set_yticks(xaxis)
+    ax.set_xticklabels(dataframe.columns)
+    ax.set_yticklabels(dataframe.index)
+
+    # labelling each point
+    for i in range(values.shape[0]):
+        for j in range(values.shape[1]):
+            if i == j:
+                ax.text(x = j, y = i, s = "N/A", va = 'center', ha = 'center')
+            else:
+                ax.text(x = j, y = i, s = values[i, j], va = 'center', ha = 'center')
+    
+    plt.title(file_name)
+    plt.show()
+
 
-def generate_plagarism_report(assessment_name, db):
+def generate_plagarism_report(assessment_name, db:database.SmarkerDatabase):
+    """Generates a plagiarism report for the given ``assessment_name``. Only
+    fetches submissions with present files and without any exceptions.
+
+    Args:
+        assessment_name (str): The name of the assessment to fetch submissions from
+        db (database.SmarkerDatabase): An open database object is required
+
+    Returns:
+        dict: dict of ``pandas.core.frame.DataFrame`` objects indexed by the required file name
+    """
+    # get submissions with files and no exception
     required_files = db.get_assessments_required_files(assessment_name)
     submission_ids_to_get = set()
     assessments = db.get_submissions(assessment_name)
@@ -56,27 +115,31 @@ def generate_plagarism_report(assessment_name, db):
 
             un_added_student_nos.remove(id_)
     
+    # get similarity matrix
+    report = {}
     codes = db.get_submission_codes(submission_ids_to_get)
     for file_name, submissions in codes.items():
+        d = {}
+        d_details = {}
         with tempfile.TemporaryDirectory() as td:
-            print(file_name, len(submissions))
             for student_id, code in submissions:
-                with open(os.path.join(td, "%i.py" % student_id), "w") as f:
-                    f.write(code)
-
-            cmd = ["pycode_similar"] + [os.path.join(td, f) for f in os.listdir(td)]
-            print(" ".join(cmd))
-            proc = subprocess.Popen(cmd, stdout = subprocess.PIPE)
-            stdout = ""
-            while True:
-                line = proc.stdout.readline()
-                if not line:
-                    break
-                stdout += line.decode()
-
-            print(stdout)
-            input("skfhsk")
-                    
+                d[student_id] = []
+                d_details[student_id] = []
+                for student_id_2, code_2 in submissions:
+                    sm = SimilarityMetric(code, code_2, student_id, student_id_2)
+                    # print("%i and %i = %.3f" % (student_id, student_id_2, SimilarityMetric(code, code_2, student_id, student_id_2).get_similarity()))
+                    d[student_id].append(sm.get_similarity())
+                    d_details[student_id].append(sm)
+        index = [i[0] for i in submissions]
+        visualise_matrix(pd.DataFrame(d, index = index), file_name)
+        report[file_name] = pd.DataFrame(d_details, index = index)
+
+    out_path = os.path.realpath("plagarism_report_details.pickle")
+    with open(out_path, "wb") as f:
+        pickle.dump(report, f)
+    print("Written report to %s" % out_path)
+
+    return report
 
 def getparser():
     config = configparser.ConfigParser()
@@ -116,7 +179,7 @@ def getparser():
         "-s", "--create_student",
         action = misc_classes.EnvDefault,
         envvar = "create_student",
-        help = "Add a student in the form e.g. 123456789,Eden,Attenborough,E.Attenborough@uea.ac.uk",
+        help = "Add a student in the form e.g. 123456789,Eden Attenborough,E.Attenborough@uea.ac.uk",
         required = False
     )
     parser.add_argument(
diff --git a/Smarker/requirements.txt b/Smarker/requirements.txt
index a8fef17..3be9c36 100644
--- a/Smarker/requirements.txt
+++ b/Smarker/requirements.txt
@@ -10,3 +10,6 @@ pdfkit
 lxml
 pymysql
 pycode_similar
+pandas
+matplotlib
+numpy
author	jwansek <eddie.atten.ea29@gmail.com>	2022-05-01 16:03:24 +0100
committer	jwansek <eddie.atten.ea29@gmail.com>	2022-05-01 16:03:24 +0100
commit	abc7f067ff20bc2bd07d9236c30055549481547c (patch)
tree	d486cb4efda107633bcf243e5f60fdebd7094e5f /Smarker
parent	ee1b57ec6197c554f3c011f9a648e2222d845994 (diff)
download	Smarker-abc7f067ff20bc2bd07d9236c30055549481547c.tar.gz Smarker-abc7f067ff20bc2bd07d9236c30055549481547c.zip