19 files changed, 478 insertions, 22 deletions
diff --git a/.gitignore b/.gitignore
index 78a3197..db2ce2b 100644
--- a/.gitignore
+++ b/.gitignore
@@ -3,6 +3,7 @@ out/
 *.zip
 smarker.conf
 *.aux
+*.pickle
 
 # Byte-compiled / optimized / DLL files
 __pycache__/
diff --git a/README.md b/README.md
index 2914835..3b61499 120000
--- a/README.md
+++ b/README.md
@@ -32,3 +32,7 @@ File with an exception
 ![pytest](https://smarker.eda.gay/_static/readme_pytest.png)
 
 Using pytest
+
+![matrix](https://smarker.eda.gay/_static/readme_matrix.png)
+
+Plagiarism and collusion detection matrix
diff --git a/Smarker/assessments.py b/Smarker/assessments.py
index b8eb6f0..cdcdcad 100644
--- a/Smarker/assessments.py
+++ b/Smarker/assessments.py
@@ -1,8 +1,12 @@
 from dataclasses import dataclass
+from matplotlib import pyplot as plt
+import numpy as np
 import misc_classes
 import configparser
 import jinja_helpers
 import pycode_similar
+import pandas as pd
+import pickle
 import subprocess
 import operator
 import database
@@ -15,6 +19,11 @@ import re
 
 @dataclass
 class SimilarityMetric:
+    """Abstract class for getting a metric of similarity between two python objects.
+    By default it uses pycode_similar as a metric, but this can be changed by overriding
+    ``get_similarity()``. There is also the additional attribute ``details`` for getting
+    a breakdown of similarity.
+    """
     code_text_1:str
     code_text_2:str
     id_1:int
@@ -37,10 +46,60 @@ class SimilarityMetric:
                 self.details += line.decode()
 
     def get_similarity(self):
+        """Gets the similarity between the two codes.
+
+        Returns:
+            float: A percentage similarity metric
+        """
         return float(re.findall(r"\d+\.\d+\s", self.details)[0])
 
+def visualise_matrix(dataframe:pd.DataFrame, file_name):
+    """Visualize and draw a similarity matrix. Simply shows the figure,
+    therefore this doesn't work in docker.
+
+    Args:
+        dataframe (pandas.DataFrame): Pandas dataframe representing the similarity
+        file_name (str): The file name that corresponds to the dataframe. Used as the title
+    """
+    print(file_name)
+    print(dataframe)
+
+    values = dataframe.values
+
+    fig, ax = plt.subplots()
+    ax.matshow(values, alpha = 0.3, cmap = plt.cm.Reds)
+
+    # axes labels
+    xaxis = np.arange(len(dataframe.columns))
+    ax.set_xticks(xaxis)
+    ax.set_yticks(xaxis)
+    ax.set_xticklabels(dataframe.columns)
+    ax.set_yticklabels(dataframe.index)
+
+    # labelling each point
+    for i in range(values.shape[0]):
+        for j in range(values.shape[1]):
+            if i == j:
+                ax.text(x = j, y = i, s = "N/A", va = 'center', ha = 'center')
+            else:
+                ax.text(x = j, y = i, s = values[i, j], va = 'center', ha = 'center')
+    
+    plt.title(file_name)
+    plt.show()
+
 
-def generate_plagarism_report(assessment_name, db):
+def generate_plagarism_report(assessment_name, db:database.SmarkerDatabase):
+    """Generates a plagiarism report for the given ``assessment_name``. Only
+    fetches submissions with present files and without any exceptions.
+
+    Args:
+        assessment_name (str): The name of the assessment to fetch submissions from
+        db (database.SmarkerDatabase): An open database object is required
+
+    Returns:
+        dict: dict of ``pandas.core.frame.DataFrame`` objects indexed by the required file name
+    """
+    # get submissions with files and no exception
     required_files = db.get_assessments_required_files(assessment_name)
     submission_ids_to_get = set()
     assessments = db.get_submissions(assessment_name)
@@ -56,27 +115,31 @@ def generate_plagarism_report(assessment_name, db):
 
             un_added_student_nos.remove(id_)
     
+    # get similarity matrix
+    report = {}
     codes = db.get_submission_codes(submission_ids_to_get)
     for file_name, submissions in codes.items():
+        d = {}
+        d_details = {}
         with tempfile.TemporaryDirectory() as td:
-            print(file_name, len(submissions))
             for student_id, code in submissions:
-                with open(os.path.join(td, "%i.py" % student_id), "w") as f:
-                    f.write(code)
-
-            cmd = ["pycode_similar"] + [os.path.join(td, f) for f in os.listdir(td)]
-            print(" ".join(cmd))
-            proc = subprocess.Popen(cmd, stdout = subprocess.PIPE)
-            stdout = ""
-            while True:
-                line = proc.stdout.readline()
-                if not line:
-                    break
-                stdout += line.decode()
-
-            print(stdout)
-            input("skfhsk")
-                    
+                d[student_id] = []
+                d_details[student_id] = []
+                for student_id_2, code_2 in submissions:
+                    sm = SimilarityMetric(code, code_2, student_id, student_id_2)
+                    # print("%i and %i = %.3f" % (student_id, student_id_2, SimilarityMetric(code, code_2, student_id, student_id_2).get_similarity()))
+                    d[student_id].append(sm.get_similarity())
+                    d_details[student_id].append(sm)
+        index = [i[0] for i in submissions]
+        visualise_matrix(pd.DataFrame(d, index = index), file_name)
+        report[file_name] = pd.DataFrame(d_details, index = index)
+
+    out_path = os.path.realpath("plagarism_report_details.pickle")
+    with open(out_path, "wb") as f:
+        pickle.dump(report, f)
+    print("Written report to %s" % out_path)
+
+    return report
 
 def getparser():
     config = configparser.ConfigParser()
@@ -116,7 +179,7 @@ def getparser():
         "-s", "--create_student",
         action = misc_classes.EnvDefault,
         envvar = "create_student",
-        help = "Add a student in the form e.g. 123456789,Eden,Attenborough,E.Attenborough@uea.ac.uk",
+        help = "Add a student in the form e.g. 123456789,Eden Attenborough,E.Attenborough@uea.ac.uk",
         required = False
     )
     parser.add_argument(
diff --git a/Smarker/requirements.txt b/Smarker/requirements.txt
index a8fef17..3be9c36 100644
--- a/Smarker/requirements.txt
+++ b/Smarker/requirements.txt
@@ -10,3 +10,6 @@ pdfkit
 lxml
 pymysql
 pycode_similar
+pandas
+matplotlib
+numpy
diff --git a/docs/source/_static/QuickStart/simple_assessment.yml b/docs/source/_static/QuickStart/simple_assessment.yml
new file mode 100644
index 0000000..414f00b
--- /dev/null
+++ b/docs/source/_static/QuickStart/simple_assessment.yml
@@ -0,0 +1,12 @@
+name: simple_assessment
+files:
+    - euclid.py:
+        functions:
+            - gcd(2)
+        tests:
+            - |
+                assert euclid.gcd(8,12) == 4
+        run:
+            - python euclid.py:
+                regexes:
+                    - ^4
diff --git a/docs/source/_static/QuickStart/simple_submission_1/euclid.py b/docs/source/_static/QuickStart/simple_submission_1/euclid.py
new file mode 100644
index 0000000..f72707a
--- /dev/null
+++ b/docs/source/_static/QuickStart/simple_submission_1/euclid.py
@@ -0,0 +1,22 @@
+# the newest!
+# assessment 1
+
+def gcd(m,n) -> int:
+    """Calculates the greatest common denominator between two numbers.
+
+    Args:
+        x (int): Number One
+        y (int): Number Two
+
+    Returns:
+        int: The GCD of the two numbers
+    """
+    if m< n:
+        (m,n) = (n,m)
+    if(m%n) == 0:
+        return n
+    else:
+        return (gcd(n, m % n)) # recursion taking place
+
+# gcd
+print(gcd(8,12))
diff --git a/docs/source/_static/QuickStart/simple_submission_2/euclid.py b/docs/source/_static/QuickStart/simple_submission_2/euclid.py
new file mode 100644
index 0000000..0819bc5
--- /dev/null
+++ b/docs/source/_static/QuickStart/simple_submission_2/euclid.py
@@ -0,0 +1,10 @@
+def gcd(m,n):
+    if m< n:
+        (m,n) = (n,m)
+    if(m%n) == 0:
+        return n
+    else:
+        return (gcd(n, m % n)) # recursion taking place
+
+# calling function with parameters and printing it out
+print(gcd(8,12))
diff --git a/docs/source/_static/QuickStart/simple_submission_3/euclid.py b/docs/source/_static/QuickStart/simple_submission_3/euclid.py
new file mode 100644
index 0000000..73e7d9c
--- /dev/null
+++ b/docs/source/_static/QuickStart/simple_submission_3/euclid.py
@@ -0,0 +1,11 @@
+def gcd(p,q):
+    """Docstring gcd"""
+    if p < q:
+        (p,q) = (q,p)
+    if(p%q) == 0:
+        return q
+    else:
+        return (gcd(q, p % q)) # recursion taking place
+
+# calling function with parameters and printing it out
+print(gcd(8,12))
diff --git a/docs/source/_static/QuickStart/simple_submission_4/euclid.py b/docs/source/_static/QuickStart/simple_submission_4/euclid.py
new file mode 100644
index 0000000..064d1e5
--- /dev/null
+++ b/docs/source/_static/QuickStart/simple_submission_4/euclid.py
@@ -0,0 +1,16 @@
+# assessment A
+# student id: 4
+
+def gcd(x,y):
+    if x > y:
+        small = y
+    else:
+        small = x
+    for i in range(1, small+1):
+        if((x % i == 0) and (y % i == 0)):
+            g = i
+              
+    return g
+
+# calling function with parameters and printing it out
+print(gcd(8,12))
diff --git a/docs/source/_static/readme_matrix.png b/docs/source/_static/readme_matrix.png
new file mode 100644
index 0000000..e91358b
--- /dev/null
+++ b/docs/source/_static/readme_matrix.png
diff --git a/docs/source/_static/report.txt b/docs/source/_static/report.txt
new file mode 100644
index 0000000..b78e20b
--- /dev/null
+++ b/docs/source/_static/report.txt
@@ -0,0 +1,9 @@
+euclid.py
+        2  ...       1
+2  100.00  ...   94.74
+3  100.00  ...   94.74
+4   63.16  ...   57.89
+1   94.74  ...  100.00
+
+[4 rows x 4 columns]
+Written report to /Smarker/plagarism_report_details.pickle
diff --git a/docs/source/_static/simple.json b/docs/source/_static/simple.json
new file mode 100644
index 0000000..40accc7
--- /dev/null
+++ b/docs/source/_static/simple.json
@@ -0,0 +1,60 @@
+{
+    "files": [
+        {
+            "euclid.py": {
+                "functions": [
+                    {
+                        "gcd(2)": {
+                            "present": true,
+                            "documentation": {
+                                "comments": "None",
+                                "doc": "Docstring gcd"
+                            },
+                            "arguments": "(p, q)",
+                            "minimum_arguments": 2,
+                            "source_code": "def gcd(p,q):\n    \"\"\"Docstring gcd\"\"\"\n    if p < q:\n        (p,q) = (q,p)\n    if(p%q) == 0:\n        return q\n    else:\n        return (gcd(q, p % q)) # recursion taking place"
+                        }
+                    }
+                ],
+                "run": [
+                    {
+                        "python euclid.py": {
+                            "regexes": {
+                                "^4": [
+                                    "4"
+                                ]
+                            },
+                            "full_output": "4\n"
+                        }
+                    }
+                ],
+                "tests": [
+                    "assert euclid.gcd(8,12) == 4\n"
+                ],
+                "present": true,
+                "has_exception": false,
+                "documentation": {
+                    "comments": "None",
+                    "doc": "None"
+                }
+            }
+        }
+    ],
+    "name": "simple_assessment",
+    "student_no": "123456790",
+    "test_results": {
+        "pytest_report": "============================= test session starts ==============================\nplatform linux -- Python 3.10.4, pytest-7.1.1, pluggy-1.0.0 -- /usr/bin/python3\ncachedir: .pytest_cache\nrootdir: /tmp/tmpjzy020i4/simple_submission_3\ncollecting ... collected 1 item\n\n../../../../../../tmp/tmpjzy020i4/simple_submission_3/test_euclid.py::test_1 PASSED [100%]\n\n--------------- generated xml file: /tmp/tmpyu0qypji/report.xml ----------------\n============================== 1 passed in 0.01s ===============================\n",
+        "junitxml": "<?xml version=\"1.0\" encoding=\"utf-8\"?><testsuites><testsuite name=\"pytest\" errors=\"0\" failures=\"0\" skipped=\"0\" tests=\"1\" time=\"0.019\" timestamp=\"2022-05-01T15:03:57.143881\" hostname=\"thonkpad2\"><testcase classname=\"test_euclid\" name=\"test_1\" time=\"0.001\" /></testsuite></testsuites>",
+        "meta": {
+            "name": "pytest",
+            "errors": "0",
+            "failures": "0",
+            "skipped": "0",
+            "tests": "1",
+            "time": "0.019",
+            "timestamp": "2022-05-01T15:03:57.143881",
+            "hostname": "thonkpad2"
+        }
+    },
+    "class_tree": {}
+}
diff --git a/docs/source/_static/simple.txt b/docs/source/_static/simple.txt
new file mode 100644
index 0000000..b6dcc16
--- /dev/null
+++ b/docs/source/_static/simple.txt
@@ -0,0 +1,89 @@
+============================= test session starts ==============================
+platform linux -- Python 3.10.4, pytest-7.1.2, pluggy-1.0.0 -- /usr/bin/python3
+cachedir: .pytest_cache
+rootdir: /tmp/tmp398_c3x6/simple_submission_1
+collecting ... collected 1 item
+
+../tmp/tmp398_c3x6/simple_submission_1/test_euclid.py::test_1 PASSED     [100%]
+
+--------------- generated xml file: /tmp/tmpceag5_nn/report.xml ----------------
+============================== 1 passed in 0.01s ===============================
+4
+=== simple_assessment - Student ID: 1 Automatic marking report ===
+Report generated at 2022-05-01 15:49:15.701124
+
+== Class Tree: ==
+
+{}
+
+
+== File Analysis ==
+
+    = euclid.py =
+        Documentation:
+            28 characters long
+            Comments:
+                ```
+                # the newest!
+                # assessment 1
+                ```
+            Docstring:
+                *** No docstring present ***
+        Functions:
+            gcd(2):
+                Arguments:
+                    (m, n) -> int
+                    Enough? YES
+                Documentation:
+                    164 characters long
+                    Comments:
+                        *** No comments present ***
+                    Docstring:
+            ```
+            Calculates the greatest common denominator between two numbers.
+
+            Args:
+                x (int): Number One
+                y (int): Number Two
+
+            Returns:
+                int: The GCD of the two numbers
+            ```
+                Source:
+                    15 lines (356 characters)
+                    Code:
+            ```
+            def gcd(m,n) -> int:
+                """Calculates the greatest common denominator between two numbers.
+
+                Args:
+                    x (int): Number One
+                    y (int): Number Two
+
+                Returns:
+                    int: The GCD of the two numbers
+                """
+                if m< n:
+                    (m,n) = (n,m)
+                if(m%n) == 0:
+                    return n
+                else:
+                    return (gcd(n, m % n)) # recursion taking place
+            ```
+        Runtime Analysis:
+            Command `python euclid.py`:
+                Monitor:
+                    stdout
+                Regexes:
+                    `^4`:
+                        Found occurrences: 1
+                        Occurrences list:
+                            4
+                Full runtime output:
+                ```
+                    4
+    
+                ```
+
+
+
diff --git a/docs/source/assessments.rst b/docs/source/assessments.rst
new file mode 100644
index 0000000..a8d7311
--- /dev/null
+++ b/docs/source/assessments.rst
@@ -0,0 +1,24 @@
+.. _assessments:
+
+``assessments.py``
+==================
+
+``assessments.py`` contains many useful arguments for interacting with the database:
+
+.. argparse::
+   :module: assessments
+   :func: getparser
+   :prog: python Smarker/assessments.py
+
+Classes
+*******
+
+.. autoclass:: assessments.SimilarityMetric
+    :members:
+
+Functions
+*********
+
+.. autofunction:: assessments.visualise_matrix
+
+.. autofunction:: assessments.generate_plagarism_report
+\ No newline at end of file
diff --git a/docs/source/docker.rst b/docs/source/docker.rst
index 7c3237a..232c7f4 100644
--- a/docs/source/docker.rst
+++ b/docs/source/docker.rst
@@ -41,4 +41,10 @@ To list assessments in the database using docker:
 
 .. code-block:: bash
     
-    sudo docker run -it --entrypoint python --rm smarker assessments.py --list yes
-\ No newline at end of file
+    sudo docker run -it --entrypoint python --rm smarker assessments.py --list yes
+
+.. code-block:: bash
+
+    touch out/report.pickle && sudo docker run -v "$(pwd)/out/report.pickle":/Smarker/plagarism_report_details.pickle -it --entrypoint python --rm smarker assessments.py --plagarism_report example
+
+If a file doesn't exist before it's passed through as a volume in docker, it will be created automatically as a *directory*. This causes issues when the container then tries to write a file to that path, so we create a blank file first.
+\ No newline at end of file
diff --git a/docs/source/index.rst b/docs/source/index.rst
index e36cc86..f2d7426 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -1,5 +1,7 @@
 .. mdinclude:: readme.md
 
+Read the :ref:`quickstart`.
+
 Setting up
 ----------
 
@@ -26,6 +28,8 @@ Please note that the ``-o`` flag is required for rendering to PDFs.
 
 ``assessments.py`` contains many useful arguments for interacting with the database:
 
+Also see :ref:`assessments`
+
 .. argparse::
    :module: assessments
    :func: getparser
@@ -37,11 +41,13 @@ Please note that the ``-o`` flag is required for rendering to PDFs.
    
    reflect.rst
    database.rst
+   assessments.rst
 
 .. toctree::
    :maxdepth: 2
    :caption: Other Pages:
 
+   quickstart.rst
    configfile.rst
    docker.rst
    assessmentyaml.rst
diff --git a/docs/source/quickstart.rst b/docs/source/quickstart.rst
new file mode 100644
index 0000000..08f3cec
--- /dev/null
+++ b/docs/source/quickstart.rst
@@ -0,0 +1,97 @@
+.. _quickstart:
+
+Quick start guide
+=================
+
+This guide implements a simple assessment to make a *greatest common divisor* function.
+
+First make an assessment yaml file:
+
+.. literalinclude:: _static/QuickStart/simple_assessment.yml
+    :linenos:
+    :language: yaml
+
+This expects a single function called ``gcd()`` in a file called ``euclid.py`` with no fewer
+than two arguments. It expects it to print ``4`` to stdout when executed. It also runs pytest
+on the function.
+
+Then add it to the database:
+
+.. code-block:: bash
+
+    docker run -v "$(pwd)/docs/source/_static/QuickStart/simple_assessment.yml":/tmp/assessment.yml -it --entrypoint python --rm smarker assessments.py -c /tmp/assessment.yml
+
+If using Windows, I recommend using the MinGW shell since PowerShell is bad at dealing with relative file paths in docker.
+
+Then add some students:
+
+.. code-block:: bash
+
+    docker run -v "$(pwd)/docs/source/_static/QuickStart/simple_assessment.yml":/tmp/assessment.yml -it --entrypoint python --rm smarker assessments.py -s "1,Alice,a.bar@uea.ac.uk"
+    docker run -v "$(pwd)/docs/source/_static/QuickStart/simple_assessment.yml":/tmp/assessment.yml -it --entrypoint python --rm smarker assessments.py -s "2,Bob,b.bar@uea.ac.uk"
+    docker run -v "$(pwd)/docs/source/_static/QuickStart/simple_assessment.yml":/tmp/assessment.yml -it --entrypoint python --rm smarker assessments.py -s "3,Christina,c.bar@uea.ac.uk"
+    docker run -v "$(pwd)/docs/source/_static/QuickStart/simple_assessment.yml":/tmp/assessment.yml -it --entrypoint python --rm smarker assessments.py -s "4,Dan,d.bar@uea.ac.uk"
+
+Now we are ready to make some reports! The submissions are zip files with the student's id as the name. First let's just use the default parameters:
+
+.. code-block:: bash
+
+    docker run -v "$(pwd)/docs/source/_static/QuickStart/1.zip":/tmp/1.zip -e submission=/tmp/1.zip -e assessment=simple_assessment --rm smarker
+
+This prints out the result as text to stdout:
+
+.. literalinclude:: _static/simple.txt
+
+Smarker can render to text, markdown, json, yaml and PDF, and can produce less detailed output, but for now we'll only use the defaults.
+Do the same for the other three submissions.
+
+We can now generate a plagiarism report. But first, let's look at the actual submitted files. Here's the submission from student 1:
+
+.. literalinclude:: _static/QuickStart/simple_submission_1/euclid.py
+    :linenos:
+    :language: python
+
+Student 2:
+
+.. literalinclude:: _static/QuickStart/simple_submission_2/euclid.py
+    :linenos:
+    :language: python
+
+Student 3:
+
+.. literalinclude:: _static/QuickStart/simple_submission_3/euclid.py
+    :linenos:
+    :language: python
+
+Student 4:
+
+.. literalinclude:: _static/QuickStart/simple_submission_4/euclid.py
+    :linenos:
+    :language: python
+
+From this we can tell that student 2 has copied from student 1 (or the other way around), changing only the comments, docstring and return annotation.
+Student 3 has also copied from student 1, but has changed the variable names in an attempt to hide it. Submission 4 is completely different.
+
+Now we can generate a plagiarism report:
+
+.. code-block:: bash
+
+    touch out/report.pickle && sudo docker run -v "$(pwd)/out/report.pickle":/Smarker/plagarism_report_details.pickle -it --entrypoint python --rm smarker assessments.py --plagarism_report simple_assessment
+
+Which produces a pickled report matrix, and prints out to stdout:
+
+.. code-block:: text
+
+            2       3       4       1
+    2  100.00  100.00   42.86   94.74
+    3  100.00  100.00   42.86   94.74
+    4   63.16   63.16  100.00   57.89
+    1   94.74   94.74   39.29  100.00
+    Written report to /Smarker/plagarism_report_details.pickle
+
+If we run it outside of docker, we can also get it rendered nicely in matplotlib:
+
+.. image:: _static/readme_matrix.png
+
+The matrix isn't symmetrical, which is intentional, since it considers the difference in complexity between submissions. This can be useful for
+finding the culprit in copying.
+\ No newline at end of file
diff --git a/docs/source/readme.md b/docs/source/readme.md
index 2914835..3b61499 100644
--- a/docs/source/readme.md
+++ b/docs/source/readme.md
@@ -32,3 +32,7 @@ File with an exception
 ![pytest](https://smarker.eda.gay/_static/readme_pytest.png)
 
 Using pytest
+
+![matrix](https://smarker.eda.gay/_static/readme_matrix.png)
+
+Plagiarism and collusion detection matrix
diff --git a/docs/source/reflect.rst b/docs/source/reflect.rst
index c059206..6c0767a 100644
--- a/docs/source/reflect.rst
+++ b/docs/source/reflect.rst
@@ -1,5 +1,24 @@
 ``reflect.py``: Getting information about code
 ==============================================
 
-.. automodule:: reflect
-    :members:
-\ No newline at end of file
+Classes
+*******
+
+.. autoclass:: reflect.Reflect
+    :members:
+
+.. autoexception::  reflect.MonitoredFileNotInProducedFilesException
+
+Thrown if the user has tried to monitor a file that isn't in the list of produced files in the :ref:`assessmentyaml`.
+
+Functions
+*********
+
+.. autofunction:: reflect.gen_reflection_report
+
+Generates a json file report. It is quite a complex structure, but it is made so users can add other rendering templates
+later on. For example, the report for a submission from the :ref:`quickstart` looks like this:
+
+.. literalinclude:: _static/simple.json
+    :linenos:
+    :language: json
+\ No newline at end of file