from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.util import ngrams
from nltk import pos_tag
import collections
import itertools
import documents
import database
import random
import nltk
import time
import bs4
import os
import re
import spacy
from spacy import displacy
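
# spaCy English pipeline, used below for named-entity recognition; max_length is
# raised so that long text chunks are not rejected by spaCy.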
nlp = spacy.load("en_core_web_sm")
nlp.max_length = 100000000

# NLTK's English stop words, minus a handful of question/relation words that are
# worth indexing, plus the single letters b-z. "a" is taken off the stop list so
# the article itself stays searchable.
STOPWORDS = set(stopwords.words('english')).difference({
    "how", "because", "through", "or", "as", "about", "not",
    "no", "who", "of", "can", "over", "you"
}).union({chr(i) for i in range(98, 123)})
STOPWORDS.remove("a")

LEM = WordNetLemmatizer()
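
# The local helper modules `documents` and `database` are assumed to provide the
# calls used below: documents.get_num_documents(), documents.get_document_name_by_id(),
# and database.Database() as a context manager exposing the get_*/append_* methods
# called in append_region() and parse_document().
# Index every document, resuming roughly where the previous run left off based on
# what is already stored in the database.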
def main():
    numdocs = documents.get_num_documents()
    with database.Database() as db:
        startat = db.get_max_linked_terms() - 1
    #docid = random.randint(1, numdocs)
    #parse_document(docid, documents.get_document_name_by_id(docid), numdocs)
    for document_id in range(startat, numdocs):
        parse_document(document_id, documents.get_document_name_by_id(document_id), numdocs)
        #break
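
# Tokenise one region of text: strip punctuation, lower-case, drop stop words and
# lemmatise. Multi-word named entities found by spaCy are collected separately as
# "linked" terms.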
def parse_region(raw_text, region_weight, document_id):
    print("d: %d; w: %d; len = %d" % (document_id, region_weight, len(raw_text)))
    terms = word_tokenize(raw_text)
    terms = [re.sub(r"[^a-zA-Z0-9\s]", "", term).rstrip().lower() for term in terms]
    terms = [LEM.lemmatize(i) for i in terms if i != "" and i not in STOPWORDS]
    processed = nlp(raw_text)
    linked_words = []
    for ent in processed.ents:
        words = [
            re.sub(r"[^a-zA-Z0-9\s]", "", word).rstrip().lower()
            for word in word_tokenize(ent.text)
            if re.sub(r"[^a-zA-Z0-9\s]", "", word).rstrip().lower() != ""
        ]
        if len(words) > 1:
            linked_words.append(words)
    return append_region(terms, linked_words, region_weight, document_id)
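
# Record the region's vocabulary and its merged (multi-word) terms in the
# database, then scale the raw term counts by the region weight.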
def append_region(terms, linked_words, region_weight, document_id):
    flattened_linked_words = set(itertools.chain.from_iterable(linked_words))
    with database.Database() as db:
        db.append_terms(flattened_linked_words.union(set(terms)))
        ids = db.get_vocabulary_ids(flattened_linked_words)
        # each multi-word entity becomes a comma-separated string of vocabulary ids, e.g. "12,345,6789"
        linked_words_ids = [str([ids[j] for j in i])[1:-1].replace(" ", "") for i in linked_words]
        db.append_merged_terms(linked_words_ids)
    weighted_terms = {term: count * region_weight for term, count in collections.Counter(terms).items()}
    weighted_linked_terms = {term: count * region_weight for term, count in collections.Counter(linked_words_ids).items()}
    return weighted_terms, weighted_linked_terms
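
# Parse one stored HTML document. Each region contributes terms with a different
# weight: file name 100, h1/h2 headers 50, link texts 10, body text 1.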
def parse_document(document_id, document_path, numdocs):
    starttime = time.time()
    with open(document_path, "r") as f:
        soup = bs4.BeautifulSoup(f.read(), "lxml")
    # Counter.__iadd__ accepts plain dicts, so the per-region weighted counts can
    # be accumulated with +=
    weighted_terms = collections.Counter()
    weighted_linked_terms = collections.Counter()
    # parse the file name, it has a weight of 100
    filenametext = os.path.splitext(os.path.split(document_path)[-1])[0]
    region_weighted_terms, region_linked_terms = parse_region(filenametext, 100, document_id)
    weighted_terms += region_weighted_terms
    weighted_linked_terms += region_linked_terms
    # parse the main text, it has a weight of 1
    text = " ".join([e.text for e in soup.find("div", {"class": "mw-parser-output"}).findChildren(recursive=True)])
    # split large texts into more manageable chunks
    for splittext in [text[i:i+99999] for i in range(0, len(text), 99999)]:
        region_weighted_terms, region_linked_terms = parse_region(splittext, 1, document_id)
        weighted_terms += region_weighted_terms
        weighted_linked_terms += region_linked_terms
    # parse the h1/h2 header texts, they have a weight of 50
    elemtexts = []
    try:
        elemtexts += [e.text for e in soup.h1.findChildren(recursive=True)]
    except AttributeError:
        pass
    try:
        elemtexts += [e.text for e in soup.h2.findChildren(recursive=True)]
    except AttributeError:
        pass
    region_weighted_terms, region_linked_terms = parse_region(re.sub(r"edit|Contents|source", "", " ".join(elemtexts)), 50, document_id)
    weighted_terms += region_weighted_terms
    weighted_linked_terms += region_linked_terms
    # parse the link element texts, they have a weight of 10
    a_texts = [e.text for e in soup.select("a") if e.text != "" and e.text != "edit" and e.text != "edit source"]
    region_weighted_terms, region_linked_terms = parse_region(" ".join(a_texts), 10, document_id)
    weighted_terms += region_weighted_terms
    weighted_linked_terms += region_linked_terms
    with database.Database() as db:
        db.append_document_term_weights(weighted_terms, document_id)
        db.append_document_linked_term_weights(weighted_linked_terms, document_id)
    print("[%.3f%%] {%.1fs} %d terms (weight %d), %d linked terms (weight %d) - %s" % (
        (document_id/numdocs)*100,
        time.time() - starttime,
        len(weighted_terms),
        sum(weighted_terms.values()),
        len(weighted_linked_terms),
        sum(weighted_linked_terms.values()),
        document_path
    ))

if __name__ == "__main__":
    main()