From 1f5dec8047af8c58ce3acb5014d82caf7e6766df Mon Sep 17 00:00:00 2001
From: jwansek
Date: Fri, 26 Nov 2021 17:57:07 +0000
Subject: split large texts up into more manageable chunks

---
 database.py | 10 ++++++++--
 terms.py    | 15 ++++++++++-----
 2 files changed, 18 insertions(+), 7 deletions(-)

diff --git a/database.py b/database.py
index 5c326b4..8fc3584 100644
--- a/database.py
+++ b/database.py
@@ -84,6 +84,11 @@ class Database:
             cursor.execute("SELECT COUNT(*) FROM documents;")
             return cursor.fetchone()[0]
 
+    def get_max_linked_terms(self):
+        with self.__connection.cursor(factory = DatabaseCursor) as cursor:
+            cursor.execute("SELECT MAX(`document_id`) + 2 FROM term_weights;")
+            return cursor.fetchone()[0]
+
     def append_terms(self, terms):
         with self.__connection.cursor(factory = DatabaseCursor) as cursor:
             cursor.executemany("INSERT OR IGNORE INTO vocabulary(term) VALUES (?);", [(term, ) for term in terms])
@@ -211,5 +216,6 @@ if __name__ == "__main__":
     # print(db.test_log(100))
     # print(db.test_log(21))
     # db.get_tf_idf_table()
-    for i, v in db.get_tf_idf_score("enzyme", 1).items():
-        print(i, v)
\ No newline at end of file
+    #for i, v in db.get_tf_idf_score("enzyme", 1).items():
+    #    print(i, v)
+    print(db.get_max_linked_terms())
diff --git a/terms.py b/terms.py
index 74eafd9..07055dd 100644
--- a/terms.py
+++ b/terms.py
@@ -30,15 +30,18 @@ LEM = WordNetLemmatizer()
 
 def main():
     numdocs = documents.get_num_documents()
+    with database.Database() as db:
+        startat = db.get_max_linked_terms() - 1
 
     #docid = random.randint(1, numdocs)
     #parse_document(docid, documents.get_document_name_by_id(docid), numdocs)
 
-    for document_id in range(1, numdocs):
+    for document_id in range(startat, numdocs):
         parse_document(document_id, documents.get_document_name_by_id(document_id), numdocs)
         #break
 
 def parse_region(raw_text, region_weight, document_id):
+    print("d: %d; w: %d; len = %d" % (document_id, region_weight, len(raw_text)))
     terms = word_tokenize(raw_text)
     terms = [re.sub(r"[^a-zA-Z0-9\s]", "", term).rstrip().lower() for term in terms]
     terms = [LEM.lemmatize(i) for i in terms if i != "" and i not in STOPWORDS]
@@ -85,10 +88,12 @@ def parse_document(document_id, document_path, numdocs):
         weighted_linked_terms += region_linked_terms
 
     # parse the main text, it has a weight of 1
-    text = [e.text for e in soup.find("div", {"class": "mw-parser-output"}).findChildren(recursive = True)]
-    region_weighted_terms, region_linked_terms = parse_region(" ".join(text), 1, document_id)
-    weighted_terms += region_weighted_terms
-    weighted_linked_terms += region_linked_terms
+    text = " ".join([e.text for e in soup.find("div", {"class": "mw-parser-output"}).findChildren(recursive = True)])
+    # split large texts into more manageable chunks
+    for splittext in [text[i:i+99999] for i in range(0, len(text), 99999)]:
+        region_weighted_terms, region_linked_terms = parse_region(splittext, 1, document_id)
+        weighted_terms += region_weighted_terms
+        weighted_linked_terms += region_linked_terms
 
     # parse html headers
     elemtexts = []
-- 
cgit v1.2.3
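
For reference, a minimal sketch of the resume logic this patch adds, assuming an sqlite3 term_weights table in which only the document_id column matters. The query and the "- 1" adjustment mirror database.py and terms.py; the schema details and sample rows below are illustrative only:

import sqlite3

def get_max_linked_terms(connection):
    # MAX(document_id) + 2 over the already-indexed documents, as in database.py.
    cursor = connection.cursor()
    cursor.execute("SELECT MAX(document_id) + 2 FROM term_weights;")
    return cursor.fetchone()[0]

if __name__ == "__main__":
    connection = sqlite3.connect(":memory:")
    connection.execute("CREATE TABLE term_weights (document_id INTEGER, term TEXT, weight INTEGER);")
    connection.executemany(
        "INSERT INTO term_weights VALUES (?, ?, ?);",
        [(1, "enzyme", 1), (2, "protein", 1), (3, "cell", 4)],
    )
    # terms.py subtracts 1, so indexing resumes at MAX(document_id) + 1,
    # i.e. the first document that has not been parsed yet.
    startat = get_max_linked_terms(connection) - 1
    print(startat)  # 4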
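
And a minimal, self-contained sketch of the chunking pattern introduced in parse_document(): the flattened page text is cut into slices of at most 99999 characters before being handed to parse_region(). The parse_region() below is a hypothetical stand-in; the real one in terms.py tokenises, lemmatises and weights the terms:

CHUNK_SIZE = 99999  # same slice width as text[i:i+99999] in the patch

def chunk_text(text, chunk_size=CHUNK_SIZE):
    # Yield consecutive slices of text, each at most chunk_size characters long.
    for i in range(0, len(text), chunk_size):
        yield text[i:i + chunk_size]

def parse_region(raw_text, region_weight, document_id):
    # Stand-in that only reports what it was given, echoing the print() added by this patch.
    print("d: %d; w: %d; len = %d" % (document_id, region_weight, len(raw_text)))
    return [], []

if __name__ == "__main__":
    body_text = "lorem ipsum " * 20000  # ~240k characters, so several chunks
    weighted_terms, weighted_linked_terms = [], []
    for splittext in chunk_text(body_text):
        region_weighted_terms, region_linked_terms = parse_region(splittext, 1, 42)
        weighted_terms += region_weighted_terms
        weighted_linked_terms += region_linked_terms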