 database.py | 140
 search.py   |  85
 terms.py    |   3
 tf_idf.py   |   1
 4 files changed, 172 insertions(+), 57 deletions(-)
diff --git a/database.py b/database.py
index 8fc3584..4534097 100644
--- a/database.py
+++ b/database.py
@@ -2,6 +2,7 @@
from dataclasses import dataclass
import math as maths
import sqlite3
import tf_idf
+import nltk
import os
class DatabaseCursor(sqlite3.Cursor):
@@ -102,6 +103,16 @@ class Database:
out[term] = cursor.fetchone()[0]
return out
+ def get_linked_vocabulary_ids(self, linked_term):
+ ids = []
+ print(linked_term)
+ for term in linked_term:
+ ids.append(str(self.get_vocabulary_ids([term])[term]))
+ print(",".join(ids))
+
+ # NOTE: unfinished in the source; the query below is truncated
+ with self.__connection.cursor(factory = DatabaseCursor) as cursor:
+ cursor.execute("SELECT `vocabular")
+
def append_merged_terms(self, merged_terms):
with self.__connection.cursor(factory = DatabaseCursor) as cursor:
cursor.executemany("INSERT OR IGNORE INTO `linked_terms`(`chain`) VALUES (?);", [(i, ) for i in merged_terms])
@@ -128,30 +139,53 @@ class Database:
def build_tf_idf_table(self):
with self.__connection.cursor(factory = DatabaseCursor) as cursor:
- # cursor.execute("DROP VIEW IF EXISTS `tf_idf`;")
- # cursor.execute("DROP VIEW IF EXISTS `vocab_count`;")
+ cursor.execute("DROP VIEW IF EXISTS `tf_idf_linked`;")
+ cursor.execute("DROP VIEW IF EXISTS `tf_idf_singles`;")
+ cursor.execute("DROP VIEW IF EXISTS `vocab_count_singles`;")
+ cursor.execute("DROP VIEW IF EXISTS `vocab_count_linked`;")
# ideally these would be a single view covering both term types, but I
# could not find a way to express that in SQL
cursor.execute("""
- CREATE VIEW IF NOT EXISTS `vocab_count` AS
+ CREATE VIEW IF NOT EXISTS `vocab_count_singles` AS
SELECT vocabulary_id,
COUNT(vocabulary_id) AS vocabulary_count
FROM term_weights
GROUP BY vocabulary_id;
""")
cursor.execute("""
- CREATE VIEW IF NOT EXISTS `tf_idf` AS SELECT
+ CREATE VIEW IF NOT EXISTS `tf_idf_singles` AS SELECT
`term_weights`.`vocabulary_id` AS `vocabulary_id`,
`document_id`,
`term_weights`.`weight`,
LOG_TF(`weight`) AS tf,
(SELECT COUNT(`document_id`) FROM `documents`) AS n,
- `vocab_count`.`vocabulary_count` AS df,
- (SELECT LOG(CAST(COUNT(`document_id`) AS REAL) / `vocab_count`.`vocabulary_count`) FROM documents) AS idf,
- LOG_TF(`weight`) * (SELECT LOG(CAST(COUNT(`document_id`) AS REAL) / `vocab_count`.`vocabulary_count`) FROM documents) AS tf_idf
+ `vocab_count_singles`.`vocabulary_count` AS df,
+ (SELECT LOG(CAST(COUNT(`document_id`) AS REAL) / `vocab_count_singles`.`vocabulary_count`) FROM documents) AS idf,
+ LOG_TF(`weight`) * (SELECT LOG(CAST(COUNT(`document_id`) AS REAL) / `vocab_count_singles`.`vocabulary_count`) FROM documents) AS tf_idf
+ FROM `term_weights`
+ INNER JOIN `vocab_count_singles`
+ ON `vocab_count_singles`.`vocabulary_id` = `term_weights`.`vocabulary_id`
+ ;""")
+ cursor.execute("""
+ CREATE VIEW IF NOT EXISTS `vocab_count_linked` AS
+ SELECT linked_term_id,
+ COUNT(linked_term_id) AS linked_term_count
+ FROM term_weights
+ GROUP BY linked_term_id;
+ """)
+ cursor.execute("""
+ CREATE VIEW IF NOT EXISTS `tf_idf_linked` AS SELECT
+ `term_weights`.`linked_term_id` AS `linked_term_id`,
+ `document_id`,
+ `term_weights`.`weight`,
+ LOG_TF(`weight`) AS tf,
+ (SELECT COUNT(`document_id`) FROM `documents`) AS n,
+ `vocab_count_linked`.`linked_term_count` AS df,
+ (SELECT LOG(CAST(COUNT(`document_id`) AS REAL) / `vocab_count_linked`.`linked_term_count`) FROM documents) AS idf,
+ LOG_TF(`weight`) * (SELECT LOG(CAST(COUNT(`document_id`) AS REAL) / `vocab_count_linked`.`linked_term_count`) FROM documents) AS tf_idf
FROM `term_weights`
- INNER JOIN `vocab_count`
- ON `vocab_count`.`vocabulary_id` = `term_weights`.`vocabulary_id`
+ INNER JOIN `vocab_count_linked`
+ ON `vocab_count_linked`.`linked_term_id` = `term_weights`.`linked_term_id`
;""")
def get_term_frequencies(self):
@@ -159,26 +193,6 @@ class Database:
cursor.execute("SELECT * FROM `term_weights`;")
return cursor.fetchall()
- def append_tf_idf_table(self, tfs):
- # print([(i[0], i[1], i[2], i[0], i[0], i[2]) for i in tfs][1])
- with self.__connection.cursor(factory = DatabaseCursor) as cursor:
- cursor.executemany("""
- INSERT INTO `tf_idf`(`vocabulary_id`, `document_id`, `tf`, `idf`, `tf_idf`)
- VALUES (
- ?, ?, ?,
- (SELECT log((SELECT CAST(COUNT(*) as REAL) FROM documents) / COUNT(*)) FROM term_weights WHERE vocabulary_id = ?),
- (SELECT log((SELECT CAST(COUNT(*) as REAL) FROM documents) / COUNT(*)) FROM term_weights WHERE vocabulary_id = ?) * ?)
- """, [(i[0], i[1], i[2], i[0], i[0], i[2]) for i in tfs])
-
- def append_tf_idf_table_single(self, vocabulary_id, document_id, tf):
- with self.__connection.cursor(factory = DatabaseCursor) as cursor:
- cursor.execute("""
- INSERT INTO `tf_idf`(`vocabulary_id`, `document_id`, `tf`, `idf`, `tf_idf`)
- VALUES (
- ?, ?, ?,
- (SELECT log((SELECT CAST(COUNT(*) as REAL) FROM documents) / COUNT(*)) FROM term_weights WHERE vocabulary_id = ?),
- (SELECT log((SELECT CAST(COUNT(*) as REAL) FROM documents) / COUNT(*)) FROM term_weights WHERE vocabulary_id = ?) * ?)
- """, (vocabulary_id, document_id, tf, vocabulary_id, vocabulary_id, tf))
def test_log(self, to_log):
with self.__connection.cursor(factory = DatabaseCursor) as cursor:
@@ -192,21 +206,79 @@ class Database:
def get_tf_idf_table(self):
with self.__connection.cursor(factory = DatabaseCursor) as cursor:
- cursor.execute("SELECT * FROM `tf_idf` LIMIT 100;")
+ cursor.execute("SELECT * FROM `tf_idf_singles` LIMIT 100;")
out = cursor.fetchall()
print(len(out))
print(("vocabulary_id", "document_id", "weight", "tf", "n", "df", "idf"))
for l in out[:100]:
print(l)
- def get_tf_idf_score(self, term, tf_idf_thresh = 0, limit = 1000):
+ def get_tf_idf_table2(self):
+ with self.__connection.cursor(factory = DatabaseCursor) as cursor:
+ cursor.execute("SELECT * FROM `tf_idf_linked` LIMIT 100;")
+ out = cursor.fetchall()
+ for l in out[:100]:
+ print(l)
+
+ def get_tf_idf_score_single(self, term, tf_idf_thresh = 0, limit = 1000, multiplier = 1):
with self.__connection.cursor(factory = DatabaseCursor) as cursor:
cursor.execute("""
- SELECT `document_id`, `tf_idf` FROM `tf_idf` WHERE `vocabulary_id` = (
+ SELECT `document_id`, `tf_idf` FROM `tf_idf_singles` WHERE `vocabulary_id` = (
SELECT `vocabulary_id` FROM `vocabulary` WHERE `term` = ?
) AND `tf_idf` > ? ORDER BY `tf_idf` DESC LIMIT ?;
""", (term, tf_idf_thresh, limit))
- return {i[0]: i[1] for i in cursor.fetchall()}
+ return {i[0]: maths.log10(i[1] * multiplier) for i in cursor.fetchall()}
+
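
Note that get_tf_idf_score_single returns log10(tf_idf * multiplier) rather than the raw view value, so repeating a query term raises a document's score by a constant logarithmic step instead of linearly. A toy check of that arithmetic:

    import math

    tf_idf = 4.0
    for multiplier in (1, 2, 4):
        # each doubling of the multiplier adds log10(2) ~= 0.301 to the score
        print(multiplier, round(math.log10(tf_idf * multiplier), 3))
    # prints: 1 0.602 / 2 0.903 / 4 1.204
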
+ def get_tf_idf_score_linked(self, linked_term, tf_idf_thresh = 0, limit = 1000, multiplier = 1):
+ word_ids = []
+ with self.__connection.cursor(factory = DatabaseCursor) as cursor:
+ for word in linked_term:
+ cursor.execute("SELECT `vocabulary_id` FROM `vocabulary` WHERE `term` = ?", (word, ))
+ try:
+ word_ids.append(str(cursor.fetchone()[0]))
+ except TypeError:
+ continue
+
+ chain = ",".join(word_ids)
+
+ cursor.execute("""
+ SELECT `document_id`, `tf_idf` FROM `tf_idf_linked` WHERE `linked_term_id` = (
+ SELECT `linked_term_id` FROM `linked_terms` WHERE `chain` = ?
+ ) AND `tf_idf` > ? ORDER BY `tf_idf` DESC LIMIT ?;
+ """, (chain, tf_idf_thresh, limit))
+ return {i[0]: maths.log10(i[1] * multiplier) for i in cursor.fetchall()}
+
+ def attempt_get_linked_words(self, words):
+ word_ids = []
+ with self.__connection.cursor(factory = DatabaseCursor) as cursor:
+ for word in words:
+ cursor.execute("SELECT `vocabulary_id` FROM `vocabulary` WHERE `term` = ?", (word, ))
+ try:
+ word_ids.append(str(cursor.fetchone()[0]))
+ except TypeError:
+ continue
+
+ combinations = [",".join(word_ids)]
+ for n in range(2, len(word_ids)):
+ combinations += [",".join(i) for i in nltk.ngrams(word_ids, n)]
+ # print(combinations)
+
+ if len(combinations) == 0:
+ return []
+ elif len(combinations) == 1:
+ sql = "SELECT `chain` FROM `linked_terms` WHERE `chain` = '%s';" % combinations[0]
+ else:
+ sql = "SELECT `chain` FROM `linked_terms` WHERE `chain` IN %s;" % str(tuple(combinations))
+ # print(sql)
+ cursor.execute(sql)
+ found_ids = [i[0].split(",") for i in cursor.fetchall()]
+
+ return [[self.get_term_by_id(j) for j in i] for i in found_ids]
+
+ def get_term_by_id(self, id_):
+ with self.__connection.cursor(factory = DatabaseCursor) as cursor:
+ cursor.execute("SELECT `term` FROM `vocabulary` WHERE `vocabulary_id` = ?;", (id_, ))
+ return cursor.fetchone()[0]
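
attempt_get_linked_words probes the linked_terms table with every contiguous run of the query's vocabulary IDs. A short illustration of how the candidate chains are built with nltk.ngrams (the IDs below are made up for the example):

    import nltk

    word_ids = ["12", "7", "431"]  # hypothetical vocabulary IDs for a 3-word query

    combinations = [",".join(word_ids)]        # the full chain first...
    for n in range(2, len(word_ids)):          # ...then every shorter contiguous n-gram
        combinations += [",".join(gram) for gram in nltk.ngrams(word_ids, n)]

    print(combinations)  # ['12,7,431', '12,7', '7,431']
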
if __name__ == "__main__":
with Database() as db:
@@ -218,4 +290,4 @@ if __name__ == "__main__":
# db.get_tf_idf_table()
#for i, v in db.get_tf_idf_score("enzyme", 1).items():
# print(i, v)
- print(db.get_max_linked_terms())
+ print(db.attempt_get_linked_words(["a", "computer", "science"]))
diff --git a/search.py b/search.py
--- a/search.py
+++ b/search.py
@@ -1,6 +1,11 @@
+from nltk.corpus import wordnet
+from nltk import pos_tag
+import collections
+import itertools
import database
import logging
import terms
+import time
import sys
import re
@@ -12,32 +17,68 @@ logging.basicConfig(
logging.StreamHandler()
])
+WORDNET_POS_MAP = {
+ 'NN': wordnet.NOUN,
+ 'NNS': wordnet.NOUN,
+ 'NNP': wordnet.NOUN,
+ 'NNPS': wordnet.NOUN,
+ 'JJ': [wordnet.ADJ, wordnet.ADJ_SAT],
+ 'JJS': [wordnet.ADJ, wordnet.ADJ_SAT],
+ 'RB': wordnet.ADV,
+ 'RBR': wordnet.ADV,
+ 'RBS': wordnet.ADV,
+ 'RP': [wordnet.ADJ, wordnet.ADJ_SAT],
+ 'VB': wordnet.VERB,
+}
+
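
WORDNET_POS_MAP translates Penn Treebank tags from pos_tag into WordNet POS constants so synonym lookup stays within the right word class (NLTK's synsets iterates over its pos argument, so a single constant and a list both work). A minimal sketch of that lookup, assuming the NLTK tagger and wordnet corpora are downloaded:

    from nltk.corpus import wordnet
    from nltk import pos_tag

    # trimmed copy of the map above, enough for this demo
    pos_map = {"NN": wordnet.NOUN, "JJ": [wordnet.ADJ, wordnet.ADJ_SAT], "RB": wordnet.ADV}

    for token, tag in pos_tag(["fast", "computer"]):
        pos = pos_map.get(tag)
        if pos is None:
            continue
        lemmas = {l.name() for s in wordnet.synsets(token, pos) for l in s.lemmas()}
        print(token, tag, sorted(lemmas)[:5])
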
def main(search_words):
-
- txt = [re.sub(r"[^a-zA-Z\s]", "", i).rstrip().lower() for i in search_words]
-
- search_words = []
- for i in txt:
- search_words += re.split(r"\s+|\n", i)
-
- search_words = [terms.LEM.lemmatize(i) for i in search_words if i != "" and i not in terms.STOPWORDS]
- logging.info("Started searching. Using terms: %s" % " ".join(search_words))
+ starttime = time.time()
+ pos_tags = [(token, tag) for token, tag in pos_tag(search_words) if token.lower().replace(",", "") not in terms.STOPWORDS]
+
+ single_terms = [w.lower() for w in search_words]
+ logging.info("Started with the terms: %s" % str(single_terms))
+ with database.Database() as db:
+ l = db.attempt_get_linked_words(single_terms)
+ linked_terms = collections.Counter([",".join(i) for i in l])
+ # do again so we get a weight of 2
+ linked_terms += collections.Counter([",".join(i) for i in l])
+ logging.info("Found the linked terms: %s" % str(l))
+
+ synsets = [wordnet.synsets(token, WORDNET_POS_MAP[tag]) for token, tag in pos_tags if tag in WORDNET_POS_MAP]
+ synonyms = list(itertools.chain.from_iterable([[lemma.name().lower().replace("_", ",") for syn in synset for lemma in syn.lemmas()] for synset in synsets]))
+
+ # for syn in synsets:
+ # for sy in syn:
+ # print([w for s in sy.closure(lambda s:s.hyponyms()) for w in s.lemma_names()])
+
+ for synonym in synonyms:
+ if len(synonym.split(",")) > 1:
+ linked_terms[synonym] = 1
+ else:
+ single_terms.append(synonym)
+
+ single_terms = collections.Counter(single_terms)
+
+ logging.info("Expanded single terms to: %s" % str(single_terms))
+ logging.info("Expanded linked terms to: %s" % str(linked_terms))
+ logging.info("\n\n")
with database.Database() as db:
- tf_idf_scores = []
- for term in search_words:
- tf_idf_scores.append(db.get_tf_idf_score(term, tf_idf_thresh = 1, limit = 1000))
- logging.info("Fetched %d scores for term '%s'..." % (len(tf_idf_scores[-1]), term))
-
- merged_scores = {i: 0 for i in range(1, db.get_num_documents() + 1)}
- for scorelist in tf_idf_scores:
- for doc_id, score in scorelist.items():
- merged_scores[doc_id] += score
- logging.info("Merged scores...")
-
- sorted_scores = list(reversed(sorted(merged_scores.items(), key = lambda i: i[1])))
+ tf_idf_scores = collections.Counter()
+ for single_term, search_weight in single_terms.items():
+ scores = collections.Counter(db.get_tf_idf_score_single(single_term, tf_idf_thresh = 1, limit = 1000, multiplier = search_weight))
+ logging.info("Got %d scores for term '%s' (multiplier %d)" % (len(scores), single_term, search_weight))
+ tf_idf_scores += scores
+
+ for linked_term, search_weight in linked_terms.items():
+ scores = db.get_tf_idf_score_linked(linked_term.split(","), tf_idf_thresh=0, multiplier=search_weight)
+ logging.info("Got %d scores for linked term '%s' (multiplier %d)" % (len(scores), str(linked_term), search_weight))
+ tf_idf_scores += scores
+
+ sorted_scores = list(reversed(sorted(tf_idf_scores.items(), key = lambda i: i[1])))
toshow = 30
logging.info("Sorted scores...")
+ logging.info("Results:\n\n")
for i, j in enumerate(sorted_scores, 0):
if i >= toshow:
@@ -46,7 +87,7 @@ def main(search_words):
docid, score = j
logging.info("%.2f - %d - %s" % (score, docid, db.get_document_name_by_id(docid)))
- logging.info("%d results found in total" % len([i[1] for i in sorted_scores if i[1] > 0.1]))
+ logging.info("Got %d results in total. Took %.2f minutes (%.2fs per term)" % (len(tf_idf_scores), (time.time() - starttime) / 60, (time.time() - starttime) / (len(single_terms) + len(linked_terms))))
if __name__ == "__main__":
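
The scoring loop above merges the per-term {document_id: score} maps by Counter addition, so a document's final score is the sum of its scores across all expanded query terms. A small sketch of that merge with made-up IDs and scores:

    import collections

    # hypothetical {document_id: score} results for two query terms
    scores_a = collections.Counter({1: 2.5, 2: 1.0})
    scores_b = collections.Counter({2: 0.5, 3: 3.0})

    merged = collections.Counter()
    merged += scores_a   # Counter.__iadd__ sums scores per document
    merged += scores_b
    print(merged.most_common())  # [(3, 3.0), (1, 2.5), (2, 1.5)]
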
diff --git a/terms.py b/terms.py
--- a/terms.py
+++ b/terms.py
@@ -41,7 +41,7 @@ def main():
#break

def parse_region(raw_text, region_weight, document_id):
- print("d: %d; w: %d; len = %d" % (document_id, region_weight, len(raw_text)))
+ print("d: %d; w: %d; len: %d" % (document_id, region_weight, len(raw_text)), end = "")
terms = word_tokenize(raw_text)
terms = [re.sub(r"[^a-zA-Z0-9\s]", "", term).rstrip().lower() for term in terms]
terms = [LEM.lemmatize(i) for i in terms if i != "" and i not in STOPWORDS]
@@ -71,6 +71,7 @@ def append_region(terms, linked_words, region_weight, document_id):
weighted_terms = {i[0]:i[1] * region_weight for i in collections.Counter(terms).items()}
weighted_linked_terms = {i[0]:i[1] * region_weight for i in collections.Counter(linked_words_ids).items()}
+ print("; t: %d; lt: %d" % (len(weighted_terms), len(weighted_linked_terms)))

return weighted_terms, weighted_linked_terms

def parse_document(document_id, document_path, numdocs):
diff --git a/tf_idf.py b/tf_idf.py
--- a/tf_idf.py
+++ b/tf_idf.py
@@ -6,6 +6,7 @@ def main():
db.build_tf_idf_table()
db.get_tf_idf_table()
+ db.get_tf_idf_table2()
def calc_log_tf(tf):
if tf == 0: