
Commit 3abcb9f

Witiko and marcelo-dalmeida authored and committed
Refactor bm25 to include model parametrization (cont.) (#2722)
* Refactor bm25 to include model parametrization
* Refactor constants back and fix typo
* Refactor parameters order and description
* Add BM25 tests. This closes #2597 and closes #2606
* Simplify asserts in BM25 tests
* Refactor BM25.get_score

Co-authored-by: Marcelo d'Almeida <[email protected]>
1 parent 3d129de commit 3abcb9f

File tree

2 files changed: +158 -12 lines


gensim/summarization/bm25.py

Lines changed: 69 additions & 11 deletions
@@ -69,14 +69,36 @@ class BM25(object):
         List of document lengths.

     """

-    def __init__(self, corpus):
+    def __init__(self, corpus, k1=PARAM_K1, b=PARAM_B, epsilon=EPSILON):
         """
         Parameters
         ----------
         corpus : list of list of str
             Given corpus.
+        k1 : float
+            Constant used for influencing the term frequency saturation. After saturation is reached, additional
+            presence for the term adds significantly less additional score. According to [1]_, experiments suggest
+            that 1.2 < k1 < 2 yields reasonably good results, although the optimal value depends on factors such as
+            the type of documents or queries.
+        b : float
+            Constant used for influencing the effects of different document lengths relative to average document length.
+            When b is bigger, lengthier documents (compared to average) have more impact on the score. According to
+            [1]_, experiments suggest that 0.5 < b < 0.8 yields reasonably good results, although the optimal value
+            depends on factors such as the type of documents or queries.
+        epsilon : float
+            Constant used as floor value for idf of a term in the corpus. When epsilon is positive, it restricts
+            negative idf values. Negative idf implies that adding a very common term to a document penalizes the overall
+            score (with 'very common' meaning that it is present in more than half of the documents). That can be
+            undesirable as it means that an identical document would score less than an almost identical one (by
+            removing the referred term). Increasing epsilon above 0 raises the sense of how rare a word has to be (among
+            different documents) to receive an extra score.

         """
+
+        self.k1 = k1
+        self.b = b
+        self.epsilon = epsilon
+
         self.corpus_size = 0
         self.avgdl = 0
         self.doc_freqs = []
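With this change the BM25 free parameters become per-instance attributes instead of module constants. A minimal usage sketch, not part of the diff: it reuses gensim.test.utils.common_texts (the same toy corpus the new tests use) and arbitrary parameter values.

    from gensim.summarization.bm25 import BM25
    from gensim.test.utils import common_texts

    # k1, b and epsilon fall back to PARAM_K1, PARAM_B and EPSILON when omitted.
    model = BM25(common_texts, k1=1.5, b=0.75, epsilon=0.25)
    print(model.get_score(common_texts[0], index=0))  # BM25 score of document 0 against itself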
@@ -126,7 +148,7 @@ def _initialize(self, corpus):
                 ' unintuitive results.'.format(self.corpus_size)
             )

-        eps = EPSILON * self.average_idf
+        eps = self.epsilon * self.average_idf
         for word in negative_idfs:
             self.idf[word] = eps
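For intuition on the epsilon floor: assuming the probabilistic idf that _initialize computes earlier in this file (not shown in this hunk), idf(w) = log(corpus_size - freq(w) + 0.5) - log(freq(w) + 0.5), a term that occurs in more than half of the documents gets a negative idf, which is then replaced by eps. A minimal sketch under that assumption:

    import math

    corpus_size = 3
    freq = 3  # a term that occurs in every one of the 3 documents
    idf = math.log(corpus_size - freq + 0.5) - math.log(freq + 0.5)
    print(round(idf, 3))  # -1.946: negative, because the term is very common
    # _initialize replaces such negative idfs with eps = self.epsilon * self.average_idf,
    # so epsilon controls how much these very common terms still contribute to the score.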

@@ -146,13 +168,15 @@ def get_score(self, document, index):
             BM25 score.

         """
-        score = 0
+        score = 0.0
         doc_freqs = self.doc_freqs[index]
+        numerator_constant = self.k1 + 1
+        denominator_constant = self.k1 * (1 - self.b + self.b * self.doc_len[index] / self.avgdl)
         for word in document:
-            if word not in doc_freqs:
-                continue
-            score += (self.idf[word] * doc_freqs[word] * (PARAM_K1 + 1)
-                      / (doc_freqs[word] + PARAM_K1 * (1 - PARAM_B + PARAM_B * self.doc_len[index] / self.avgdl)))
+            if word in doc_freqs:
+                df = self.doc_freqs[index][word]
+                idf = self.idf[word]
+                score += (idf * df * numerator_constant) / (df + denominator_constant)
         return score

     def get_scores(self, document):
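For reference, the weight accumulated per term by the rewritten loop is the usual Okapi BM25 term weight, now driven by the instance-level k1 and b instead of the module constants; restated in math, with f(q, D) = doc_freqs[word], |D| = doc_len[index] and avgdl the average document length:

    \mathrm{score}(D, Q) = \sum_{q \in Q} \mathrm{idf}(q) \cdot \frac{f(q, D)\,(k_1 + 1)}{f(q, D) + k_1 \left(1 - b + b \cdot \frac{|D|}{\mathrm{avgdl}}\right)}

Terms absent from the document contribute zero, which is why the loop can simply skip words not in doc_freqs.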
@@ -236,7 +260,7 @@ def _get_scores(bm25, document):
     return bm25.get_scores(document)


-def iter_bm25_bow(corpus, n_jobs=1):
+def iter_bm25_bow(corpus, n_jobs=1, k1=PARAM_K1, b=PARAM_B, epsilon=EPSILON):
     """Yield BM25 scores (weights) of documents in corpus.
     Each document has to be weighted with every document in given corpus.

@@ -246,6 +270,23 @@ def iter_bm25_bow(corpus, n_jobs=1):
         Corpus of documents.
     n_jobs : int
         The number of processes to use for computing bm25.
+    k1 : float
+        Constant used for influencing the term frequency saturation. After saturation is reached, additional
+        presence for the term adds significantly less additional score. According to [1]_, experiments suggest
+        that 1.2 < k1 < 2 yields reasonably good results, although the optimal value depends on factors such as
+        the type of documents or queries.
+    b : float
+        Constant used for influencing the effects of different document lengths relative to average document length.
+        When b is bigger, lengthier documents (compared to average) have more impact on the score. According to
+        [1]_, experiments suggest that 0.5 < b < 0.8 yields reasonably good results, although the optimal value
+        depends on factors such as the type of documents or queries.
+    epsilon : float
+        Constant used as floor value for idf of a term in the corpus. When epsilon is positive, it restricts
+        negative idf values. Negative idf implies that adding a very common term to a document penalizes the overall
+        score (with 'very common' meaning that it is present in more than half of the documents). That can be
+        undesirable as it means that an identical document would score less than an almost identical one (by
+        removing the referred term). Increasing epsilon above 0 raises the sense of how rare a word has to be (among
+        different documents) to receive an extra score.

     Yields
     -------
@@ -265,7 +306,7 @@ def iter_bm25_bow(corpus, n_jobs=1):
    >>> result = iter_bm25_weights(corpus, n_jobs=-1)

    """
-    bm25 = BM25(corpus)
+    bm25 = BM25(corpus, k1, b, epsilon)

    n_processes = effective_n_jobs(n_jobs)
    if n_processes == 1:
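Since the parameters are now threaded through, callers can tune the model without constructing BM25 themselves. A small sketch, not from the diff, with arbitrary parameter values; per the docstring and the new tests, each yielded item pairs document indices with BM25 weights:

    from gensim.summarization.bm25 import iter_bm25_bow
    from gensim.test.utils import common_texts

    for bow in iter_bm25_bow(common_texts, n_jobs=1, k1=2.0, b=0.5, epsilon=0.1):
        print(bow)  # weights of one document against the whole corpus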
@@ -282,7 +323,7 @@
         pool.join()


-def get_bm25_weights(corpus, n_jobs=1):
+def get_bm25_weights(corpus, n_jobs=1, k1=PARAM_K1, b=PARAM_B, epsilon=EPSILON):
     """Returns BM25 scores (weights) of documents in corpus.
     Each document has to be weighted with every document in given corpus.

@@ -292,6 +333,23 @@
         Corpus of documents.
     n_jobs : int
         The number of processes to use for computing bm25.
+    k1 : float
+        Constant used for influencing the term frequency saturation. After saturation is reached, additional
+        presence for the term adds significantly less additional score. According to [1]_, experiments suggest
+        that 1.2 < k1 < 2 yields reasonably good results, although the optimal value depends on factors such as
+        the type of documents or queries.
+    b : float
+        Constant used for influencing the effects of different document lengths relative to average document length.
+        When b is bigger, lengthier documents (compared to average) have more impact on the score. According to
+        [1]_, experiments suggest that 0.5 < b < 0.8 yields reasonably good results, although the optimal value
+        depends on factors such as the type of documents or queries.
+    epsilon : float
+        Constant used as floor value for idf of a term in the corpus. When epsilon is positive, it restricts
+        negative idf values. Negative idf implies that adding a very common term to a document penalizes the overall
+        score (with 'very common' meaning that it is present in more than half of the documents). That can be
+        undesirable as it means that an identical document would score less than an almost identical one (by
+        removing the referred term). Increasing epsilon above 0 raises the sense of how rare a word has to be (among
+        different documents) to receive an extra score.

     Returns
     -------
@@ -311,7 +369,7 @@ def get_bm25_weights(corpus, n_jobs=1):
    >>> result = get_bm25_weights(corpus, n_jobs=-1)

    """
-    bm25 = BM25(corpus)
+    bm25 = BM25(corpus, k1, b, epsilon)

    n_processes = effective_n_jobs(n_jobs)
    if n_processes == 1:
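The same applies to get_bm25_weights, which returns all weights at once instead of yielding them. Another short sketch with arbitrary parameter values:

    from gensim.summarization.bm25 import get_bm25_weights
    from gensim.test.utils import common_texts

    weights = get_bm25_weights(common_texts, n_jobs=1, k1=1.2, b=0.6, epsilon=0.25)
    print(weights[0])  # BM25 weights of document 0 against every document in the corpus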

gensim/test/test_BM25.py

Lines changed: 89 additions & 1 deletion
@@ -11,7 +11,7 @@
 import logging
 import unittest

-from gensim.summarization.bm25 import get_bm25_weights
+from gensim.summarization.bm25 import get_bm25_weights, iter_bm25_bow, BM25
 from gensim.test.utils import common_texts


@@ -62,6 +62,94 @@ def test_multiprocessing(self):
         self.assertAlmostEqual(weights1, weights3)
         self.assertAlmostEqual(weights2, weights3)

+    def test_k1(self):
+        """ Changing the k1 parameter should give consistent results """
+        corpus = common_texts
+        index = 0
+        doc = corpus[index]
+        first_k1 = 1.0
+        second_k1 = 2.0
+
+        first_bm25 = BM25(corpus, k1=first_k1)
+        second_bm25 = BM25(corpus, k1=second_k1)
+        first_score = first_bm25.get_score(doc, index)
+        second_score = second_bm25.get_score(doc, index)
+        self.assertLess(first_score, second_score)
+
+        first_iter = iter_bm25_bow(corpus, k1=first_k1)
+        second_iter = iter_bm25_bow(corpus, k1=second_k1)
+        first_score = dict(next(iter(first_iter)))[index]
+        second_score = dict(next(iter(second_iter)))[index]
+        self.assertLess(first_score, second_score)
+
+        first_weights = get_bm25_weights(corpus, k1=first_k1)
+        second_weights = get_bm25_weights(corpus, k1=second_k1)
+        first_score = first_weights[index]
+        second_score = second_weights[index]
+        self.assertLess(first_score, second_score)
+
+    def test_b(self):
+        """ Changing the b parameter should give consistent results """
+        corpus = common_texts
+        index = 0
+        doc = corpus[index]
+        first_b = 1.0
+        second_b = 2.0
+
+        first_bm25 = BM25(corpus, b=first_b)
+        second_bm25 = BM25(corpus, b=second_b)
+        first_score = first_bm25.get_score(doc, index)
+        second_score = second_bm25.get_score(doc, index)
+        self.assertLess(first_score, second_score)
+
+        first_iter = iter_bm25_bow(corpus, b=first_b)
+        second_iter = iter_bm25_bow(corpus, b=second_b)
+        first_score = dict(next(iter(first_iter)))[index]
+        second_score = dict(next(iter(second_iter)))[index]
+        self.assertLess(first_score, second_score)
+
+        first_weights = get_bm25_weights(corpus, b=first_b)
+        second_weights = get_bm25_weights(corpus, b=second_b)
+        first_score = first_weights[index]
+        second_score = second_weights[index]
+        self.assertLess(first_score, second_score)
+
+    def test_epsilon(self):
+        """ Changing the epsilon parameter should give consistent results """
+        corpus = [['cat', 'dog', 'mouse'], ['cat', 'lion'], ['cat', 'lion']]
+        first_epsilon = 1.0
+        second_epsilon = 2.0
+        bm25 = BM25(corpus)
+        words_with_negative_idfs = set([
+            word
+            for word, idf in bm25.idf.items()
+            if idf < 0
+        ])
+        index, doc = [
+            (index, document)
+            for index, document
+            in enumerate(corpus)
+            if words_with_negative_idfs & set(document)
+        ][0]
+
+        first_bm25 = BM25(corpus, epsilon=first_epsilon)
+        second_bm25 = BM25(corpus, epsilon=second_epsilon)
+        first_score = first_bm25.get_score(doc, index)
+        second_score = second_bm25.get_score(doc, index)
+        self.assertGreater(first_score, second_score)
+
+        first_iter = iter_bm25_bow(corpus, epsilon=first_epsilon)
+        second_iter = iter_bm25_bow(corpus, epsilon=second_epsilon)
+        first_score = dict(next(iter(first_iter)))[index]
+        second_score = dict(next(iter(second_iter)))[index]
+        self.assertGreater(first_score, second_score)
+
+        first_weights = get_bm25_weights(corpus, epsilon=first_epsilon)
+        second_weights = get_bm25_weights(corpus, epsilon=second_epsilon)
+        first_score = first_weights[index]
+        second_score = second_weights[index]
+        self.assertGreater(first_score, second_score)
+

 if __name__ == '__main__':
     logging.basicConfig(level=logging.DEBUG)
