
Commit 3abcb9f

Witiko and marcelo-dalmeida authored and committed
Refactor bm25 to include model parametrization (cont.) (#2722)
* Refactor bm25 to include model parametrization
* Refactor constants back and fix typo
* Refactor parameters order and description
* Add BM25 tests. This closes #2597 and closes #2606
* Simplify asserts in BM25 tests
* Refactor BM25.get_score

Co-authored-by: Marcelo d'Almeida <[email protected]>
1 parent 3d129de commit 3abcb9f

File tree

2 files changed: +158 -12 lines


gensim/summarization/bm25.py

Lines changed: 69 additions & 11 deletions
@@ -69,14 +69,36 @@ class BM25(object):
         List of document lengths.

     """

-    def __init__(self, corpus):
+    def __init__(self, corpus, k1=PARAM_K1, b=PARAM_B, epsilon=EPSILON):
         """
         Parameters
         ----------
         corpus : list of list of str
             Given corpus.
+        k1 : float
+            Constant used for influencing the term frequency saturation. After saturation is reached, additional
+            presence for the term adds significantly less additional score. According to [1]_, experiments suggest
+            that 1.2 < k1 < 2 yields reasonably good results, although the optimal value depends on factors such as
+            the type of documents or queries.
+        b : float
+            Constant used for influencing the effects of different document lengths relative to average document length.
+            When b is bigger, lengthier documents (compared to average) have more impact on the score. According to
+            [1]_, experiments suggest that 0.5 < b < 0.8 yields reasonably good results, although the optimal value
+            depends on factors such as the type of documents or queries.
+        epsilon : float
+            Constant used as floor value for idf of a term in the corpus. When epsilon is positive, it restricts
+            negative idf values. Negative idf implies that adding a very common term to a document penalizes the overall
+            score (with 'very common' meaning that it is present in more than half of the documents). That can be
+            undesirable as it means that an identical document would score less than an almost identical one (by
+            removing the referred term). Increasing epsilon above 0 raises the sense of how rare a word has to be (among
+            different documents) to receive an extra score.

         """
+
+        self.k1 = k1
+        self.b = b
+        self.epsilon = epsilon
+
         self.corpus_size = 0
         self.avgdl = 0
         self.doc_freqs = []
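With this change the BM25 free parameters become per-instance attributes instead of module constants. A minimal usage sketch, not part of the diff: it reuses gensim.test.utils.common_texts (the same toy corpus the new tests use) and arbitrary parameter values.

    from gensim.summarization.bm25 import BM25
    from gensim.test.utils import common_texts

    # k1, b and epsilon fall back to PARAM_K1, PARAM_B and EPSILON when omitted.
    model = BM25(common_texts, k1=1.5, b=0.75, epsilon=0.25)
    print(model.get_score(common_texts[0], index=0))  # BM25 score of document 0 against itself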
@@ -126,7 +148,7 @@ def _initialize(self, corpus):
                 ' unintuitive results.'.format(self.corpus_size)
             )

-        eps = EPSILON * self.average_idf
+        eps = self.epsilon * self.average_idf
         for word in negative_idfs:
             self.idf[word] = eps
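For intuition on the epsilon floor: assuming the probabilistic idf that _initialize computes earlier in this file (not shown in this hunk), idf(w) = log(corpus_size - freq(w) + 0.5) - log(freq(w) + 0.5), a term that occurs in more than half of the documents gets a negative idf, which is then replaced by eps. A minimal sketch under that assumption:

    import math

    corpus_size = 3
    freq = 3  # a term that occurs in every one of the 3 documents
    idf = math.log(corpus_size - freq + 0.5) - math.log(freq + 0.5)
    print(round(idf, 3))  # -1.946: negative, because the term is very common
    # _initialize replaces such negative idfs with eps = self.epsilon * self.average_idf,
    # so epsilon controls how much these very common terms still contribute to the score.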

@@ -146,13 +168,15 @@ def get_score(self, document, index):
             BM25 score.

         """
-        score = 0
+        score = 0.0
         doc_freqs = self.doc_freqs[index]
+        numerator_constant = self.k1 + 1
+        denominator_constant = self.k1 * (1 - self.b + self.b * self.doc_len[index] / self.avgdl)
         for word in document:
-            if word not in doc_freqs:
-                continue
-            score += (self.idf[word] * doc_freqs[word] * (PARAM_K1 + 1)
-                      / (doc_freqs[word] + PARAM_K1 * (1 - PARAM_B + PARAM_B * self.doc_len[index] / self.avgdl)))
+            if word in doc_freqs:
+                df = self.doc_freqs[index][word]
+                idf = self.idf[word]
+                score += (idf * df * numerator_constant) / (df + denominator_constant)
         return score

     def get_scores(self, document):
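For reference, the weight accumulated per term by the rewritten loop is the usual Okapi BM25 term weight, now driven by the instance-level k1 and b instead of the module constants; restated in math, with f(q, D) = doc_freqs[word], |D| = doc_len[index] and avgdl the average document length:

    \mathrm{score}(D, Q) = \sum_{q \in Q} \mathrm{idf}(q) \cdot \frac{f(q, D)\,(k_1 + 1)}{f(q, D) + k_1 \left(1 - b + b \cdot \frac{|D|}{\mathrm{avgdl}}\right)}

Terms absent from the document contribute zero, which is why the loop can simply skip words not in doc_freqs.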
@@ -236,7 +260,7 @@ def _get_scores(bm25, document):
     return bm25.get_scores(document)


-def iter_bm25_bow(corpus, n_jobs=1):
+def iter_bm25_bow(corpus, n_jobs=1, k1=PARAM_K1, b=PARAM_B, epsilon=EPSILON):
     """Yield BM25 scores (weights) of documents in corpus.
     Each document has to be weighted with every document in given corpus.

@@ -246,6 +270,23 @@ def iter_bm25_bow(corpus, n_jobs=1):
         Corpus of documents.
     n_jobs : int
         The number of processes to use for computing bm25.
+    k1 : float
+        Constant used for influencing the term frequency saturation. After saturation is reached, additional
+        presence for the term adds significantly less additional score. According to [1]_, experiments suggest
+        that 1.2 < k1 < 2 yields reasonably good results, although the optimal value depends on factors such as
+        the type of documents or queries.
+    b : float
+        Constant used for influencing the effects of different document lengths relative to average document length.
+        When b is bigger, lengthier documents (compared to average) have more impact on the score. According to
+        [1]_, experiments suggest that 0.5 < b < 0.8 yields reasonably good results, although the optimal value
+        depends on factors such as the type of documents or queries.
+    epsilon : float
+        Constant used as floor value for idf of a term in the corpus. When epsilon is positive, it restricts
+        negative idf values. Negative idf implies that adding a very common term to a document penalizes the overall
+        score (with 'very common' meaning that it is present in more than half of the documents). That can be
+        undesirable as it means that an identical document would score less than an almost identical one (by
+        removing the referred term). Increasing epsilon above 0 raises the sense of how rare a word has to be (among
+        different documents) to receive an extra score.

     Yields
     -------
@@ -265,7 +306,7 @@ def iter_bm25_bow(corpus, n_jobs=1):
    >>> result = iter_bm25_weights(corpus, n_jobs=-1)

    """
-    bm25 = BM25(corpus)
+    bm25 = BM25(corpus, k1, b, epsilon)

    n_processes = effective_n_jobs(n_jobs)
    if n_processes == 1:
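Since the parameters are now threaded through, callers can tune the model without constructing BM25 themselves. A small sketch, not from the diff, with arbitrary parameter values; per the docstring and the new tests, each yielded item pairs document indices with BM25 weights:

    from gensim.summarization.bm25 import iter_bm25_bow
    from gensim.test.utils import common_texts

    for bow in iter_bm25_bow(common_texts, n_jobs=1, k1=2.0, b=0.5, epsilon=0.1):
        print(bow)  # weights of one document against the whole corpus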
@@ -282,7 +323,7 @@
         pool.join()


-def get_bm25_weights(corpus, n_jobs=1):
+def get_bm25_weights(corpus, n_jobs=1, k1=PARAM_K1, b=PARAM_B, epsilon=EPSILON):
     """Returns BM25 scores (weights) of documents in corpus.
     Each document has to be weighted with every document in given corpus.

@@ -292,6 +333,23 @@
         Corpus of documents.
     n_jobs : int
         The number of processes to use for computing bm25.
+    k1 : float
+        Constant used for influencing the term frequency saturation. After saturation is reached, additional
+        presence for the term adds significantly less additional score. According to [1]_, experiments suggest
+        that 1.2 < k1 < 2 yields reasonably good results, although the optimal value depends on factors such as
+        the type of documents or queries.
+    b : float
+        Constant used for influencing the effects of different document lengths relative to average document length.
+        When b is bigger, lengthier documents (compared to average) have more impact on the score. According to
+        [1]_, experiments suggest that 0.5 < b < 0.8 yields reasonably good results, although the optimal value
+        depends on factors such as the type of documents or queries.
+    epsilon : float
+        Constant used as floor value for idf of a term in the corpus. When epsilon is positive, it restricts
+        negative idf values. Negative idf implies that adding a very common term to a document penalizes the overall
+        score (with 'very common' meaning that it is present in more than half of the documents). That can be
+        undesirable as it means that an identical document would score less than an almost identical one (by
+        removing the referred term). Increasing epsilon above 0 raises the sense of how rare a word has to be (among
+        different documents) to receive an extra score.

     Returns
     -------
@@ -311,7 +369,7 @@ def get_bm25_weights(corpus, n_jobs=1):
    >>> result = get_bm25_weights(corpus, n_jobs=-1)

    """
-    bm25 = BM25(corpus)
+    bm25 = BM25(corpus, k1, b, epsilon)

    n_processes = effective_n_jobs(n_jobs)
    if n_processes == 1:
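The same applies to get_bm25_weights, which returns all weights at once instead of yielding them. Another short sketch with arbitrary parameter values:

    from gensim.summarization.bm25 import get_bm25_weights
    from gensim.test.utils import common_texts

    weights = get_bm25_weights(common_texts, n_jobs=1, k1=1.2, b=0.6, epsilon=0.25)
    print(weights[0])  # BM25 weights of document 0 against every document in the corpus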

gensim/test/test_BM25.py

Lines changed: 89 additions & 1 deletion
@@ -11,7 +11,7 @@
 import logging
 import unittest

-from gensim.summarization.bm25 import get_bm25_weights
+from gensim.summarization.bm25 import get_bm25_weights, iter_bm25_bow, BM25
 from gensim.test.utils import common_texts


@@ -62,6 +62,94 @@ def test_multiprocessing(self):
         self.assertAlmostEqual(weights1, weights3)
         self.assertAlmostEqual(weights2, weights3)

+    def test_k1(self):
+        """ Changing the k1 parameter should give consistent results """
+        corpus = common_texts
+        index = 0
+        doc = corpus[index]
+        first_k1 = 1.0
+        second_k1 = 2.0
+
+        first_bm25 = BM25(corpus, k1=first_k1)
+        second_bm25 = BM25(corpus, k1=second_k1)
+        first_score = first_bm25.get_score(doc, index)
+        second_score = second_bm25.get_score(doc, index)
+        self.assertLess(first_score, second_score)
+
+        first_iter = iter_bm25_bow(corpus, k1=first_k1)
+        second_iter = iter_bm25_bow(corpus, k1=second_k1)
+        first_score = dict(next(iter(first_iter)))[index]
+        second_score = dict(next(iter(second_iter)))[index]
+        self.assertLess(first_score, second_score)
+
+        first_weights = get_bm25_weights(corpus, k1=first_k1)
+        second_weights = get_bm25_weights(corpus, k1=second_k1)
+        first_score = first_weights[index]
+        second_score = second_weights[index]
+        self.assertLess(first_score, second_score)
+
+    def test_b(self):
+        """ Changing the b parameter should give consistent results """
+        corpus = common_texts
+        index = 0
+        doc = corpus[index]
+        first_b = 1.0
+        second_b = 2.0
+
+        first_bm25 = BM25(corpus, b=first_b)
+        second_bm25 = BM25(corpus, b=second_b)
+        first_score = first_bm25.get_score(doc, index)
+        second_score = second_bm25.get_score(doc, index)
+        self.assertLess(first_score, second_score)
+
+        first_iter = iter_bm25_bow(corpus, b=first_b)
+        second_iter = iter_bm25_bow(corpus, b=second_b)
+        first_score = dict(next(iter(first_iter)))[index]
+        second_score = dict(next(iter(second_iter)))[index]
+        self.assertLess(first_score, second_score)
+
+        first_weights = get_bm25_weights(corpus, b=first_b)
+        second_weights = get_bm25_weights(corpus, b=second_b)
+        first_score = first_weights[index]
+        second_score = second_weights[index]
+        self.assertLess(first_score, second_score)
+
+    def test_epsilon(self):
+        """ Changing the epsilon parameter should give consistent results """
+        corpus = [['cat', 'dog', 'mouse'], ['cat', 'lion'], ['cat', 'lion']]
+        first_epsilon = 1.0
+        second_epsilon = 2.0
+        bm25 = BM25(corpus)
+        words_with_negative_idfs = set([
+            word
+            for word, idf in bm25.idf.items()
+            if idf < 0
+        ])
+        index, doc = [
+            (index, document)
+            for index, document
+            in enumerate(corpus)
+            if words_with_negative_idfs & set(document)
+        ][0]
+
+        first_bm25 = BM25(corpus, epsilon=first_epsilon)
+        second_bm25 = BM25(corpus, epsilon=second_epsilon)
+        first_score = first_bm25.get_score(doc, index)
+        second_score = second_bm25.get_score(doc, index)
+        self.assertGreater(first_score, second_score)
+
+        first_iter = iter_bm25_bow(corpus, epsilon=first_epsilon)
+        second_iter = iter_bm25_bow(corpus, epsilon=second_epsilon)
+        first_score = dict(next(iter(first_iter)))[index]
+        second_score = dict(next(iter(second_iter)))[index]
+        self.assertGreater(first_score, second_score)
+
+        first_weights = get_bm25_weights(corpus, epsilon=first_epsilon)
+        second_weights = get_bm25_weights(corpus, epsilon=second_epsilon)
+        first_score = first_weights[index]
+        second_score = second_weights[index]
+        self.assertGreater(first_score, second_score)
+

 if __name__ == '__main__':
     logging.basicConfig(level=logging.DEBUG)
