@@ -69,14 +69,36 @@ class BM25(object):
6969 List of document lengths.
7070 """
7171
72- def __init__ (self , corpus ):
72+ def __init__ (self , corpus , k1 = PARAM_K1 , b = PARAM_B , epsilon = EPSILON ):
7373 """
7474 Parameters
7575 ----------
7676 corpus : list of list of str
7777 Given corpus.
78+ k1 : float
79+ Constant used for influencing the term frequency saturation. After saturation is reached, additional
80+ occurrences of the term add significantly less to the score. According to [1]_, experiments suggest
81+ that 1.2 < k1 < 2 yields reasonably good results, although the optimal value depends on factors such as
82+ the type of documents or queries.
83+ b : float
84+ Constant used for influencing the effects of different document lengths relative to average document length.
85+ When b is bigger, the length of a document (compared to average) affects its score more strongly. According to
86+ [1]_, experiments suggest that 0.5 < b < 0.8 yields reasonably good results, although the optimal value
87+ depends on factors such as the type of documents or queries.
88+ epsilon : float
89+ Constant used as floor value for idf of a term in the corpus. When epsilon is positive, it restricts
90+ negative idf values. Negative idf implies that adding a very common term to a document penalizes the overall
91+ score (with 'very common' meaning that it is present in more than half of the documents). That can be
92+ undesirable as it means that an identical document would score less than an almost identical one (by
93+ removing the referred term). Increasing epsilon above 0 raises the sense of how rare a word has to be (among
94+ different documents) to receive an extra score.
7895
7996 """
97+
98+ self .k1 = k1
99+ self .b = b
100+ self .epsilon = epsilon
101+
80102 self .corpus_size = 0
81103 self .avgdl = 0
82104 self .doc_freqs = []
@@ -126,7 +148,7 @@ def _initialize(self, corpus):
126148 ' unintuitive results.' .format (self .corpus_size )
127149 )
128150
129- eps = EPSILON * self .average_idf
151+ eps = self . epsilon * self .average_idf
130152 for word in negative_idfs :
131153 self .idf [word ] = eps
132154
@@ -146,13 +168,15 @@ def get_score(self, document, index):
146168 BM25 score.
147169
148170 """
149- score = 0
171+ score = 0.0
150172 doc_freqs = self .doc_freqs [index ]
173+ numerator_constant = self .k1 + 1
174+ denominator_constant = self .k1 * (1 - self .b + self .b * self .doc_len [index ] / self .avgdl )
151175 for word in document :
152- if word not in doc_freqs :
153- continue
154- score += ( self .idf [word ] * doc_freqs [ word ] * ( PARAM_K1 + 1 )
155- / ( doc_freqs [ word ] + PARAM_K1 * ( 1 - PARAM_B + PARAM_B * self . doc_len [ index ] / self . avgdl )) )
176+ if word in doc_freqs :
177+ df = self . doc_freqs [ index ][ word ]
178+ idf = self .idf [word ]
179+ score += ( idf * df * numerator_constant ) / ( df + denominator_constant )
156180 return score
157181
158182 def get_scores (self , document ):
@@ -236,7 +260,7 @@ def _get_scores(bm25, document):
236260 return bm25 .get_scores (document )
237261
238262
239- def iter_bm25_bow (corpus , n_jobs = 1 ):
263+ def iter_bm25_bow (corpus , n_jobs = 1 , k1 = PARAM_K1 , b = PARAM_B , epsilon = EPSILON ):
240264 """Yield BM25 scores (weights) of documents in corpus.
241265 Each document has to be weighted with every document in given corpus.
242266
@@ -246,6 +270,23 @@ def iter_bm25_bow(corpus, n_jobs=1):
246270 Corpus of documents.
247271 n_jobs : int
248272 The number of processes to use for computing bm25.
273+ k1 : float
274+ Constant used for influencing the term frequency saturation. After saturation is reached, additional
275+ occurrences of the term add significantly less to the score. According to [1]_, experiments suggest
276+ that 1.2 < k1 < 2 yields reasonably good results, although the optimal value depends on factors such as
277+ the type of documents or queries.
278+ b : float
279+ Constant used for influencing the effects of different document lengths relative to average document length.
280+ When b is bigger, the length of a document (compared to average) affects its score more strongly. According to
281+ [1]_, experiments suggest that 0.5 < b < 0.8 yields reasonably good results, although the optimal value
282+ depends on factors such as the type of documents or queries.
283+ epsilon : float
284+ Constant used as floor value for idf of a term in the corpus. When epsilon is positive, it restricts
285+ negative idf values. Negative idf implies that adding a very common term to a document penalizes the overall
286+ score (with 'very common' meaning that it is present in more than half of the documents). That can be
287+ undesirable as it means that an identical document would score less than an almost identical one (by
288+ removing the referred term). Increasing epsilon above 0 raises the sense of how rare a word has to be (among
289+ different documents) to receive an extra score.
249290
250291 Yields
251292 -------
@@ -265,7 +306,7 @@ def iter_bm25_bow(corpus, n_jobs=1):
265306 >>> result = iter_bm25_weights(corpus, n_jobs=-1)
266307
267308 """
268- bm25 = BM25 (corpus )
309+ bm25 = BM25 (corpus , k1 , b , epsilon )
269310
270311 n_processes = effective_n_jobs (n_jobs )
271312 if n_processes == 1 :
@@ -282,7 +323,7 @@ def iter_bm25_bow(corpus, n_jobs=1):
282323 pool .join ()
283324
284325
285- def get_bm25_weights (corpus , n_jobs = 1 ):
326+ def get_bm25_weights (corpus , n_jobs = 1 , k1 = PARAM_K1 , b = PARAM_B , epsilon = EPSILON ):
286327 """Returns BM25 scores (weights) of documents in corpus.
287328 Each document has to be weighted with every document in given corpus.
288329
@@ -292,6 +333,23 @@ def get_bm25_weights(corpus, n_jobs=1):
292333 Corpus of documents.
293334 n_jobs : int
294335 The number of processes to use for computing bm25.
336+ k1 : float
337+ Constant used for influencing the term frequency saturation. After saturation is reached, additional
338+ occurrences of the term add significantly less to the score. According to [1]_, experiments suggest
339+ that 1.2 < k1 < 2 yields reasonably good results, although the optimal value depends on factors such as
340+ the type of documents or queries.
341+ b : float
342+ Constant used for influencing the effects of different document lengths relative to average document length.
343+ When b is bigger, the length of a document (compared to average) affects its score more strongly. According to
344+ [1]_, experiments suggest that 0.5 < b < 0.8 yields reasonably good results, although the optimal value
345+ depends on factors such as the type of documents or queries.
346+ epsilon : float
347+ Constant used as floor value for idf of a term in the corpus. When epsilon is positive, it restricts
348+ negative idf values. Negative idf implies that adding a very common term to a document penalizes the overall
349+ score (with 'very common' meaning that it is present in more than half of the documents). That can be
350+ undesirable as it means that an identical document would score less than an almost identical one (by
351+ removing the referred term). Increasing epsilon above 0 raises the sense of how rare a word has to be (among
352+ different documents) to receive an extra score.
295353
296354 Returns
297355 -------
@@ -311,7 +369,7 @@ def get_bm25_weights(corpus, n_jobs=1):
311369 >>> result = get_bm25_weights(corpus, n_jobs=-1)
312370
313371 """
314- bm25 = BM25 (corpus )
372+ bm25 = BM25 (corpus , k1 , b , epsilon )
315373
316374 n_processes = effective_n_jobs (n_jobs )
317375 if n_processes == 1 :
0 commit comments