-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathrule_explanations.py
More file actions
325 lines (254 loc) · 12.8 KB
/
rule_explanations.py
File metadata and controls
325 lines (254 loc) · 12.8 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
# Supervision functions are derived from https://github.com/snorkel-team/snorkel-extraction/tree/master/tutorials/cdr
"""
This script contains functions to compile evidences from the supervision functions, which will be
provided to the GPT models for dynamic self-supervision.
The rules that signal the supervision functions were already precomputed into the weak labels.
We only provide evidences for the voted supervision functions.
"""
##### Distant supervision approaches
# We'll use the [Comparative Toxicogenomics Database](http://ctdbase.org/) (CTD) for distant supervision.
# The CTD lists chemical-condition entity pairs under three categories: therapy, marker, and unspecified.
# Therapy means the chemical treats the condition, marker means the chemical is typically present with the condition,
# and unspecified is...unspecified. We can write LFs based on these categories.
### LF_in_ctd_unspecified
def LF_in_ctd_unspecified(c):
    """Explain a CTD 'unspecified' hit: the pair is listed in CTD, which supports the answer."""
    message = (
        'According to the Comparative Toxicogenomics Database, the relation '
        'between the given chemical-condition pair is listed, confirming the answer. '
    )
    return message
### LF_in_ctd_therapy
def LF_in_ctd_therapy(c):
    """Explain a CTD 'therapy' hit: the chemical treats the condition, so 'does not induce' is supported."""
    chem, cond = c["entity1"], c["entity2"]
    return (
        f'According to the Comparative Toxicogenomics Database, the given '
        f'chemical-condition pair "{chem}-{cond}" is listed that the chemical '
        f'actually treats the condition, so the answer that {chem} does not '
        f'induce {cond} is confirmed. '
    )
### LF_in_ctd_marker
def LF_in_ctd_marker(c):
    """Explain a CTD 'marker' hit: the chemical co-occurs with the condition, weakly supporting induction."""
    chem, cond = c["entity1"], c["entity2"]
    return (
        f'According to the Comparative Toxicogenomics Database, the given '
        f'chemical-condition pair "{chem}-{cond}" is listed that the chemical '
        f'is typically present with the condition, which may confirm the '
        f'answer if {chem} induces {cond}. '
    )
##### Text pattern approaches
# Now we'll use some LF helpers to create LFs based on indicative text patterns.
# We came up with these rules by using the viewer to examine training candidates and noting frequent patterns.
import re
# List to parenthetical
def ltp(x):
    """List-to-parenthetical: join alternatives into one regex group, e.g. ['a', 'b'] -> '(a|b)'."""
    return '({})'.format('|'.join(x))
### LF_induce
def LF_induce(c):
    """Quote the text spanning both entity mentions as evidence of induction."""
    lo = min(c['span1'][0], c['span2'][0])
    hi = max(c['span1'][1], c['span2'][1])
    snippet = c['text'][lo:hi]
    return f"Based on the expression '{snippet}', it is likely that {c['entity1']} induces {c['entity2']}. "
### LF_d_induced_by_c
# NOTE(review): cue list presumably used by the precomputed rule upstream;
# this function itself only quotes the span between the two mentions.
causal_past = ['induced', 'caused', 'due']


def LF_d_induced_by_c(c):
    """Quote the text spanning both entity mentions as evidence of induction (past-tense causal cue)."""
    lo = min(c['span1'][0], c['span2'][0])
    hi = max(c['span1'][1], c['span2'][1])
    snippet = c['text'][lo:hi]
    return f"Based on the expression '{snippet}', it is likely that {c['entity1']} induces {c['entity2']}. "
### LF_d_induced_by_c_tight
def LF_d_induced_by_c_tight(c):
    """Quote the text spanning both entity mentions as evidence of induction (tight variant of the rule)."""
    bounds = sorted([c['span1'][0], c['span1'][1], c['span2'][0], c['span2'][1]])
    snippet = c['text'][bounds[0]:bounds[-1]]
    return f"Based on the expression '{snippet}', it is likely that {c['entity1']} induces {c['entity2']}. "
### LF_induce_name
def LF_induce_name(c):
    """Explain a hit where the chemical's own name signals induction (rule precomputed upstream)."""
    chem = c["entity1"]
    return f'The expression "{chem}" indicates that the disease might be induced by the drug. '
### LF_c_cause_d
# NOTE(review): regex cues presumably used by the precomputed rule upstream.
causal = ['cause[sd]?', 'induce[sd]?', 'associated with']


def LF_c_cause_d(c):
    """Quote the text spanning both entity mentions as evidence that the chemical causes the disease."""
    lo = min(c['span1'][0], c['span2'][0])
    hi = max(c['span1'][1], c['span2'][1])
    snippet = c['text'][lo:hi]
    return f"Based on the expression '{snippet}', it is likely that {c['entity1']} induces {c['entity2']}. "
### LF_d_treat_c
# Treatment cue stems; also referenced by LF_treat_d below.
treat = ['treat', 'effective', 'prevent', 'resistant', 'slow', 'promise', 'therap']


def LF_d_treat_c(c):
    """Quote the text spanning both entity mentions; a treatment cue fired, but the message still asserts induction (as in the original rule set)."""
    lo = min(c['span1'][0], c['span2'][0])
    hi = max(c['span1'][1], c['span2'][1])
    snippet = c['text'][lo:hi]
    return f"Based on the expression '{snippet}', it is likely that {c['entity1']} induces {c['entity2']}. "
### LF_c_treat_d
def LF_c_treat_d(c):
    """Quote the text spanning both entity mentions as evidence that the
    chemical treats (rather than induces) the disease.

    Fix: the original message contained a stray '. ,' ("treats X. , so") —
    normalized to a single comma, matching LF_treat_d's wording.
    """
    start = min(c['span1'][0], c['span2'][0])
    end = max(c['span1'][1], c['span2'][1])
    return f"Based on the expression '{c['text'][start:end]}', {c['entity1']} actually treats {c['entity2']}, so it is not likely that {c['entity1']} induces {c['entity2']}. "
### LF_treat_d
def LF_treat_d(c):
    """Quote '<treatment cue> ... <disease>' as evidence that the chemical
    treats (rather than induces) the disease.

    Fix: the disease name is now passed through re.escape, so entity names
    containing regex metacharacters (parentheses, '+', ...) match literally —
    consistent with LF_improve_before_disease and LF_measure.
    Assumes the pattern matches (the SF only fires when the precomputed rule
    hit); otherwise re.search returns None and .span() raises, as before.
    """
    span = re.search(ltp(treat) + '.{0,50}' + re.escape(c['entity2']), c['text'], re.IGNORECASE).span()
    evidence = c['text'][span[0] : span[1]]
    return f"Based on the expression '{evidence}', {c['entity1']} actually treats {c['entity2']}, so it is not likely that {c['entity1']} induces {c['entity2']}. "
### LF_c_treat_d_wide
def LF_c_treat_d_wide(c):
    """Quote the text spanning both entity mentions as evidence that the
    chemical treats (rather than induces) the disease — wide-window variant.

    Fix: the original message contained a stray '. ,' ("treats X. , so") —
    normalized to a single comma, matching LF_treat_d's wording.
    """
    start = min(c['span1'][0], c['span2'][0])
    end = max(c['span1'][1], c['span2'][1])
    return f"Based on the expression '{c['text'][start:end]}', {c['entity1']} actually treats {c['entity2']}, so it is not likely that {c['entity1']} induces {c['entity2']}. "
### LF_c_d
def LF_c_d(c):
    """Quote the text spanning both entity mentions as evidence of close co-mention."""
    lo = min(c['span1'][0], c['span2'][0])
    hi = max(c['span1'][1], c['span2'][1])
    snippet = c['text'][lo:hi]
    return f"Based on the expression '{snippet}', {c['entity1']} is closely mentioned with {c['entity2']}, so they should be closely related. "
### LF_c_induced_d
def LF_c_induced_d(c):
    """Quote the text spanning both entity mentions as co-mention evidence
    supporting induction.

    Fix: the original message contained a stray '. ,' ("with X. , so") —
    normalized to a single comma, matching LF_c_d's wording.
    """
    start = min(c['span1'][0], c['span2'][0])
    end = max(c['span1'][1], c['span2'][1])
    return f"Based on the expression '{c['text'][start:end]}', {c['entity1']} is closely mentioned with {c['entity2']}, so it is likely that {c['entity1']} induces {c['entity2']}. "
### LF_improve_before_disease
def LF_improve_before_disease(c):
    """Quote 'improv... <disease>' as evidence the disease improved, arguing against induction.

    Assumes the pattern matches (the SF only fires when the precomputed rule
    hit); otherwise re.search returns None and .span() raises.
    """
    match = re.search('improv.*' + re.escape(c['entity2']), c['text'], re.IGNORECASE)
    start, end = match.span()
    snippet = c['text'][start:end]
    return f"Based on the expression '{snippet}', the disease {c['entity2']} is actually improved, so it is not likely that {c['entity1']} induces {c['entity2']}. "
### LF_in_patient_with
# NOTE(review): pat_terms[0] carries a trailing space while pat_terms[1] does
# not — preserved as-is, since the weak labels were precomputed with it.
pat_terms = ['in a patient with ', 'in patients with']


def LF_in_patient_with(c):
    """Quote 'in (a) patient(s) with <disease>' as evidence the disease was a
    pre-existing condition, arguing against induction.

    Fix: the disease name is now passed through re.escape, so names containing
    regex metacharacters match literally (consistent with
    LF_improve_before_disease and LF_measure). Assumes the pattern matches;
    otherwise re.search returns None and .span() raises, as before.
    """
    span = re.search(ltp(pat_terms) + '.{0,5}' + re.escape(c['entity2']), c['text'], re.IGNORECASE).span()
    evidence = c['text'][span[0] : span[1]]
    return f"Based on the expression '{evidence}', {c['entity2']} is the initial condition of the patient(s), so it is not likely that {c['entity1']} induces {c['entity2']}. "
### LF_uncertain
# Cues marking an uncertain/combined claim.
uncertain = ['combin', 'possible', 'unlikely']


def LF_uncertain(c):
    """Quote the text from an uncertainty cue up to the nearest entity end.

    Fix: the original took min() over entity ends strictly after the cue,
    which raised ValueError when the cue appeared after BOTH entity spans;
    we now fall back to the cue's own end in that case.
    Assumes the cue pattern matches; otherwise re.search returns None and
    .span() raises, as before.
    """
    span = re.search(ltp(uncertain), c['text'], re.IGNORECASE).span()
    # Entity ends that lie beyond the matched cue; empty if the cue is last.
    ends = [end for end in (c['span1'][1], c['span2'][1]) if end > span[1]]
    evidence = c['text'][span[0] : min(ends, default=span[1])]
    return f"Based on the expression '{evidence}', it is uncertain that {c['entity1']} induces {c['entity2']}. "
### LF_induced_other
def LF_induced_other(c):
    """Quote '-induced <disease>' as evidence the disease was induced by some
    OTHER factor, arguing against this chemical inducing it.

    Fix: the disease name is now passed through re.escape, so names containing
    regex metacharacters (e.g. parentheses) match literally — consistent with
    LF_improve_before_disease and LF_measure. Assumes the pattern matches;
    otherwise re.search returns None and .span() raises, as before.
    """
    span = re.search('-induced' + '.{0,5}' + re.escape(c['entity2']), c['text'], re.IGNORECASE).span()
    evidence = c['text'][span[0] : span[1]]
    return f"Based on the expression '{evidence}', {c['entity2']} is induced by other factors, so it is not likely that {c['entity1']} induces {c['entity2']}. "
### LF_far_c_d
def LF_far_c_d(c):
    """Explain a distance-based miss: the mentions are far apart (chemical-then-disease order)."""
    chem, cond = c['entity1'], c['entity2']
    return f"{chem} and {cond} are not closely mentioned in the text, so it is not likely that {chem} induces {cond}. "
### LF_far_d_c
def LF_far_d_c(c):
    """Explain a distance-based miss: the mentions are far apart (disease-then-chemical order)."""
    chem, cond = c['entity1'], c['entity2']
    return f"{chem} and {cond} are not closely mentioned in the text, so it is not likely that {chem} induces {cond}. "
### LF_risk_d
def LF_risk_d(c):
    """Quote 'risk of <disease>' as evidence supporting induction.

    Fix: the disease name is now passed through re.escape, so names containing
    regex metacharacters match literally — consistent with
    LF_improve_before_disease and LF_measure. Assumes the pattern matches;
    otherwise re.search returns None and .span() raises, as before.
    """
    span = re.search('risk of' + '.{0,5}' + re.escape(c['entity2']), c['text'], re.IGNORECASE).span()
    evidence = c['text'][span[0] : span[1]]
    return f"Based on the expression '{evidence}', it is likely that {c['entity1']} induces {c['entity2']}. "
### LF_develop_d_following_c
def LF_develop_d_following_c(c):
    """Quote 'develop ... <disease> ... following ... <chemical>' as evidence
    supporting induction.

    Fix: both entity names are now passed through re.escape, so names
    containing regex metacharacters match literally — consistent with
    LF_improve_before_disease and LF_measure. Assumes the pattern matches;
    otherwise re.search returns None and .span() raises, as before.
    """
    pattern = ('develop.{0,25}' + re.escape(c['entity2'])
               + '.{0,25}following.{0,25}' + re.escape(c['entity1']))
    span = re.search(pattern, c['text'], re.IGNORECASE).span()
    evidence = c['text'][span[0] : span[1]]
    return f"Based on the expression '{evidence}', it is likely that {c['entity1']} induces {c['entity2']}. "
### LF_d_following_c
# Administration-procedure stems and the 'following' connective used below.
procedure, following = ['inject', 'administrat'], ['following']


def LF_d_following_c(c):
    """Quote '<disease> ... following ... <chemical> ... <procedure>' as
    evidence supporting induction.

    Fix: both entity names are now passed through re.escape, so names
    containing regex metacharacters match literally — consistent with
    LF_improve_before_disease and LF_measure. Assumes the pattern matches;
    otherwise re.search returns None and .span() raises, as before.
    """
    pattern = (re.escape(c['entity2']) + '.{0,50}' + ltp(following) + '.{0,20}'
               + re.escape(c['entity1']) + '.{0,50}' + ltp(procedure))
    span = re.search(pattern, c['text'], re.IGNORECASE).span()
    evidence = c['text'][span[0] : span[1]]
    return f"Based on the expression '{evidence}', it is likely that {c['entity1']} induces {c['entity2']}. "
### LF_measure
def LF_measure(c):
    """Quote 'measur... <chemical>' as evidence the chemical was merely measured, arguing against induction.

    Assumes the pattern matches (the SF only fires when the precomputed rule
    hit); otherwise re.search returns None and .span() raises.
    """
    match = re.search('measur.{0,75}' + re.escape(c['entity1']), c['text'], re.IGNORECASE)
    start, end = match.span()
    snippet = c['text'][start:end]
    return f"Based on the expression '{snippet}', it is not likely that {c['entity1']} induces {c['entity2']}. "
### LF_level
def LF_level(c):
    """Quote '<chemical> ... level' as evidence the text is about chemical
    levels, arguing against induction.

    Fix: the chemical name is now passed through re.escape, so names
    containing regex metacharacters (e.g. '+', parentheses) match literally —
    consistent with LF_improve_before_disease and LF_measure. Assumes the
    pattern matches; otherwise re.search returns None and .span() raises.
    """
    span = re.search(re.escape(c['entity1']) + '.{0,25} level', c['text'], re.IGNORECASE).span()
    evidence = c['text'][span[0] : span[1]]
    return f"Based on the expression '{evidence}', it is not likely that {c['entity1']} induces {c['entity2']}. "
### LF_neg_d
def LF_neg_d(c):
    """Quote a negation ('none'/'not'/'no') near either entity as evidence
    against induction. The chemical is tried first, then the disease,
    matching the original precedence.

    Fixes: each pattern is now searched only once (the original ran every
    re.search twice — once in the condition, once for .span()), and entity
    names are passed through re.escape so regex metacharacters match
    literally. Like the original, this assumes one of the searches hits
    (the SF only fires when the precomputed rule matched); otherwise it
    raises (AttributeError here vs. UnboundLocalError before).
    """
    match = None
    for entity in (c['entity1'], c['entity2']):
        match = re.search('(none|not|no) .{0,25}' + re.escape(entity), c['text'], re.IGNORECASE)
        if match:
            break
    start, end = match.span()
    evidence = c['text'][start:end]
    return f"Based on the expression '{evidence}', it is not likely that {c['entity1']} induces {c['entity2']}. "
### LF_weak_assertions
# Hedging / weak-assertion cues; WEAK_RGX is their regex alternation.
WEAK_PHRASES = ['none', 'although', 'was carried out', 'was conducted',
                'seems', 'suggests', 'risk', 'implicated',
                'the aim', 'to (investigate|assess|study)']
WEAK_RGX = r'|'.join(WEAK_PHRASES)


def LF_weak_assertions(c):
    """Quote the first weak/hedging phrase found in the text as evidence that no strong induction claim is made.

    Assumes a phrase matches (the SF only fires when the precomputed rule
    hit); otherwise re.search returns None and .span() raises.
    """
    start, end = re.search(WEAK_RGX, c['text'], re.IGNORECASE).span()
    hit = c['text'][start:end]
    return f"According to phrases like '{hit}', there is no strong signal that {c['entity1']} induces {c['entity2']}. "
##### Composite LFs
# The following LFs take some of the strongest distant supervision and text pattern LFs,
# and combine them to form more specific LFs. These LFs introduce some obvious
# dependencies within the LF set, which we will model later.
### LF_ctd_marker_c_d
def LF_ctd_marker_c_d(c):
    """Composite SF (CTD 'marker' + close co-mention); the evidence text itself comes from LF_in_ctd_marker."""
    return LF_in_ctd_marker(c)
### LF_ctd_marker_induce
def LF_ctd_marker_induce(c):
    """Composite SF (CTD 'marker' + induce pattern); the evidence text itself comes from LF_in_ctd_marker."""
    return LF_in_ctd_marker(c)
### LF_ctd_therapy_treat
def LF_ctd_therapy_treat(c):
    """Composite SF (CTD 'therapy' + treat pattern); the evidence text itself comes from LF_in_ctd_therapy."""
    return LF_in_ctd_therapy(c)
### LF_ctd_unspecified_treat
def LF_ctd_unspecified_treat(c):
    """Composite SF (CTD 'unspecified' + treat pattern); the evidence text itself comes from LF_in_ctd_unspecified."""
    return LF_in_ctd_unspecified(c)
### LF_ctd_unspecified_induce
def LF_ctd_unspecified_induce(c):
    """Composite SF (CTD 'unspecified' + induce pattern); the evidence text itself comes from LF_in_ctd_unspecified."""
    return LF_in_ctd_unspecified(c)
##### Rules based on context hierarchy
# These last two rules will make use of the context hierarchy.
# The first checks if there is a chemical mention much closer to the candidate's disease mention
# than the candidate's chemical mention. The second does the analog for diseases.
### LF_closer_chem
def LF_closer_chem(c):
    """Explain a context-hierarchy hit: some other chemical sits closer to the disease mention."""
    chem, cond = c['entity1'], c['entity2']
    return f"According to the text, another chemical is mentioned closer to {cond} than {chem}, so it is not likely that {chem} induces {cond}. "
### LF_closer_dis
def LF_closer_dis(c):
    """Explain a context-hierarchy hit: some other disease sits closer to the chemical mention."""
    chem, cond = c['entity1'], c['entity2']
    return f"According to the text, another disease is mentioned closer to {chem} than {cond}, so it is not likely that {chem} induces {cond}. "
# Registry of all supervision functions, listed alphabetically.
# NOTE: ORDER MATTERS — get_explanation_from_lfs indexes this list by
# position (its grouped indices 1/7, 2/8/11/14 and 4/9/10 refer to entries
# here), so do not reorder, insert, or remove entries without updating
# those index groups.
LFs = [
    LF_c_cause_d,
    LF_c_d,
    LF_c_induced_d,
    LF_c_treat_d,
    LF_c_treat_d_wide,
    LF_closer_chem,
    LF_closer_dis,
    LF_ctd_marker_c_d,
    LF_ctd_marker_induce,
    LF_ctd_therapy_treat,
    LF_ctd_unspecified_treat,
    LF_ctd_unspecified_induce,
    LF_d_following_c,
    LF_d_induced_by_c,
    LF_d_induced_by_c_tight,
    LF_d_treat_c,
    LF_develop_d_following_c,
    LF_far_c_d,
    LF_far_d_c,
    LF_improve_before_disease,
    LF_in_ctd_therapy,
    LF_in_ctd_marker,
    LF_in_patient_with,
    LF_induce,
    LF_induce_name,
    LF_induced_other,
    LF_level,
    LF_measure,
    LF_neg_d,
    LF_risk_d,
    LF_treat_d,
    LF_uncertain,
    LF_weak_assertions,
]
def _append_explanation(j, x, l, explain):
"""
Helper function to get explanation for the supervision function j on example x. l is the label from SF j.
"""
if l < 0:
return explain
evidence = LFs[j](x)
if evidence not in explain:
explain += evidence
return explain
"""
Main function to call to compile the evidences.
The rules that signal the supervision functions were already precomputed into the weak labels.
We only provide evidences for the voted supervision functions.
"""
def get_explanation_from_lfs(x, L):
    """Compile the evidence string for example x from its weak-label vector L.

    For a few correlated SF index groups (1/7, 2/8/11/14, 4/9/10), hitting any
    member expands the whole group; _append_explanation skips non-voting SFs
    (negative labels) and deduplicates repeated evidence, so re-expansion of a
    group is harmless.
    """
    explain = ''
    for j in range(len(L)):
        if j in (1, 7):
            group = (1, 7)
        elif j in (2, 14, 8):
            group = (2, 14, 8, 11)
        elif j in (4, 9, 10):
            group = (4, 9, 10)
        else:
            group = (j,)
        for jj in group:
            explain = _append_explanation(jj, x, L[jj], explain)
    return explain