From cf887d7636361382684c69edff7f44bd4cd2108b Mon Sep 17 00:00:00 2001 From: MubashirullahD <45071858+MubashirullahD@users.noreply.github.com> Date: Sat, 28 Mar 2020 07:24:25 +0500 Subject: [PATCH] improve regex for html tag removal HTML tag removal can be done with a single re.sub call. Each tag is now replaced with a space instead of an empty string, and newlines/tabs are no longer stripped by remove_html_tags_fun. Also fixes typos in nearby docstrings. --- nlppreprocess/nlppreprocess.py | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/nlppreprocess/nlppreprocess.py b/nlppreprocess/nlppreprocess.py index 5a2ca87..e6dea1b 100644 --- a/nlppreprocess/nlppreprocess.py +++ b/nlppreprocess/nlppreprocess.py @@ -89,24 +89,21 @@ def replace_words_fun(self): def remove_numbers_fun(self): """ - This function uses regex to remve + This function uses regex to remove all the numbers from the doc. """ self.doc = re.sub("[0-9]", "", self.doc) def remove_html_tags_fun(self): """ - This function uses regex's complile method - to remove all the HTML tags from the doc + This function uses regex to remove + all the HTML tags from the doc """ - cleaner = re.compile('<.*?>') - cleaned_text = re.sub(cleaner, '', self.doc) - cleaned_text = re.sub('[\n\t]', '', cleaned_text) - self.doc = cleaned_text + self.doc = re.sub(r"<[^<>]+>", ' ', self.doc) def remove_punctations_fun(self): """ - This function uses regex to remove alk the + This function uses regex to remove all the punctations from the doc. """ self.doc = re.sub('[^a-zA-Z0-9]', ' ', self.doc)