From cf887d7636361382684c69edff7f44bd4cd2108b Mon Sep 17 00:00:00 2001
From: MubashirullahD <45071858+MubashirullahD@users.noreply.github.com>
Date: Sat, 28 Mar 2020 07:24:25 +0500
Subject: [PATCH] improve regex for html tag removal

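HTML tag removal can be done with a single re.sub call. Each tag is now
replaced with a space instead of being deleted, and newlines/tabs are no
longer stripped from the doc. Also fixes typos in nearby docstrings.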
---
 nlppreprocess/nlppreprocess.py | 13 +++++--------
 1 file changed, 5 insertions(+), 8 deletions(-)

diff --git a/nlppreprocess/nlppreprocess.py b/nlppreprocess/nlppreprocess.py
index 5a2ca87..e6dea1b 100644
--- a/nlppreprocess/nlppreprocess.py
+++ b/nlppreprocess/nlppreprocess.py
@@ -89,24 +89,21 @@ def replace_words_fun(self):
 
     def remove_numbers_fun(self):
         """
-        This function uses regex to remve
+        This function uses regex to remove
         all the numbers from the doc.
         """
         self.doc = re.sub("[0-9]", "", self.doc)
 
     def remove_html_tags_fun(self):
         """
-        This function uses regex's complile method
-        to remove all the HTML tags from the doc
+        This function uses regex to remove
+        all the HTML tags from the doc
         """
-        cleaner = re.compile('<.*?>')
-        cleaned_text = re.sub(cleaner, '', self.doc)
-        cleaned_text = re.sub('[\n\t]', '', cleaned_text)
-        self.doc = cleaned_text
+        self.doc = re.sub(r"<[^<>]+>", ' ', self.doc)
 
     def remove_punctations_fun(self):
         """
-        This function uses regex to remove alk the
+        This function uses regex to remove all the
         punctations from the doc.
         """ 
         self.doc = re.sub('[^a-zA-Z0-9]', ' ', self.doc)