diff --git a/nemo_text_processing/inverse_text_normalization/hi/data/telephone/context_cues.tsv b/nemo_text_processing/inverse_text_normalization/hi/data/telephone/context_cues.tsv new file mode 100644 index 000000000..8373c52df --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/hi/data/telephone/context_cues.tsv @@ -0,0 +1,15 @@ +mobile नंबर +mobile मोबाइल +mobile फोन +mobile कॉल +landline नंबर +landline मोबाइल +landline फोन +landline लैंडलाइन +landline कॉल +pincode पिन +pincode कोड +pincode पिनकोड +credit नंबर +credit कार्ड +credit क्रेडिट \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/hi/data/telephone/country_codes.tsv b/nemo_text_processing/inverse_text_normalization/hi/data/telephone/country_codes.tsv new file mode 100644 index 000000000..268d326b1 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/hi/data/telephone/country_codes.tsv @@ -0,0 +1 @@ +९१ नौ एक \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/hi/data/telephone/eng_to_hindi_digit.tsv b/nemo_text_processing/inverse_text_normalization/hi/data/telephone/eng_digit.tsv similarity index 80% rename from nemo_text_processing/inverse_text_normalization/hi/data/telephone/eng_to_hindi_digit.tsv rename to nemo_text_processing/inverse_text_normalization/hi/data/telephone/eng_digit.tsv index 53c5e36cb..6049cbf50 100644 --- a/nemo_text_processing/inverse_text_normalization/hi/data/telephone/eng_to_hindi_digit.tsv +++ b/nemo_text_processing/inverse_text_normalization/hi/data/telephone/eng_digit.tsv @@ -1,4 +1,3 @@ -० zero १ one २ two ३ three @@ -7,4 +6,4 @@ ६ six ७ seven ८ eight -९ nine +९ nine \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/hi/data/telephone/eng_zero.tsv b/nemo_text_processing/inverse_text_normalization/hi/data/telephone/eng_zero.tsv new file mode 100644 index 000000000..769cbb603 --- /dev/null +++ 
import pynini
from pynini.lib import pynutil

from nemo_text_processing.inverse_text_normalization.hi.graph_utils import (
    NEMO_CHAR,
    NEMO_WHITE_SPACE,
    GraphFst,
    delete_space,
)
from nemo_text_processing.inverse_text_normalization.hi.utils import get_abs_path

# The TSV files store "<written digit>\t<spoken word>"; inverting them yields
# transducers from the spoken word (Hindi or English) to the written digit.
shunya = (
    pynini.string_file(get_abs_path("data/numbers/zero.tsv")).invert()
    | pynini.string_file(get_abs_path("data/telephone/eng_zero.tsv")).invert()
)
digit_without_shunya = (
    pynini.string_file(get_abs_path("data/numbers/digit.tsv")).invert()
    | pynini.string_file(get_abs_path("data/telephone/eng_digit.tsv")).invert()
)
digit = digit_without_shunya | shunya

# Acceptor over the spoken digit words (the input side of `digit`).
# `copy()` is required because `Fst.project` mutates in place.
_DIGIT_WORDS = digit.copy().project("input").optimize()
_NOT_SPACE = pynini.difference(NEMO_CHAR, NEMO_WHITE_SPACE).optimize()
# One context word: a run of non-whitespace characters that is not itself a
# spoken digit. The exclusion has to happen at word level: digit words are
# several characters long, so subtracting them from a single-character
# acceptor would remove nothing.
_CONTEXT_WORD = pynini.difference(pynini.closure(_NOT_SPACE, 1), _DIGIT_WORDS).optimize()
# Maximum number of ordinary words allowed between a cue word and the digits.
_MAX_CONTEXT_WORDS = 5


def get_context(keywords: list):
    """
    Builds the acceptors for the text surrounding a number when a cue word is present.

    Args:
        keywords: non-empty list of FSTs (or strings) accepting the cue words

    Returns:
        (before, after) acceptors:
            before: "<cue>( <word>){0,5}"  - placed in front of " <digits>"
            after:  "(<word> ){0,5}<cue>"  - placed behind "<digits> "
        Neither includes the whitespace that touches the digits; callers insert it.
    """
    cue = pynini.union(*keywords)

    # Every window word brings its own separator, so that together with the
    # single whitespace added by the caller the words are separated by exactly
    # one whitespace on both sides of the number.
    words_after_cue = pynini.closure(NEMO_WHITE_SPACE + _CONTEXT_WORD, 0, _MAX_CONTEXT_WORDS)
    words_before_cue = pynini.closure(_CONTEXT_WORD + NEMO_WHITE_SPACE, 0, _MAX_CONTEXT_WORDS)

    before = (cue + words_after_cue).optimize()
    after = (words_before_cue + cue).optimize()
    return before, after


def generate_context_graph(context_keywords, length):
    """
    Builds a `number_part` graph for exactly `length` spoken digits, optionally
    preceded or followed by a context cue (the context text is kept verbatim
    inside `number_part`).

    Args:
        context_keywords: list of FSTs accepting the cue words
        length: number of digits, must be >= 1

    Raises:
        ValueError: if `length` is smaller than 1
    """
    if length < 1:
        raise ValueError(f"length must be >= 1, got {length}")

    context_before, context_after = get_context(context_keywords)
    # length - 1 digits each followed by a deleted space, then the last digit.
    digits = pynini.closure(digit + delete_space, length - 1, length - 1) + digit

    with_context_before = context_before + NEMO_WHITE_SPACE + digits
    with_context_after = digits + NEMO_WHITE_SPACE + context_after

    return (
        pynutil.insert("number_part: \"")
        + (with_context_before | with_context_after | digits)
        + pynutil.insert("\" ")
    ).optimize()


def generate_pincode(context_keywords):
    """Six-digit PIN code graph."""
    return generate_context_graph(context_keywords, 6)


def generate_credit(context_keywords):
    """Four-digit graph for the last digits of a credit card."""
    return generate_context_graph(context_keywords, 4)


def generate_mobile(context_keywords):
    """
    Builds the mobile number graph: ten digits whose first digit is not zero,
    optionally preceded by "प्लस" and a two-digit country code.

    Args:
        context_keywords: list of FSTs accepting the cue words
    """
    context_before, context_after = get_context(context_keywords)

    # "प्लस" <d> <d> -> "+<d><d>"; the whitespace before the number is kept.
    country_code = pynini.cross("प्लस", "+") + pynini.closure(delete_space + digit, 2, 2) + NEMO_WHITE_SPACE
    graph_country_code = (
        pynutil.insert("country_code: \"")
        + pynini.closure(context_before + NEMO_WHITE_SPACE, 0, 1)
        + country_code
        + pynutil.insert("\" ")
    )

    number_part = digit_without_shunya + delete_space + pynini.closure(digit + delete_space, 8, 8) + digit
    graph_number = (
        pynutil.insert("number_part: \"")
        + number_part
        + pynini.closure(NEMO_WHITE_SPACE + context_after, 0, 1)
        + pynutil.insert("\" ")
    )

    graph = (graph_country_code + graph_number) | graph_number
    return graph.optimize()


def generate_telephone(context_keywords):
    """
    Builds the landline graph: eleven digits starting with zero, optionally
    preceded or followed by a context cue.

    Args:
        context_keywords: list of FSTs accepting the cue words
    """
    context_before, context_after = get_context(context_keywords)

    landline = shunya + delete_space + pynini.closure(digit + delete_space, 9, 9) + digit
    landline_with_context_before = context_before + NEMO_WHITE_SPACE + landline
    landline_with_context_after = landline + NEMO_WHITE_SPACE + context_after

    return (
        pynutil.insert("number_part: \"")
        + (landline | landline_with_context_before | landline_with_context_after)
        + pynutil.insert("\" ")
    ).optimize()


def _category_keywords(context_cues, category: str):
    """
    Returns an acceptor over the cue words listed for `category` in `context_cues`.

    The category name must be matched with an acceptor. Composing with
    `pynutil.delete(category)` always gives an empty FST, because `delete`
    outputs the empty string while every row of the cue table has a non-empty
    input side.
    """
    return pynini.compose(pynini.accep(category), context_cues).project("output").optimize()


class TelephoneFst(GraphFst):
    """
    Finite state transducer for classifying telephone numbers, PIN codes and
    the last four digits of a credit card, e.g.
        प्लस नौ एक नौ आठ सात छह पाँच चार तीन दो एक शून्य => +९१ ९८७६५४३२१०
        शून्य एक एक दो छह एक दो तीन चार पाँच छह => ०११२६१२३४५६
        एक दो तीन चार पाँच छह => १२३४५६
        एक दो तीन चार => १२३४

    Args:
        cardinal: CardinalFst (unused; accepted so existing callers keep working)
    """

    def __init__(self, cardinal: GraphFst = None):
        super().__init__(name="telephone", kind="classify")

        # Table of "<category>\t<cue word>" rows.
        context_cues = pynini.string_file(get_abs_path("data/telephone/context_cues.tsv"))

        # Each generate_* helper expects a list of keyword acceptors.
        mobile = generate_mobile([_category_keywords(context_cues, "mobile")])
        landline = generate_telephone([_category_keywords(context_cues, "landline")])
        pincode = generate_pincode([_category_keywords(context_cues, "pincode")])
        credit = generate_credit([_category_keywords(context_cues, "credit")])

        # NOTE(review): presumably the lower weight wins when several of these
        # sub-graphs accept the same input - confirm against the decoder.
        graph = (
            pynutil.add_weight(mobile, 0.7)
            | pynutil.add_weight(landline, 0.8)
            | pynutil.add_weight(credit, 0.9)
            | pynutil.add_weight(pincode, 1)
        )

        self.final = graph.optimize()
        self.fst = self.add_tokens(self.final)
सात छह पांच चार तीन दो एक शून्य~+९१ ९८७६५४३२१० -plus nine eight nine four one one one two three four zero one~+९८ ९४१११२३४०१ -प्लस नौ एक नौ आठ सात छह पांच चार तीन दो एक शून्य~+९१ ९८७६५४३२१० -plus eleven nine four one one one two three~+११ ९४१११२३ -zero eight zero two nine four one one one two~०८० २९४१११२ -शून्य आठ शून्य दो नौ चार एक एक एक दो~०८० २९४१११२ -zero four zero two seven eight one eight three nine~०४० २७८१८३९ -शून्य चार शून्य दो सात आठ एक आठ तीन नौ~०४० २७८१८३९ -शून्य सात नौ एक नौ आठ सात छह पांच चार~०७९ १९८७६५४ -प्लस नौ एक नौ तीन आठ दो सात एक चार छह पांच शून्य~+९१ ९३८२७१४६५० -प्लस नौ एक नौ शून्य पांच एक तीन चार आठ दो सात छह~+९१ ९०५१३४८२७६ -प्लस नौ एक नौ चार तीन सात दो शून्य पांच छह एक आठ~+९१ ९४३७२०५६१८ -PLUS ninety one nine three eight two seven one four six five zero~+९१ ९३८२७१४६५० -plus nine one nine zero five one three four eight two seven six~+९१ ९०५१३४८२७६ -plus ninety one nine four three seven two zero five six one eight~+९१ ९४३७२०५६१८ -ZERO seven three चार पाँच छह सात आठ नौ शून्य~०७३ ४५६७८९० -शून्य चार शून्य पाँच चार एक दो सात तीन आठ~०४० ५४१२७३८ -ZERO seven three four five six seven eight nine zero~०७३ ४५६७८९० -zero two eight seven six five four three two seven~०२८ ७६५४३२७ -PLUS eighty one nine seven four seven two zero zero one one eight~+८१ ९७४७२००११८ -zero eight zero two two nine four one one one~०८० २२९४१११ -शून्य सात नौ एक नौ आठ सात छह पांच चार~०७९ १९८७६५४ -zero eight zero nine two two nine four one one one~०८०९ २२९४१११ -शून्य सात नौ नौ एक नौ आठ सात छह पांच चार~०७९९ १९८७६५४ -zero three one nine two two two nine four one one one~०३१९२ २२९४१११ -शून्य सात नौ एक एक एक नौ आठ सात छह पांच चार~०७९११ १९८७६५४ -एक एक शून्य शून्य सात शून्य दिल्ली के वसंत कुंज का पिनकोड है~११००७० दिल्ली के वसंत कुंज का पिनकोड है -बंगलौर के बैयापानहली का पिनकोड पाँच छह शून्य शून्य तीन आठ है~बंगलौर के बैयापानहली का पिनकोड ५६००३८ है -दिल्ली के वसंत कुंज का पिनकोड one one zero zero seven zero है~दिल्ली के वसंत कुंज का पिनकोड ११००७० है -five six zero zero three eight बंगलौर के 
बैयापानहली का पिनकोड है~५६००३८ बंगलौर के बैयापानहली का पिनकोड है -मेरे क्रेडिट कार्ड के आखिरी डिजिट शून्य शून्य तीन आठ हैं~मेरे क्रेडिट कार्ड के आखिरी डिजिट ००३८ हैं -क्रेडिट कार्ड के आखिरी डिजिट four three seven two हैं~क्रेडिट कार्ड के आखिरी डिजिट ४३७२ हैं -दिल्ली के वसंत कुंज का पिनकोड one one zero zero seven zero है~दिल्ली के वसंत कुंज का पिनकोड ११००७० है -five six zero zero three eight बंगलौर के बैयापानहली का पिनकोड है~५६००३८ बंगलौर के बैयापानहली का पिनकोड है -मेरे क्रेडिट कार्ड के आखिरी डिजिट शून्य शून्य तीन आठ हैं~मेरे क्रेडिट कार्ड के आखिरी डिजिट ००३८ हैं -क्रेडिट कार्ड के आखिरी डिजिट four three seven two हैं~क्रेडिट कार्ड के आखिरी डिजिट ४३७२ हैं +एक एक एक एक एक एक~११११११ +पाँच शून्य शून्य शून्य एक दो~५०००१२ +एक दो तीन चार पाँच छह~१२३४५६ +चार शून्य शून्य शून्य एक शून्य~४०००१० +सात पाँच शून्य शून्य शून्य दो~७५०००२ +आठ आठ शून्य नौ नौ शून्य~८८०९९० +नौ आठ सात छह पाँच चार तीन दो एक शून्य~९८७६५४३२१० +सात शून्य एक दो तीन चार पाँच छह सात आठ~७०१२३४५६७८ +आठ आठ आठ सात सात सात छह छह छह छह~८८८७७७६६६६ +छह दो नौ शून्य एक पाँच सात तीन चार आठ~६२९०१५७३४८ +नौ नौ आठ आठ सात सात छह छह पाँच पाँच~९९८८७७६६५५ +प्लस नौ एक नौ आठ सात छह पाँच चार तीन दो एक शून्य~+९१ ९८७६५४३२१० +प्लस नौ एक सात शून्य एक दो तीन चार पाँच छह सात आठ~+९१ ७०१२३४५६७८ +प्लस नौ एक आठ आठ आठ सात सात सात छह छह छह छह~+९१ ८८८७७७६६६६ +प्लस नौ एक एक एक एक एक एक एक एक एक एक एक~+९१ ११११११११११ +शून्य दो शून्य दो चार तीन सात एक पाँच चार दो~०२०२४३७१५४२ +शून्य एक एक दो छह एक दो तीन चार पाँच छह~०११२६१२३४५६ +चार चार दो दो आठ आठ छह छह चार चार~४४२२८८६६४४ +शून्य आठ शून्य चार एक दो तीन चार पाँच छह सात~०८०४१२३४५६७ +दो दो छह छह पांच चार तीन दो एक शून्य~२२६६५४३२१० +zero one three three six two three four five six seven~०१३३६२३४५६७ +zero one three four two three two one five four eight~०१३४२३२१५४८ +एक दो तीन चार~१२३४ +पाँच शून्य शून्य नौ~५००९ +चार चार चार चार~४४४४ +सात आठ नौ एक~७८९१ +एक शून्य दो शून्य~१०२० +नौ आठ सात छह~९८७६ \ No newline at end of file diff --git a/tests/nemo_text_processing/utils.py 
b/tests/nemo_text_processing/utils.py index 5326784e9..468be7300 100644 --- a/tests/nemo_text_processing/utils.py +++ b/tests/nemo_text_processing/utils.py @@ -39,7 +39,7 @@ def parse_test_case_file(file_name: str): Prepares tests pairs for ITN and TN tests """ test_pairs = [] - with open(os.path.dirname(os.path.abspath(__file__)) + os.path.sep + file_name, 'r') as f: + with open(os.path.dirname(os.path.abspath(__file__)) + os.path.sep + file_name, 'r', encoding='utf-8') as f: for line in f: components = line.strip("\n").split("~") spoken = components[0] @@ -63,7 +63,7 @@ def get_test_cases_multiple(file_name: str = 'data_text_normalization/en/test_ca Prepares tests pairs for audio based TN tests """ test_pairs = [] - with open(os.path.dirname(os.path.abspath(__file__)) + os.path.sep + file_name, 'r') as f: + with open(os.path.dirname(os.path.abspath(__file__)) + os.path.sep + file_name, 'r', encoding='utf-8') as f: written = None normalized_options = [] for line in f: