Skip to content

enhancement: improve English text normalization in english.py #53

@Ishita-190

Description

@Ishita-190

Description

In english.py, the function text_normalize() currently contains a TODO (# todo: eng text normalize) and performs only minimal punctuation replacement.

Current implementation

def text_normalize(text):
    """Normalize English text before grapheme-to-phoneme conversion.

    Steps, in order: map CJK full-width / look-alike punctuation to ASCII,
    run ``normalize_numbers``, strip combining accents, drop every character
    outside letters, space and ``' . , ? ! -``, expand "i.e." / "e.g.", and
    finally pass the text through ``replace_consecutive_punctuation``.

    Args:
        text: Input string to normalize.

    Returns:
        The normalized string.
    """
    # TODO: English text normalization is still minimal.
    # Adapt Chinese (full-width) punctuation and g2p_en punctuation.
    # Full-width characters are written as escapes so they cannot be silently
    # flattened to ASCII; a bare "?" key would be an invalid regex
    # ("nothing to repeat") and make every call raise re.error.
    rep_map = {
        "[;:,\uff1b\uff1a\uff0c]": ",",  # ; : , and their full-width forms
        '["\u2019]': "'",                # double quote / right single quote
        "\u3002": ".",                   # ideographic full stop
        "\uff01": "!",                   # full-width exclamation mark
        "\uff1f": "?",                   # full-width question mark
    }
    for pattern, replacement in rep_map.items():
        text = re.sub(pattern, replacement, text)

    # Text formatting taken from g2p_en, extended to keep uppercase letters.
    # NOTE(review): `unicode` is not a Python 3 builtin; it is presumably
    # aliased to `str` at module level — confirm against the file's imports.
    text = unicode(text)
    text = normalize_numbers(text)
    text = "".join(
        char
        for char in unicodedata.normalize("NFD", text)
        if unicodedata.category(char) != "Mn"
    )  # Strip accents
    # Raw string: "\-" in a normal string literal is an invalid escape.
    text = re.sub(r"[^ A-Za-z'.,?!\-]", "", text)
    text = re.sub(r"(?i)i\.e\.", "that is", text)
    text = re.sub(r"(?i)e\.g\.", "for example", text)

    # Avoid reference leakage caused by repeated punctuation.
    text = replace_consecutive_punctuation(text)

    return text

Proposed implementation

def text_normalize(text):
    """Normalize English text before grapheme-to-phoneme conversion.

    Steps, in order:

    1. Map CJK full-width punctuation, smart quotes, dashes and the ellipsis
       character to plain ASCII punctuation.
    2. Spell out numbers via ``normalize_numbers`` (best effort).
    3. Strip combining accents.
    4. Turn every whitespace run into one space, then drop every character
       that is not a letter, digit, space, quote/apostrophe or one of
       ``. , ? ! -``.
    5. Expand "i.e." and "e.g." (case-insensitive).
    6. Collapse repeated punctuation via ``replace_consecutive_punctuation``,
       falling back to a simple regex if that helper fails.
    7. Trim the text and make sure it ends with ``.``, ``?`` or ``!``.

    Args:
        text: Input text; any falsy value yields ``""``. Other values are
            converted with ``str()``.

    Returns:
        The normalized string, or ``""`` when nothing remains.
    """
    import logging

    if not text:
        return ""

    text = str(text)
    logger = logging.getLogger(__name__)

    # Punctuation compatibility (smart quotes, dashes, ellipsis, CJK marks).
    # Non-ASCII characters are written as escapes so they cannot be silently
    # flattened to ASCII; a bare "?" key would be an invalid regex
    # ("nothing to repeat") and make every call raise re.error.
    rep_map = {
        "[;:,\uff1b\uff1a\uff0c]": ",",  # ; : , and their full-width forms
        "[\u2018\u2019`]": "'",
        "[\u201c\u201d\u00ab\u00bb\u201e\"]": '"',
        "\u2026": "...",
        "[\u2013\u2014]": "-",
        "\u3002": ".",                   # ideographic full stop
        "\uff01": "!",                   # full-width exclamation mark
        "\uff1f": "?",                   # full-width question mark
    }
    for pattern, replacement in rep_map.items():
        text = re.sub(pattern, replacement, text)

    # Text formatting taken from g2p_en, extended to keep uppercase letters.
    # Deliberately best effort: keep the digits if number expansion fails,
    # but report the failure instead of hiding it.
    try:
        text = normalize_numbers(text)
    except Exception:
        logger.warning("normalize_numbers failed; keeping digits", exc_info=True)

    # Strip accents: decompose, then drop the combining marks.
    text = "".join(
        ch for ch in unicodedata.normalize("NFD", text)
        if unicodedata.category(ch) != "Mn"
    )

    # Convert tabs/newlines to spaces BEFORE filtering; the filter below only
    # keeps the plain space and would otherwise glue adjacent words together.
    text = re.sub(r"\s+", " ", text)

    # Keep letters (both cases), digits, quotes/apostrophes, basic
    # punctuation and hyphen.
    text = re.sub(r"[^ 0-9A-Za-z'\".,?!\-]", "", text)

    # Expand abbreviations. Only a leading word boundary is used: a trailing
    # "\b" after the final period would require a word character to follow
    # it, so "i.e. foo" / "e.g., foo" would never match.
    text = re.sub(r"\bi\.e\.", "that is", text, flags=re.IGNORECASE)
    text = re.sub(r"\be\.g\.", "for example", text, flags=re.IGNORECASE)

    # Avoid reference leakage caused by repeated punctuation.
    try:
        text = replace_consecutive_punctuation(text)
    except Exception:
        logger.warning(
            "replace_consecutive_punctuation failed; using regex fallback",
            exc_info=True,
        )
        text = re.sub(r"([.,?!\-])\1+", r"\1", text)

    # Normalize whitespace.
    text = re.sub(r"\s+", " ", text).strip()

    # Ensure terminal punctuation for TTS stability. Trailing commas, hyphens
    # and spaces are dropped first so the result never ends in ",." or "-.".
    if text and text[-1] not in ".?!":
        text = text.rstrip(",- ") + "."

    return text

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions