NHLOCAL · NHLOCAL · Feb 15, 2026 · Feb 15, 2026 · Feb 15, 2026 · chatgpt-codex-connector
diff --git a/.gitignore b/.gitignore
@@ -4,4 +4,5 @@ _site
 .jekyll-cache
 .jekyll-metadata
 vendor
-repomix-output.xml
+repomix-output.xml
+*.pyc
diff --git a/src/core/check_name.py b/src/core/check_name.py
@@ -3,6 +3,9 @@
 import re
 import csv
 
+WORD_PATTERN = re.compile(r"[א-תA-Za-z0-9'\"׳]+")
+
+
 def artist_from_song(my_file):
     """
     הפונקציה בודקת את שם האמן בשם הקובץ על סמך מסד נתונים ומאחסנת את שם האמן במשתנה.
@@ -14,15 +17,14 @@ def artist_from_song(my_file):
         החזרות:
             str: הערך המכיל את שם האמן מהקובץ.
     """
-      
+
     # קבל את שם הקובץ ללא הנתיב המלא
     split_file = os.path.split(my_file)[1]
     split_file = os.path.splitext(split_file)[0]
 
     # הסר תווים לא רצויים משם הקובץ
     split_file = re.sub(r'[_-]', ' ', split_file)
 
-
     # ייבא את רשימת הזמרים מקובץ csv
     if 'singer_list' not in globals():
         csv_path = "singer-list.csv"
@@ -39,18 +41,13 @@ def artist_from_song(my_file):
 
     # חזור על רשימת השמות ובדוק אם אחד מהם קיים בשם הקובץ
     for source_name, target_name in singer_list:
-        if source_name in split_file:
-            artist = target_name
-
-            # בדיקת דיוק שם הקובץ
-            exact = check_exact_name(split_file, source_name)
-            if exact: return artist
+        # בדיקת דיוק/דמיון שם הקובץ
+        if check_exact_name(split_file, source_name):
+            return target_name
 
     return
 
 
-
-
 def check_exact_name(filename, artist_to_search):
     """
     Check if the artist's name appears exactly in the filename, even if preceded by "ו".
@@ -62,29 +59,136 @@ def check_exact_name(filename, artist_to_search):
     Returns:
     bool: True if the artist's name is found exactly in the filename (even if preceded by "ו"), False otherwise.
     """
-
+
+    if not filename or not artist_to_search:
+        return False
+
     # Remove leading spaces in the filename
     filename = filename.lstrip()
-    
+
     # Escape special characters in the artist's name
     escaped_artist = re.escape(artist_to_search)
-    
+
     # Define a pattern to match the exact artist name, even if preceded by "ו"
     exact_match_pattern = fr'(^|[^א-ת])ו?{escaped_artist}\b'
 
     # Search for the exact artist name in the filename
     if re.search(exact_match_pattern, filename):
         return True
 
+    return _is_similar_name_match(filename, artist_to_search)
+
+
+def _tokenize_words(text):
+    """Return every run of word characters found in *text*, in order.
+
+    A "word" is whatever ``WORD_PATTERN`` matches; any other character acts
+    as a separator and is dropped. An input with no matches yields ``[]``.
+    """
+    return [match.group(0) for match in WORD_PATTERN.finditer(text)]
+
+
+def _max_allowed_word_distance(word_length):
+    """Return the edit-distance budget allowed for a single word.
+
+    The budget grows with the word's length: up to 4 letters allow no
+    difference, up to 8 allow 1, up to 12 allow 2, anything longer allows 3.
+    """
+    # (inclusive upper length bound, allowed distance), in ascending order.
+    length_tiers = ((4, 0), (8, 1), (12, 2))
+    for upper_bound, allowed_distance in length_tiers:
+        if word_length <= upper_bound:
+            return allowed_distance
+    return 3
+
+
+def _max_allowed_phrase_distance(total_letters):
+    """Return the edit-distance budget allowed for a whole multi-word name.
+
+    The budget starts at 1 and gains one more step each time the letter
+    count passes 8, 15 and 24, so the result is always between 1 and 4.
+    """
+    # Each comparison contributes 0 or 1, reproducing the tiered thresholds.
+    return 1 + (total_letters > 8) + (total_letters > 15) + (total_letters > 24)
+
+
+def _is_prefix_or_suffix_expansion(first_word, second_word):
+    """Tell whether one word is the other plus exactly one letter at an end.
+
+    Returns True only when the lengths differ by exactly one and the longer
+    word is the shorter word with a single extra letter at its start or at
+    its end. The argument order does not matter.
+
+    The caller uses this to reject pairs such as "מוטי" / "למוטי", which an
+    edit-distance check alone would accept as a one-letter difference.
+    """
+    # Sort by length so the check below is independent of argument order.
+    shorter_word, longer_word = sorted((first_word, second_word), key=len)
+
+    if len(longer_word) - len(shorter_word) != 1:
+        return False
+
+    # With a length gap of exactly one, "starts with" means one extra trailing
+    # letter and "ends with" means one extra leading letter.
+    return longer_word.startswith(shorter_word) or longer_word.endswith(shorter_word)
+
+
+def _levenshtein_distance(first_word, second_word, max_distance=None):
+    """Compute the Levenshtein edit distance between two strings.
+
+    Args:
+        first_word: One of the strings to compare.
+        second_word: The other string to compare.
+        max_distance: Optional bound. When given, the computation stops as
+            soon as every entry of the current row exceeds it and
+            ``max_distance + 1`` is returned instead of the exact distance.
+
+    Returns:
+        int: The edit distance, or ``max_distance + 1`` when the bounded
+        computation stopped early.
+    """
+    if first_word == second_word:
+        return 0
+
+    # Walk the longer string row by row so each row is as short as possible.
+    longer, shorter = first_word, second_word
+    if len(longer) < len(shorter):
+        longer, shorter = shorter, longer
+
+    bounded = max_distance is not None
+    prev_costs = list(range(len(shorter) + 1))
+
+    for row_number, longer_char in enumerate(longer, start=1):
+        costs = [row_number]
+
+        for column, shorter_char in enumerate(shorter, start=1):
+            mismatch = 0 if longer_char == shorter_char else 1
+            costs.append(
+                min(
+                    prev_costs[column] + 1,
+                    costs[column - 1] + 1,
+                    prev_costs[column - 1] + mismatch,
+                )
+            )
+
+        # Give up once no entry in this row is within the bound.
+        if bounded and min(costs) > max_distance:
+            return max_distance + 1
+
+        prev_costs = costs
+
+    return prev_costs[-1]
+
+
+def _is_similar_name_match(filename, artist_to_search):
+    """Tell whether some run of filename words approximately spells the artist.
+
+    Both inputs are split into words. A window as wide as the artist's word
+    count slides over the filename words; a window matches when every word
+    pair passes all of these checks:
+
+    * neither word is the other plus one leading or trailing letter;
+    * the pair's edit distance is within the per-word budget derived from
+      the artist word's length;
+    * the running sum of distances stays within the phrase budget derived
+      from the artist name's total letter count.
+
+    Args:
+        filename: Text to search in.
+        artist_to_search: Artist name to look for.
+
+    Returns:
+        bool: True if any window matches, False otherwise (including when
+        either input has no words or the filename has fewer words than the
+        artist name).
+    """
+    filename_words = _tokenize_words(filename)
+    artist_words = _tokenize_words(artist_to_search)
+
+    if not filename_words or not artist_words:
+        return False
+
+    artist_words_count = len(artist_words)
+    max_window_index = len(filename_words) - artist_words_count + 1
+
+    # The filename has fewer words than the artist name: no window fits.
+    if max_window_index <= 0:
+        return False
+
+    total_artist_letters = sum(len(word) for word in artist_words)
+    max_phrase_distance = _max_allowed_phrase_distance(total_artist_letters)
+
+    for start_index in range(max_window_index):
+        candidate_words = filename_words[start_index:start_index + artist_words_count]
+        total_distance = 0
+
+        for candidate_word, artist_word in zip(candidate_words, artist_words):
+            if _is_prefix_or_suffix_expansion(candidate_word, artist_word):
+                break
+
+            max_word_distance = _max_allowed_word_distance(len(artist_word))
+            word_distance = _levenshtein_distance(candidate_word, artist_word, max_word_distance)
+
+            if word_distance > max_word_distance:
+                break
+
+            total_distance += word_distance
+            if total_distance > max_phrase_distance:
+                break
+        else:
+            # Reached only when no pair broke out, i.e. the whole window passed.
+            return True
+
     return False
 
-
-
-
+
 if __name__ == '__main__':
 
-
-    list_ = ['ח בני פרידמן, מוטי שטיינמ.mp3', '@יואלי קליין=.mp3', 'ואברהם פריד.mp3', 'שיר נוסף - מוטי שטיינמץל מ.mp3'] 
-
+    list_ = ['ח בני פרידמן, מוטי שטיינמ.mp3', '@יואלי קליין=.mp3', 'ואברהם פריד.mp3', 'שיר נוסף - מוטי שטיינמץל מ.mp3']
+
     for i in list_:
-        print(artist_from_song(i))
+        print(artist_from_song(i))
diff --git a/src/core/singles_sorter_v5.py b/src/core/singles_sorter_v5.py
@@ -462,11 +462,10 @@ def handle_album_transfer(self, album_path, album_name, artist_name):
             # Determine artist name from the singer list
             determined_artist_name = None
             for source_name, target_name in self.singer_list:
-                if source_name in artist_name:
-                    exact = check_exact_name(artist_name, source_name)
-                    if exact:
-                        determined_artist_name = target_name
-                        break
+                exact = check_exact_name(artist_name, source_name)
+                if exact:
+                    determined_artist_name = target_name
+                    break
 
             # Use the determined artist name or the original if not found in the list
             final_artist_name = determined_artist_name if determined_artist_name else artist_name
@@ -726,11 +725,10 @@ def artists_from_song(self, my_file):
 
         # שלב ראשון: בדיקת שם הקובץ באמצעות רשימת הזמרים
         for source_name, target_name in self.singer_list:
-            if source_name in split_file:
-                exact = check_exact_name(split_file, source_name)
-                if exact:
-                    found_artists.append(target_name)
-                    break  # מצאנו אמן, אין צורך להמשיך
+            exact = check_exact_name(split_file, source_name)
+            if exact:
+                found_artists.append(target_name)
+                break  # מצאנו אמן, אין צורך להמשיך
 
         try:
             metadata_file = load_file(my_file)
@@ -756,11 +754,10 @@ def artists_from_song(self, my_file):
                 artist = fix_jibrish(artist, "heb")
                 # בדיקת אם האמן נמצא ברשימת הזמרים
                 for source_name, target_name in self.singer_list:
-                    if source_name in artist:
-                        exact = check_exact_name(artist, source_name)
-                        if exact:
-                            found_artists.append(target_name)
-                            break
+                    exact = check_exact_name(artist, source_name)
+                    if exact:
+                        found_artists.append(target_name)
+                        break
 
                 if not found_artists and self.check_artist(artist):
                     # אם האמן לא נמצא ברשימה, וב-AIModels זמין
@@ -780,11 +777,10 @@ def artists_from_song(self, my_file):
                 title = sanitized_title
                 title = fix_jibrish(title, "heb")
                 for source_name, target_name in self.singer_list:
-                    if source_name in title:
-                        exact = check_exact_name(title, source_name)
-                        if exact:
-                            found_artists.append(target_name)
-                            break
+                    exact = check_exact_name(title, source_name)
+                    if exact:
+                        found_artists.append(target_name)
+                        break
 
         if not found_artists and not ai_invalid:
             # שלב רביעי: שימוש ב-NER על שם הקובץ

diff --git a/src/tests/test_check_name_similarity.py b/src/tests/test_check_name_similarity.py
@@ -0,0 +1,63 @@
+import sys
+import unittest
+from pathlib import Path
+
+sys.path.append(str(Path(__file__).resolve().parents[1] / "core"))
+
+from check_name import check_exact_name
+
+
+class TestCheckNameSimilarity(unittest.TestCase):
+    """Exercise ``check_exact_name`` on exact and near-miss artist spellings."""
+
+    # The artist name appears verbatim at the start of the text.
+    def test_exact_match_still_supported(self):
+        self.assertTrue(check_exact_name("מוטי שטיינמץ - שיר חדש", "מוטי שטיינמץ"))
+
+    # The artist name appears with the letter "ו" attached in front of it.
+    def test_vav_prefix_still_supported(self):
+        self.assertTrue(check_exact_name("ועם ומוטי שטיינמץ", "מוטי שטיינמץ"))
+
+    # The text and the artist spelling differ by one letter, in either direction.
+    def test_one_letter_difference_in_word_is_supported(self):
+        self.assertTrue(check_exact_name("שיר חדש של ויסמנדל", "וייסמנדל"))
+        self.assertTrue(check_exact_name("ביצוע של וייסמנדל", "ויסמנדל"))
+
+    # A one-letter change inside a three-letter first name must not match.
+    def test_short_name_does_not_match_with_one_letter_change(self):
+        self.assertFalse(check_exact_name("אלה קליין - הופעה", "אלי קליין"))
+
+    # A three-word name still matches with one or two small spelling differences.
+    def test_long_name_supports_two_differences(self):
+        self.assertTrue(
+            check_exact_name(
+                "דואט עם אברהם מרדכי שוורצ",  # last word differs from the artist's spelling in its final letter
+                "אברהם מרדכי שוורץ",
+            )
+        )
+        self.assertTrue(
+            check_exact_name(
+                "דואט עם אברהם מורדכי שוורז",  # two differences spread across the long name
+                "אברהם מרדכי שוורץ",
+            )
+        )
+
+    # Differences in every word of the name must not match.
+    def test_long_name_rejects_when_difference_too_large(self):
+        self.assertFalse(
+            check_exact_name(
+                "דואט עם אברם מורדכע שוורזז",
+                "אברהם מרדכי שוורץ",
+            )
+        )
+
+    # One extra letter attached in front of the first name must not match.
+    def test_prevent_false_positive_for_prefix_forms(self):
+        self.assertFalse(check_exact_name("שיר נוסף - למוטי שטיינמץ", "מוטי שטיינמץ"))
+
+    # One extra letter attached after the last name must not match.
+    def test_prevent_false_positive_for_suffix_forms(self):
+        self.assertFalse(check_exact_name("שיר נוסף - מוטי שטיינמץל", "מוטי שטיינמץ"))
+
+    # A different first name that merely ends with the artist's first name must not match.
+    def test_prevent_false_positive_for_distinct_names(self):
+        self.assertFalse(check_exact_name("יואלי קליין - הופעה", "אלי קליין"))
+
+    # The artist's words appear later in the text, with the last word spelled slightly differently.
+    def test_multi_word_window_matching(self):
+        self.assertTrue(check_exact_name("ביצוע חי - אלי קלינן", "אלי קליין"))
+
+    # Only one word of a three-word artist name is present, so it must not match.
+    def test_window_requires_same_word_count(self):
+        self.assertFalse(check_exact_name("הופעה של מרדכי", "אברהם מרדכי שוורץ"))
+
+
+# Run the test suite when this file is executed directly.
+if __name__ == "__main__":
+    unittest.main()