Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -4,4 +4,5 @@ _site
.jekyll-cache
.jekyll-metadata
vendor
repomix-output.xml
repomix-output.xml
*.pyc
144 changes: 124 additions & 20 deletions src/core/check_name.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,9 @@
import re
import csv

# Matches one "word" token: a run of Hebrew letters (א-ת), ASCII letters,
# digits, apostrophes, double quotes, or the Hebrew geresh (׳).
WORD_PATTERN = re.compile(r"[א-תA-Za-z0-9'\"׳]+")


def artist_from_song(my_file):
"""
הפונקציה בודקת את שם האמן בשם הקובץ על סמך מסד נתונים ומאחסנת את שם האמן במשתנה.
Expand All @@ -14,15 +17,14 @@ def artist_from_song(my_file):
החזרות:
str: הערך המכיל את שם האמן מהקובץ.
"""

# קבל את שם הקובץ ללא הנתיב המלא
split_file = os.path.split(my_file)[1]
split_file = os.path.splitext(split_file)[0]

# הסר תווים לא רצויים משם הקובץ
split_file = re.sub(r'[_-]', ' ', split_file)


# ייבא את רשימת הזמרים מקובץ csv
if 'singer_list' not in globals():
csv_path = "singer-list.csv"
Expand All @@ -39,18 +41,13 @@ def artist_from_song(my_file):

# חזור על רשימת השמות ובדוק אם אחד מהם קיים בשם הקובץ
for source_name, target_name in singer_list:
if source_name in split_file:
artist = target_name

# בדיקת דיוק שם הקובץ
exact = check_exact_name(split_file, source_name)
if exact: return artist
# בדיקת דיוק/דמיון שם הקובץ
if check_exact_name(split_file, source_name):
return target_name

return




def check_exact_name(filename, artist_to_search):
    """
    Report whether an artist's name appears in a filename.

    The name is first searched for as a whole phrase: it must begin at the
    start of the text or right after a character that is not a Hebrew
    letter, optionally with the letter "ו" in front of it, and it must end
    on a word boundary. When that search finds nothing, the decision is
    delegated to ``_is_similar_name_match``.

    Args:
        filename (str): Text to search in; leading whitespace is stripped
            before searching.
        artist_to_search (str): Artist name to look for.

    Returns:
        bool: False when either argument is falsy (empty or None), True when
        the whole-phrase search succeeds, otherwise the result of
        ``_is_similar_name_match``.
    """
    if not (filename and artist_to_search):
        return False

    trimmed_name = filename.lstrip()

    # Escape the name so characters such as "." or "(" are matched literally.
    artist_literal = re.escape(artist_to_search)
    phrase_regex = fr'(^|[^א-ת])ו?{artist_literal}\b'

    if re.search(phrase_regex, trimmed_name) is None:
        # No whole-phrase hit: fall back to the similarity-based comparison.
        return _is_similar_name_match(trimmed_name, artist_to_search)
    return True

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P1 Badge Avoid fuzzy distance checks for every candidate name

Falling back to _is_similar_name_match on every non-exact check makes each singer-list scan run edit-distance logic for almost all rows, including obvious non-candidates. In practice this is a major runtime regression for no-hit inputs (local check against singer-list.csv: ~28.5s for 200 lookups vs ~0.013s previously), so large sorting runs can slow dramatically when filenames/metadata do not match known artists.

Useful? React with 👍 / 👎.



def _tokenize_words(text):
    """Return every ``WORD_PATTERN`` match found in ``text``, in order, as a list of strings."""
    return [match.group() for match in WORD_PATTERN.finditer(text)]


def _max_allowed_word_distance(word_length):
if word_length <= 4:
return 0
if word_length <= 8:
return 1
if word_length <= 12:
return 2
return 3


def _max_allowed_phrase_distance(total_letters):
if total_letters <= 8:
return 1
if total_letters <= 15:
return 2
if total_letters <= 24:
return 3
return 4


def _is_prefix_or_suffix_expansion(first_word, second_word):
if abs(len(first_word) - len(second_word)) != 1:
return False

longer_word, shorter_word = (
(first_word, second_word)
if len(first_word) > len(second_word)
else (second_word, first_word)
)

# מניעת false positive כמו "אלי" מול "יואלי" או "מוטי" מול "למוטי".
return longer_word[1:] == shorter_word or longer_word[:-1] == shorter_word


def _levenshtein_distance(first_word, second_word, max_distance=None):
if first_word == second_word:
return 0

if len(first_word) < len(second_word):
first_word, second_word = second_word, first_word

previous_row = list(range(len(second_word) + 1))

for index_first, char_first in enumerate(first_word, start=1):
current_row = [index_first]
min_in_row = current_row[0]

for index_second, char_second in enumerate(second_word, start=1):
insertions = previous_row[index_second] + 1
deletions = current_row[index_second - 1] + 1
substitutions = previous_row[index_second - 1] + (char_first != char_second)
best_cost = min(insertions, deletions, substitutions)
current_row.append(best_cost)
if best_cost < min_in_row:
min_in_row = best_cost

if max_distance is not None and min_in_row > max_distance:
return max_distance + 1

previous_row = current_row

return previous_row[-1]


def _is_similar_name_match(filename, artist_to_search):
    """Check whether some run of consecutive filename words resembles the artist name.

    Both strings are split with ``_tokenize_words``. Every window of
    consecutive filename words that has as many words as the artist name is
    compared with it word by word. A window is accepted when, for every pair:

      * the two words are not a one-character prefix/suffix expansion of
        each other (``_is_prefix_or_suffix_expansion``),
      * their edit distance is within the per-word budget derived from the
        artist word's length, and
      * the running sum of distances stays within the phrase budget derived
        from the artist name's total letter count.

    Returns:
        bool: True as soon as one window is accepted; False when no window
        qualifies, when either string has no words, or when the filename has
        fewer words than the artist name.
    """
    file_tokens = _tokenize_words(filename)
    name_tokens = _tokenize_words(artist_to_search)
    if not (file_tokens and name_tokens):
        return False

    span = len(name_tokens)
    window_count = len(file_tokens) - span + 1
    if window_count < 1:
        return False

    phrase_budget = _max_allowed_phrase_distance(sum(map(len, name_tokens)))

    def window_is_close(offset):
        # Compare one window; bail out on the first pair that disqualifies it.
        spent = 0
        for seen, wanted in zip(file_tokens[offset:offset + span], name_tokens):
            if _is_prefix_or_suffix_expansion(seen, wanted):
                return False

            word_budget = _max_allowed_word_distance(len(wanted))
            cost = _levenshtein_distance(seen, wanted, word_budget)
            if cost > word_budget:
                return False

            spent += cost
            if spent > phrase_budget:
                return False
        return True

    # any() stops at the first accepted window, scanning left to right.
    return any(window_is_close(offset) for offset in range(window_count))
Comment on lines +168 to +184

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

The use of the is_valid_window flag can be simplified by using a for...else block. The else block of a for loop is executed when the loop completes without encountering a break statement. This makes the code more concise and Pythonic.

Suggested change
total_distance = 0
is_valid_window = True
for candidate_word, artist_word in zip(candidate_words, artist_words):
if _is_prefix_or_suffix_expansion(candidate_word, artist_word):
is_valid_window = False
break
max_word_distance = _max_allowed_word_distance(len(artist_word))
word_distance = _levenshtein_distance(candidate_word, artist_word, max_word_distance)
if word_distance > max_word_distance:
is_valid_window = False
break
total_distance += word_distance
if total_distance > max_phrase_distance:
is_valid_window = False
break
if is_valid_window:
return True
total_distance = 0
for candidate_word, artist_word in zip(candidate_words, artist_words):
if _is_prefix_or_suffix_expansion(candidate_word, artist_word):
break
max_word_distance = _max_allowed_word_distance(len(artist_word))
word_distance = _levenshtein_distance(candidate_word, artist_word, max_word_distance)
if word_distance > max_word_distance:
break
total_distance += word_distance
if total_distance > max_phrase_distance:
break
else:
return True


return False





if __name__ == '__main__':
    # Manual smoke check: print the artist detected for a few sample names.
    sample_files = [
        'ח בני פרידמן, מוטי שטיינמ.mp3',
        '@יואלי קליין=.mp3',
        'ואברהם פריד.mp3',
        'שיר נוסף - מוטי שטיינמץל מ.mp3',
    ]

    for sample_file in sample_files:
        print(artist_from_song(sample_file))
36 changes: 16 additions & 20 deletions src/core/singles_sorter_v5.py
Original file line number Diff line number Diff line change
Expand Up @@ -462,11 +462,10 @@ def handle_album_transfer(self, album_path, album_name, artist_name):
# Determine artist name from the singer list
determined_artist_name = None
for source_name, target_name in self.singer_list:
if source_name in artist_name:
exact = check_exact_name(artist_name, source_name)
if exact:
determined_artist_name = target_name
break
exact = check_exact_name(artist_name, source_name)
if exact:
determined_artist_name = target_name
break

# Use the determined artist name or the original if not found in the list
final_artist_name = determined_artist_name if determined_artist_name else artist_name
Expand Down Expand Up @@ -726,11 +725,10 @@ def artists_from_song(self, my_file):

# שלב ראשון: בדיקת שם הקובץ באמצעות רשימת הזמרים
for source_name, target_name in self.singer_list:
if source_name in split_file:
exact = check_exact_name(split_file, source_name)
if exact:
found_artists.append(target_name)
break # מצאנו אמן, אין צורך להמשיך
exact = check_exact_name(split_file, source_name)
if exact:
Comment on lines 727 to +729

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P1 Badge Run exact singer matching before fuzzy matching

This loop now breaks on the first check_exact_name hit, but check_exact_name includes fuzzy Levenshtein matching, so an earlier near-duplicate in self.singer_list can win before the true exact artist entry is reached. With the current src/core/app/singer-list.csv, an input like אליקם בוטה is matched to the earlier אליקים בוטה row, which routes files to the wrong artist folder even when the exact name exists later in the list.

Useful? React with 👍 / 👎.

found_artists.append(target_name)
break # מצאנו אמן, אין צורך להמשיך

try:
metadata_file = load_file(my_file)
Expand All @@ -756,11 +754,10 @@ def artists_from_song(self, my_file):
artist = fix_jibrish(artist, "heb")
# בדיקת אם האמן נמצא ברשימת הזמרים
for source_name, target_name in self.singer_list:
if source_name in artist:
exact = check_exact_name(artist, source_name)
if exact:
found_artists.append(target_name)
break
exact = check_exact_name(artist, source_name)
if exact:
found_artists.append(target_name)
break

if not found_artists and self.check_artist(artist):
# אם האמן לא נמצא ברשימה, וב-AIModels זמין
Expand All @@ -780,11 +777,10 @@ def artists_from_song(self, my_file):
title = sanitized_title
title = fix_jibrish(title, "heb")
for source_name, target_name in self.singer_list:
if source_name in title:
exact = check_exact_name(title, source_name)
if exact:
found_artists.append(target_name)
break
exact = check_exact_name(title, source_name)
if exact:
found_artists.append(target_name)
break

if not found_artists and not ai_invalid:
# שלב רביעי: שימוש ב-NER על שם הקובץ
Expand Down
63 changes: 63 additions & 0 deletions src/tests/test_check_name_similarity.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
import sys
import unittest
from pathlib import Path

sys.path.append(str(Path(__file__).resolve().parents[1] / "core"))

from check_name import check_exact_name


class TestCheckNameSimilarity(unittest.TestCase):
    """Tests for ``check_exact_name``: exact hits and tolerated near-misses."""

    def _expect_match(self, text, artist):
        # Helper: the artist must be detected in the text.
        self.assertTrue(check_exact_name(text, artist))

    def _expect_no_match(self, text, artist):
        # Helper: the artist must NOT be detected in the text.
        self.assertFalse(check_exact_name(text, artist))

    def test_exact_match_still_supported(self):
        self._expect_match("מוטי שטיינמץ - שיר חדש", "מוטי שטיינמץ")

    def test_vav_prefix_still_supported(self):
        self._expect_match("ועם ומוטי שטיינמץ", "מוטי שטיינמץ")

    def test_one_letter_difference_in_word_is_supported(self):
        # The same one-letter spelling variation, checked in both directions.
        cases = (
            ("שיר חדש של ויסמנדל", "וייסמנדל"),
            ("ביצוע של וייסמנדל", "ויסמנדל"),
        )
        for text, artist in cases:
            self._expect_match(text, artist)

    def test_short_name_does_not_match_with_one_letter_change(self):
        self._expect_no_match("אלה קליין - הופעה", "אלי קליין")

    def test_long_name_supports_two_differences(self):
        artist = "אברהם מרדכי שוורץ"
        # The last word ends with "צ" instead of "ץ".
        self._expect_match("דואט עם אברהם מרדכי שוורצ", artist)
        # Two differences spread across a long name.
        self._expect_match("דואט עם אברהם מורדכי שוורז", artist)

    def test_long_name_rejects_when_difference_too_large(self):
        self._expect_no_match(
            "דואט עם אברם מורדכע שוורזז",
            "אברהם מרדכי שוורץ",
        )

    def test_prevent_false_positive_for_prefix_forms(self):
        self._expect_no_match("שיר נוסף - למוטי שטיינמץ", "מוטי שטיינמץ")

    def test_prevent_false_positive_for_suffix_forms(self):
        self._expect_no_match("שיר נוסף - מוטי שטיינמץל", "מוטי שטיינמץ")

    def test_prevent_false_positive_for_distinct_names(self):
        self._expect_no_match("יואלי קליין - הופעה", "אלי קליין")

    def test_multi_word_window_matching(self):
        self._expect_match("ביצוע חי - אלי קלינן", "אלי קליין")

    def test_window_requires_same_word_count(self):
        self._expect_no_match("הופעה של מרדכי", "אברהם מרדכי שוורץ")


# Allow running this test module directly as a script.
if __name__ == "__main__":
    unittest.main()