-
Notifications
You must be signed in to change notification settings - Fork 882
Closed
Description
Hi,
BERTopic version: 0.15.0
BERTopic parameters:
self.topic_model = BERTopic(
embedding_model=SentenceTransformer('paraphrase-multilingual-mpnet-base-v2')
umap_model= UMAP(
n_neighbors=15,
n_components=5,
min_dist=0,0,
metric='cosine'
random_state=42
)
hdbscan_model= HDBSCAN(
min_cluster_size=15,
metric='euclidean',
cluster_selection_method='eom',
prediction_data=True
)
vectorizer_model=CountVectorizer(stop_words=[***LIST OF STOPWORDS HERE***]),
ctfidf_model=ClassTfidfTransformer(reduce_frequent_words=True),
representation_model=BaseRepresentation,
language='multilingual',
nr_topics='auto',
top_n_words=20,
n_gram_range=(1, 3),
verbose=True
)
Problem description:
I have a corpus of 262759 texts. I search for topics using:
self.topics, self.probs = self.topic_model.fit_transform(self.corpus)
where self.corpus is the corpus. Then I reduce outliers:
new_topics = self.topic_model.reduce_outliers(self.corpus, self.topics, strategy=self.outliers_reduction_strategy)
self.topic_model.update_topics(self.corpus, topics=new_topics)
The strategy is 'embeddings'.
After outlier reduction there was no topic -1 left, which seems strange to me; please let me know whether this is expected.
Then I try to reduce the number of topics to 30 (self.nr_topics_reduced=30) using:
self.topic_model.reduce_topics(self.corpus, nr_topics=self.nr_topics_reduced)
and I get the following error:
IndexError: index 988 is out of bounds for axis 0 with size 988
The traceback is below:
╭─────────────────────────────── Traceback (most recent call last) ────────────────────────────────╮
│ C:\Users\piotr\onedrive\PycharmProjects\tematy-bert-310\slupsk-ria-tass.py:159 in <module> │
│ │
│ 156 │
│ 157 │
│ 158 if __name__ == '__main__': │
│ ❱ 159 │ main() │
│ 160 │
│ │
│ C:\Users\piotr\onedrive\PycharmProjects\tematy-bert-310\slupsk-ria-tass.py:153 in main │
│ │
│ 150 │ │ │ │ │ │ │ f'{Fore.BLUE}REPRESENTATION:{Style.RESET_ALL} {repr_models}, │
│ 151 │ │ │ │ │ │ │ f'{Fore.BLUE}LEMMA/ORIG:{Style.RESET_ALL} {lemorig}') │
│ 152 │ │ │ │ stopwords = Filters.stopwords_ru + ['новости', 'новость', 'риа', 'тасс'] │
│ ❱ 153 │ │ │ │ make_topics(corp_by_src[src][lemorig], corp_by_src[src]['date'], stopwor │
│ 154 │ │ │ │ │ │ │ lemorig, 30) │
│ 155 │ logger.info(f'{Fore.GREEN}Quitting{Style.RESET_ALL}') │
│ 156 │
│ │
│ C:\Users\piotr\onedrive\PycharmProjects\tematy-bert-310\slupsk-ria-tass.py:93 in make_topics │
│ │
│ 90 │ logger.info(f'{Fore.GREEN}Reducing outliers{Style.RESET_ALL}') │
│ 91 │ my_topics.reduce_outliers(f'{filename}') │
│ 92 │ logger.info(f'{Fore.GREEN}Reducing topics to {nr_topics_reduced}{Style.RESET_ALL}') │
│ ❱ 93 │ my_topics.reduce_topics(f'{filename}') │
│ 94 │ # logger.info(f'Reduced topics hierarchy vis') │
│ 95 │ # my_topics.topic_model.visualize_hierarchy().write_image(f'{filename}_{repr_model}_ │
│ 96 │ logger.info(f'{Fore.GREEN}Making wordclouds{Style.RESET_ALL}') │
│ │
│ C:\Users\piotr\OneDrive\python\pgc\nlp\topics.py:270 in reduce_topics │
│ │
│ 267 │ │
│ 268 │ def reduce_topics(self, filename): │
│ 269 │ │ self.logger.info(f'{Fore.GREEN}Reducing topics to {self.nr_topics_reduced}{Style │
│ ❱ 270 │ │ self.topic_model.reduce_topics(self.corpus, nr_topics=self.nr_topics_reduced) │
│ 271 │ │ # self.generate_and_set_topic_labels() │
│ 272 │ │ self.save_model_and_topics(filename, f'topics_reduced_{self.nr_topics_reduced}') │
│ 273 │
│ │
│ C:\Users\piotr\OneDrive\PycharmProjects\tematy-bert-310\lib\site-packages\bertopic\_bertopic.py: │
│ 2000 in reduce_topics │
│ │
│ 1997 │ │ documents = pd.DataFrame({"Document": docs, "Topic": self.topics_, "Image": imag │
│ 1998 │ │ │
│ 1999 │ │ # Reduce number of topics │
│ ❱ 2000 │ │ documents = self._reduce_topics(documents) │
│ 2001 │ │ self._merged_topics = None │
│ 2002 │ │ self._save_representative_docs(documents) │
│ 2003 │ │ self.probabilities_ = self._map_probabilities(self.probabilities_) │
│ │
│ C:\Users\piotr\OneDrive\PycharmProjects\tematy-bert-310\lib\site-packages\bertopic\_bertopic.py: │
│ 3603 in _reduce_topics │
│ │
│ 3600 │ │ │
│ 3601 │ │ if isinstance(self.nr_topics, int): │
│ 3602 │ │ │ if self.nr_topics < initial_nr_topics: │
│ ❱ 3603 │ │ │ │ documents = self._reduce_to_n_topics(documents) │
│ 3604 │ │ elif isinstance(self.nr_topics, str): │
│ 3605 │ │ │ documents = self._auto_reduce_topics(documents) │
│ 3606 │ │ else: │
│ │
│ C:\Users\piotr\OneDrive\PycharmProjects\tematy-bert-310\lib\site-packages\bertopic\_bertopic.py: │
│ 3637 in _reduce_to_n_topics │
│ │
│ 3634 │ │ else: │
│ 3635 │ │ │ cluster = AgglomerativeClustering(self.nr_topics - self._outliers, affinity= │
│ 3636 │ │ cluster.fit(distance_matrix) │
│ ❱ 3637 │ │ new_topics = [cluster.labels_[topic] if topic != -1 else -1 for topic in topics] │
│ 3638 │ │ │
│ 3639 │ │ # Track mappings and sizes of topics for merging topic embeddings │
│ 3640 │ │ mapped_topics = {from_topic: to_topic for from_topic, to_topic in zip(topics, ne │
│ │
│ C:\Users\piotr\OneDrive\PycharmProjects\tematy-bert-310\lib\site-packages\bertopic\_bertopic.py: │
│ 3637 in <listcomp> │
│ │
│ 3634 │ │ else: │
│ 3635 │ │ │ cluster = AgglomerativeClustering(self.nr_topics - self._outliers, affinity= │
│ 3636 │ │ cluster.fit(distance_matrix) │
│ ❱ 3637 │ │ new_topics = [cluster.labels_[topic] if topic != -1 else -1 for topic in topics] │
│ 3638 │ │ │
│ 3639 │ │ # Track mappings and sizes of topics for merging topic embeddings │
│ 3640 │ │ mapped_topics = {from_topic: to_topic for from_topic, to_topic in zip(topics, ne │
╰──────────────────────────────────────────────────────────────────────────────────────────────────╯
IndexError: index 988 is out of bounds for axis 0 with size 988
What am I doing wrong? Should I reduce topics first and only then reduce outliers?
Best regards,
Piotr.
Reactions are currently unavailable
Metadata
Metadata
Assignees
Labels
No labels