Create the model (TL;DR version)

We start by loading the required packages:

import pandas as pd
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
from umap import UMAP
from sklearn.feature_extraction.text import CountVectorizer
from bertopic.representation import KeyBERTInspired, MaximalMarginalRelevance, PartOfSpeech
import spacy
from hdbscan import HDBSCAN

Step 1: Load the data

data = pd.read_csv("entr_eco_publications.csv", sep = ";")
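Since the file uses ";" as a delimiter, it is worth a quick check that pandas split the columns correctly (a ";"-separated file read with the default "," separator ends up as a single column). This check is optional:

# Check that the columns were parsed correctly
print(data.shape)
print(data.columns)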

The input data for BERTopic needs to have a specific format: it needs to be a list of strings, one string per document. We therefore transform our dataframe into a list. In addition, we only need the column that contains the text, in our case the "abstract" column.

# We first select only the column with the text:
data = data['abstract']
# We transform the resulting Series into a list of strings:
data = data.astype(str).tolist()
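Before fitting anything, a quick sanity check that we indeed have a list of strings:

# Verify the type and the number of documents, and peek at the first abstract
print(type(data), len(data))
print(data[0][:200])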

Step 2: Build the topic model

# Step 2a: create the embedding model
embed_model = SentenceTransformer("all-MiniLM-L6-v2")
# Step 2b: create the dimensionality-reduction model (random_state fixes the seed for reproducible results)
umap_model = UMAP(n_neighbors = 30, n_components = 5, min_dist = 0.0, metric = "cosine", random_state = 42)
# Step 2c: create the clustering model (min_cluster_size is the smallest number of documents a topic may contain)
hdbscan_model = HDBSCAN(min_cluster_size = 7, metric = "euclidean", cluster_selection_method = 'eom', prediction_data = True)
# Step 2d: create the vectorizer and representation models
vect_model = CountVectorizer(stop_words = "english", min_df = 2, ngram_range = (1,1))
keybert_model = KeyBERTInspired()
mmr_model = MaximalMarginalRelevance(diversity = 0.3)
# pos_model = PartOfSpeech("en_core_web_sm")  # optional: requires spaCy's en_core_web_sm model
representation_model = {
    "keyBERT": keybert_model,
    "MMR": mmr_model,
    # "POS": pos_model
}

Now we combine all these models into one, which will be our topic model:

# combine all the models into one topic model
topic_model = BERTopic(
    embedding_model = embed_model,
    umap_model = umap_model,
    hdbscan_model = hdbscan_model,
    vectorizer_model = vect_model,
    representation_model = representation_model,
    top_n_words = 20, # how many words to include in each topic representation
    verbose = True # this option prints progress information while the model runs
)
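A practical side note before running it: the embedding step dominates the runtime. If you expect to re-fit the model several times (for instance while tuning the UMAP or HDBSCAN parameters), you can compute the embeddings once and pass them to fit_transform, which BERTopic accepts as an optional argument. A minimal sketch, reusing the embed_model and data from above:

# Compute the document embeddings once (the slowest step)...
embeddings = embed_model.encode(data, show_progress_bar = True)
# ...and reuse them every time the topic model is re-fitted
topics, probs = topic_model.fit_transform(data, embeddings)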

Step 3: Run the topic model

We can now use this model to extract the topics from the data. We do this with the following command:

topics, probs = topic_model.fit_transform(data)
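Once fitting completes, topics holds the topic assigned to each document and probs the associated probabilities. A quick way to get a first look at the results is through BERTopic's built-in accessors (topic -1 collects the outlier documents that HDBSCAN did not assign to any cluster):

# Overview table: one row per topic, with its size and top words
print(topic_model.get_topic_info())
# The top words (with scores) for a single topic, e.g. topic 0
print(topic_model.get_topic(0))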