install.packages("reticulate")
library(reticulate)
# for windows users, install Git first
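The huggingfaceR package itself is not on CRAN. Here is a minimal setup sketch, assuming you install it from the farach/huggingfaceR GitHub repository (which is why Windows users need Git) and use the package's hf_python_depends() helper to pull the Python dependencies:
# assumption: huggingfaceR is installed from GitHub, hence the Git requirement above
remotes::install_github("farach/huggingfaceR")
library(huggingfaceR)
# install the Python dependencies (transformers, etc.) into the reticulate environment
hf_python_depends()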
Named Entity Recognition
We will use pre-trained models to perform named entity recognition. These models are quite heavy and require Python/Spark/Torch/Hadoop infrastructure. In case these do not work, we will have a basic version (tagged but not trained) that will always work.
Now we can download the models and perform NER. Let's start by loading a BERT model trained for NER. We add the argument aggregation_strategy = "simple" to get an output that includes readable tokens and not a list of syllable-level tokens (cf. slides from the lecture). For NER, we will use the bert-large-NER model; if you have less patience or computation power, bert-base-NER is a faster option. hf_load_pipeline() will download the model directly and put it to use.
NER_extract <- huggingfaceR::hf_load_pipeline(model = "dslim/bert-large-NER", task = "ner", aggregation_strategy = "simple")
Let's get a sample text and try it out.
text <- c("The 2024 edition of The European 5G Conference will take place on 30-31 January at the Hotel nhow Brussels Bloom. Now, in its 8th year, the conference has an established reputation as Brussels’ leading meeting place for discussion on 5G policy. Registration is now available – secure your place today. The event will, once again, provide the opportunity to hear from high-level policymakers and industry stakeholders on key themes such as investment, security, sustainability, emerging business models, and connectivity. It will provide an update on progress that has been made towards the 2030 ‘Path to the Digital Decade’ targets, as well as offering a first opportunity to examine the outcomes from WRC-23 and at what this may mean for the future connectivity environment around 5G and future technologies. By looking back at the lessons learnt to date and forward to the path towards 5G Advanced and 6G, the event will provide a comprehensive insight into all the key policy aspects that are shaping the 5G ecosystem in Europe.")
extracted_NE <- NER_extract(text)
# transform output into something readable:
extracted_NE <- plyr::ldply(extracted_NE, data.frame)
extracted_NE
We can do the same with a different model that is capable of treating multiple languages. The basic structure is the same: we change the model we point to and specify the matching tokenizer:
multilanguage_NER <- huggingfaceR::hf_load_pipeline(model = "Babelscape/wikineural-multilingual-ner", tokenizer = "Babelscape/wikineural-multilingual-ner", task = "ner", aggregation_strategy = "simple")
test_multi <- multilanguage_NER(text)
test_multi <- plyr::ldply(test_multi, data.frame)
test_multi
And now, sentiment analysis
The workflow is the same; we just need to find the correct models:
sentiment_classifier <- huggingfaceR::hf_load_pipeline(model = "lxyuan/distilbert-base-multilingual-cased-sentiments-student", tokenizer = "lxyuan/distilbert-base-multilingual-cased-sentiments-student", top_k = 4)
sent <- sentiment_classifier(text)
sent <- plyr::ldply(sent, data.frame)
sent
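If you only want the single highest-scoring label rather than all of them, a quick base-R sketch (assuming sent has one row per label, with label and score columns as built above):
# keep only the row with the highest score, i.e. the dominant sentiment
sent[which.max(sent$score), ]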
We can also try another type of model, a classifier for subjective vs. neutral style:
subj_classifier <- huggingfaceR::hf_load_pipeline(model = "cffl/bert-base-styleclassification-subjective-neutral", tokenizer = "cffl/bert-base-styleclassification-subjective-neutral", task = "text-classification")
subj <- subj_classifier(text)
subj <- plyr::ldply(subj, data.frame)
subj
Apply this to our data
The previous sections have shown how to use NER and sentiment analysis on a single text. We now need to run this over a corpus, which means writing a function that automates the task for every document.
Start by importing the Lexis Uni data used in the previous tutorial. For this tutorial, this data is stored in the "LN_dataframe" object. If this takes too long to run, you can always change the NER system: for a faster alternative, use the "dslim/bert-base-NER" model loaded at the beginning.
load("LN_dataframe.rdata")
# the text we want to analyse is in the "Article" column
NER_function <- function(data, score_thresh){
  # apply() passes each row as a named character vector,
  # so we select the text by column name
  # perform NER on the "Article" column
  output <- multilanguage_NER(as.character(data["Article"]))
  # organise the result in a df
  output <- plyr::ldply(output, data.frame)
  # subset according to threshold
  output <- subset(output, output$score >= score_thresh)
  # this gives us a dataframe with all identified entities
  # we need to regroup them by type (ORG, PER, MISC, LOC)
  output_df <- data.frame("LOC" = paste(subset(output, output$entity_group == "LOC")$word, collapse = ";"),
                          "ORG" = paste(subset(output, output$entity_group == "ORG")$word, collapse = ";"),
                          "PER" = paste(subset(output, output$entity_group == "PER")$word, collapse = ";"),
                          "MISC" = paste(subset(output, output$entity_group == "MISC")$word, collapse = ";"))
  # return the output dataframe (technically not needed here)
  return(output_df)
} # closes function
NER_results <- apply(LN_dataframe, 1, NER_function, score_thresh = 0)
NER_results <- plyr::ldply(NER_results, data.frame)
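Since apply() keeps the row order, row i of NER_results corresponds to row i of LN_dataframe, so you can bind the entity columns back onto the original data (a minimal sketch; LN_dataframe_entities is just an illustrative name):
# assumes the row order was preserved, so the two dataframes line up
LN_dataframe_entities <- cbind(LN_dataframe, NER_results)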
# this can take a while so we save the results
save(NER_results, file = "NER_results.rdata")
If your computer is a bit slower (or you want to run this a little bit at a time), you can also use a loop:
# we start by adding the empty columns we want to fill in (mutate() comes from dplyr)
LN_dataframe_NER <- dplyr::mutate(LN_dataframe, "LOC" = NA, "ORG" = NA, "PER" = NA, "MISC" = NA)
for(i in 1:dim(LN_dataframe)[1]){
  # print progress every 50 articles
  if(i %% 50 == 0){print(i)}
  # perform NER on the "Article" column of row i
  output <- multilanguage_NER(as.character(LN_dataframe_NER$Article[i]))
  # organise the result in a df
  output <- plyr::ldply(output, data.frame)
  # subset according to threshold
  output <- subset(output, output$score >= 0)
  # this gives us a dataframe with all identified entities
  # we need to regroup them by type (ORG, PER, MISC, LOC)
  output_df <- data.frame("LOC" = paste(subset(output, output$entity_group == "LOC")$word, collapse = ";"),
                          "ORG" = paste(subset(output, output$entity_group == "ORG")$word, collapse = ";"),
                          "PER" = paste(subset(output, output$entity_group == "PER")$word, collapse = ";"),
                          "MISC" = paste(subset(output, output$entity_group == "MISC")$word, collapse = ";"))
  # fill in the four new columns for row i
  LN_dataframe_NER[i, c("LOC", "ORG", "PER", "MISC")] <- output_df
}
save(LN_dataframe_NER, file = "LN_dataframe_NER.rdata")
Once you have all the results in the dataframe, you can start the analysis with the tools you learned in previous tutorials.
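For example, a quick frequency count of the organisations mentioned across the corpus (a sketch, assuming the ORG column holds the ";"-separated entities built above):
# split the ";"-separated ORG strings back into individual entities
orgs <- unlist(strsplit(LN_dataframe_NER$ORG, split = ";"))
# tabulate and sort to see the ten most frequently mentioned organisations
head(sort(table(orgs), decreasing = TRUE), 10)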
Connecting Topics to the initial dataframe
In the previous tutorial we extracted topics; now let's connect the topics to the dataframe so that we know which document connects to which topic.
# topics() comes from the topicmodels package
most_likely_topics <- topics(topics_model) # if you want THE most likely topic
most_likely_topics <- topics(topics_model, k = 3) # if you want the 3 most likely
# organise this data a bit
# when requesting more than one topic, you need to transpose the df:
most_likely_topics <- t(most_likely_topics)
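To connect these topics back to the documents, you can bind them to the dataframe by position (a minimal sketch, assuming no documents were dropped when the document-term matrix was built, so the row orders match; LN_dataframe_topics is an illustrative name):
# one row per document after transposing, so we can bind by position
# (assumes no empty documents were removed when building the DTM)
LN_dataframe_topics <- cbind(LN_dataframe, most_likely_topics)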