Exhibit 25.31 demonstrates the Python implementation of Named Entity Recognition (NER) using Stanford NER to identify and classify entities into three categories — organization, person and location. The entities are sourced from tweets associated with the English Premier League.
The process involves the following steps:
Load the tweets from the file tweets-statuses.json.
    Note the ambiguity arising from words such as Manchester and Madrid, which fall under multiple categories — location (Manchester, Madrid) and organization (Manchester United, Real Madrid). Manual cleaning is required in such cases to ensure that entities are correctly tagged.
import os
# Full path to the Java executable, exported through the JAVAHOME environment
# variable for the Stanford NER tagger configured later in this script.
java_path = "C:/Program Files/Java/jdk-16.0.1/bin/java.exe"
os.environ["JAVAHOME"] = java_path
import urllib.request
import zipfile
from nltk.tag.stanford import StanfordNERTagger
# Direct paths into the Stanford NER distribution.
# english.all.3class.distsim is the three class classifier, used here to find
# three classes of named entities.
_ner_home = r'C:\Delphi\Web\Marketing-Analytics\py\data\stanford-ner-2015-04-20'
_model_filename = _ner_home + r'\classifiers\english.all.3class.distsim.crf.ser.gz'
_path_to_jar = _ner_home + r'\stanford-ner.jar'
# Build NLTK's Stanford NER Tagger wrapper from the explicit model and .jar paths.
st = StanfordNERTagger(
    path_to_jar=_path_to_jar,
    model_filename=_model_filename,
)
# Collects the (word, tag) tuples of every named entity found in the tweets.
entities = []

# Read the tweets into a DataFrame and preview the first five rows.
import pandas as pd
df = pd.read_json('data/tweets-statuses.json')
df.head(5)
# Tag the named entities in every tweet and store only the tokens assigned to
# one of the three classes: organization, person, location.
#
# All tweets are handed to the tagger in a single tag_sents() call. Each
# st.tag() call launches its own Java process, so tagging tweet by tweet pays
# the JVM start-up cost once per tweet.
tokenized_tweets = [tweet.split() for tweet in df['text']]  # split on whitespace
tagged_tweets = st.tag_sents(tokenized_tweets)  # one list of tuples per tweet

lst_tags = []  # stays empty (instead of undefined) when there are no tweets
for lst_tags in tagged_tweets:
    # Each element is a (word, tag) tuple; the tag 'O' (Outside) marks a word
    # that does not belong to any named entity class and is excluded.
    entities.extend(pair for pair in lst_tags if pair[1] != 'O')
lst_tags  # list of tags for the last tweet
[('Liverpool', 'ORGANIZATION'),
 ('are', 'O'),
 ('on', 'O'),
 ('the', 'O'),
 ('verge', 'O'),
 ('of', 'O'),
 ('selling', 'O'),
 ('Rhian', 'PERSON'),
 ('Brewster,', 'O'),
 ('a', 'O'),
 ('player', 'O'),
 ('who', 'O'),
 ('has', 'O'),
 ('yet', 'O'),
 ('to', 'O'),
 ('play', 'O'),
 ('a', 'O'),
 ('Premier', 'O'),
 ('League', 'O'),
 ('game,', 'O'),
 ('for', 'O'),
 ('£23.5…', 'O'),
 ('https:t.co6yuDSdxbjW', 'O')]
    
# Preview the first 20 extracted entities
entities[0:20]
[('Premier', 'ORGANIZATION'),
 ('League', 'ORGANIZATION'),
 ('La', 'ORGANIZATION'),
 ('Liga', 'ORGANIZATION'),
 ('Brewster', 'PERSON'),
 ('Swansea', 'LOCATION'),
 ('Forbes', 'PERSON'),
 ('Premier', 'ORGANIZATION'),
 ('League', 'ORGANIZATION'),
 ('Chennai', 'ORGANIZATION'),
 ('Super', 'ORGANIZATION'),
 ('Kings', 'ORGANIZATION'),
 ('Premier', 'ORGANIZATION'),
 ('League', 'ORGANIZATION'),
 ('Europa', 'ORGANIZATION'),
 ('League', 'ORGANIZATION'),
 ('Burnley', 'ORGANIZATION'),
 ('Southampton', 'ORGANIZATION'),
 ('Everton', 'ORGANIZATION'),
 ('@MrAncelotti', 'ORGANIZATION')]
    
# Load the entity tuples into the DataFrame df_entities with the columns
# "word" and "ner". The column names are passed to the constructor so that an
# empty entity list still yields an (empty) two-column frame; assigning
# df_entities.columns afterwards raises a length-mismatch ValueError when no
# entities were found.
df_entities = pd.DataFrame(entities, columns=["word", "ner"])
df_entities
from collections import Counter
# Counter is a class from the collections module that counts the frequency of
# elements in a collection, like a list. It returns a dictionary-like object
# where keys are the elements, and values are their counts.

# Filter df_entities to the rows whose NER tag is ORGANIZATION. The tag is
# compared for exact equality: str.contains() performs a regex substring
# search and would also match any other tag that merely contains this text.
organizations = df_entities[df_entities['ner'] == "ORGANIZATION"]
# Top 10 organizations: the ten most frequently mentioned organization words.
cnt = Counter(organizations['word'])
cnt.most_common(10)
[('League', 18),
 ('Premier', 15),
 ('Burnley', 6),
 ('Southampton', 5),
 ('Liverpool', 5),
 ('Tottenham', 4),
 ('United', 3),
 ('La', 2),
 ('Europa', 2),
 ('Sheffield', 2)]
    
# Filter df_entities to the rows whose NER tag is PERSON. The tag is compared
# for exact equality: str.contains() performs a regex substring search and
# would also match any other tag that merely contains this text.
people = df_entities[df_entities['ner'] == "PERSON"]
# Top 10 persons: the ten most frequently mentioned person words.
cnt = Counter(people['word'])
cnt.most_common(10)
[('Brewster', 5),
 ('Harry', 4),
 ('Kane', 3),
 ('Saliba', 2),
 ('Thiago', 2),
 ('Rhian', 2),
 ('Jose', 2),
 ('Mourinho', 2),
 ('Carlos', 2),
 ('Vinicius', 2)]
    
# Filter df_entities to the rows whose NER tag is LOCATION. The tag is compared
# for exact equality: str.contains() performs a regex substring search and
# would also match any other tag that merely contains this text.
locations = df_entities[df_entities['ner'] == "LOCATION"]
# Top 5 locations: the five most frequently mentioned location words.
cnt = Counter(locations['word'])
cnt.most_common(5)
[('Swansea', 1), ('Accra', 1), ('Liverpool', 1), ('West', 1), ('Ham', 1)]
    
# Select the tweets whose text mentions 'Liverpool' and print that text column.
mentions_liverpool = df['text'].str.contains('Liverpool')
liverpool_tweets = df.loc[mentions_liverpool]
print(liverpool_tweets['text'])
16    Liverpool have had over 60 million combined fo...
18    Liverpool won the champions league and premier...
21    The extravagance of England's Premier League: ...
44    Really wanted this kid to make it at Liverpool...
69    This is a great deal. He’s never kicked a ball...
92    Liverpool career: Premier League appearances: ...
99    Liverpool are on the verge of selling Rhian Br...
Name: text, dtype: object
Use the Search Bar to find content on MarketingMind.
Contact | Privacy Statement | Disclaimer: Opinions and views expressed on www.ashokcharan.com are the author’s personal views, and do not represent the official views of the National University of Singapore (NUS) or the NUS Business School | © Copyright 2013-2025 www.ashokcharan.com. All Rights Reserved.
 
        
         
                     
                     
                     
                    