# Named Entity Recognition
## Get top 10 Organisations – for tweets filtered on ‘premier league’

In [1]:
import os
java_path = "C:/Program Files/Java/jdk-23/bin/java.exe"  # check that this is the correct location for java
os.environ['JAVAHOME'] = java_path

In [3]:
import urllib.request
import zipfile

'''
Download and extract stanford-ner.
Move the downloaded Stanford NER classifier file to a desired directory. 
'''
# download and extract stanford-ner
# urllib.request.urlretrieve(r'https://nlp.stanford.edu/software/stanford-ner-2015-04-20.zip', 
#   r'C:\Delphi\Web\Marketing-Analytics\py\data\stanford-ner-2015-04-20.zip')
# zfile = zipfile.ZipFile(r'C:\Delphi\Web\Marketing-Analytics\py\data\stanford-ner-2015-04-20.zip')
# zfile.extractall(r'C:\Delphi\Web\Marketing-Analytics\py\data\stanford-ner')

from nltk.tag.stanford import StanfordNERTagger
# Set the direct path to the NER Tagger.
# Use english.all.3class.distsim (three class classifier) to find three classes of named entities.
_model_filename = r'C:\Delphi\Web\Marketing-Analytics\py\data\stanford-ner-2015-04-20\classifiers\english.all.3class.distsim.crf.ser.gz'
_path_to_jar = r'C:\Delphi\Web\Marketing-Analytics\py\data\stanford-ner-2015-04-20\stanford-ner.jar'

# Initialize the NLTK's Stanford NER Tagger API with the DIRECT PATH to the model and .jar file.
st = StanfordNERTagger(model_filename=_model_filename, path_to_jar=_path_to_jar)

# st = StanfordNERTagger('/english.all.3class.distsim.crf.ser.gz') 
# st.tag(sentence.split())


In [7]:
# read the tweets
import pandas as pd
df = pd.read_json('data/tweets-statuses.json')  
df[:5]

Unnamed: 0,created_at,id,id_str,text,truncated,entities,metadata,source,in_reply_to_status_id,in_reply_to_status_id_str,...,retweet_count,favorite_count,favorited,retweeted,lang,possibly_sensitive,quoted_status_id,quoted_status_id_str,quoted_status,extended_entities
0,2020-10-02 08:18:55+00:00,1311943733441683456,1311943733441683456,Such a misunderstanding that if you like footb...,True,"{'hashtags': [], 'symbols': [], 'user_mentions...","{'iso_language_code': 'en', 'result_type': 're...","<a href=""https://about.twitter.com/products/tw...",,,...,0,0,False,False,en,,,,,
1,2020-10-02 08:18:54+00:00,1311943729159299073,1311943729159299072,£23.5 million for Brewster with a buy back cla...,True,"{'hashtags': [], 'symbols': [], 'user_mentions...","{'iso_language_code': 'en', 'result_type': 're...","<a href=""http://twitter.com/download/iphone"" r...",,,...,0,0,False,False,en,,,,,
2,2020-10-02 08:18:51+00:00,1311943716802883584,1311943716802883584,Forbes have revealed the world’s most valuable...,False,"{'hashtags': [], 'symbols': [], 'user_mentions...","{'iso_language_code': 'en', 'result_type': 're...","<a href=""http://publicize.wp.com/"" rel=""nofoll...",,,...,0,0,False,False,en,1.0,,,,
3,2020-10-02 08:18:33+00:00,1311943642869829638,1311943642869829632,For each premier league prediction you get rig...,True,"{'hashtags': [], 'symbols': [], 'user_mentions...","{'iso_language_code': 'en', 'result_type': 're...","<a href=""http://twitter.com/download/android"" ...",,,...,0,0,False,False,en,0.0,,,,
4,2020-10-02 08:18:21+00:00,1311943589971124225,1311943589971124224,"Chennai Super Kings vs Sunrisers Hyderabad, 14...",True,"{'hashtags': [], 'symbols': [], 'user_mentions...","{'iso_language_code': 'en', 'result_type': 're...","<a href=""https://mobile.twitter.com"" rel=""nofo...",,,...,0,0,False,False,en,0.0,,,,


In [9]:
'''
Iterate through tweets (list of tweets), 
(1) tag the named entities, and 
(2) extract and store only entities related to three classes – organization, person, location.
    The ‘O’ tag in the Stanford NER Tagger stands for “Outside”. It is used to mark words that 
    do not belong to any named entity class in the Named Entity Recognition (NER) process.
'''
entities = []
for twt in df['text']:  # for each tweet 
    lst_tags = st.tag(twt.split())  # split the tweet by whitespace, into words and tag them (see o/p lst_tags)
    for tup in lst_tags:  # for each (tag, word) tuple in lst_tags
        if(tup[1] != 'O'):  # exclude 'O' (Outside - do not belong to named entity class)
            entities.append(tup)
print("tagging completed")
print("entity extraction completed")

tagging completed
entity extraction completed


In [23]:
df['text']

0       Such a misunderstanding that if you like football, you have to watch the Premier League or La Liga or the Serie A o… https://t.co/rHEMv6fXQy
1       £23.5 million for Brewster with a buy back clause and sell on fee. He did very well at Swansea but Premier League i… https://t.co/2MD9M7Ivnb
2                               Forbes have revealed the world’s most valuable football clubs – and Premier League dominates https://t.co/etuRxZ9WFY
3       For each premier league prediction you get right is how much days you stay in my bio. An extra day if you guess the… https://t.co/1bvvDNibCN
4           Chennai Super Kings vs Sunrisers Hyderabad, 14th Match\n\nIndian Premier League 2020\n\nJoin telegram channel -… https://t.co/Ui4RskJRmB
                                                                           ...                                                                      
95      Good price, helps pay for Jota and Thiago. The goal isn’t to develop talent anymore. The goal is t

In [12]:
lst_tags[:20] # list of tags for the last tweet

[('Liverpool', 'ORGANIZATION'),
 ('are', 'O'),
 ('on', 'O'),
 ('the', 'O'),
 ('verge', 'O'),
 ('of', 'O'),
 ('selling', 'O'),
 ('Rhian', 'PERSON'),
 ('Brewster,', 'O'),
 ('a', 'O'),
 ('player', 'O'),
 ('who', 'O'),
 ('has', 'O'),
 ('yet', 'O'),
 ('to', 'O'),
 ('play', 'O'),
 ('a', 'O'),
 ('Premier', 'O'),
 ('League', 'O'),
 ('game,', 'O')]

In [13]:
entities[:20]

[('Premier', 'ORGANIZATION'),
 ('League', 'ORGANIZATION'),
 ('La', 'ORGANIZATION'),
 ('Liga', 'ORGANIZATION'),
 ('Brewster', 'PERSON'),
 ('Swansea', 'LOCATION'),
 ('Forbes', 'PERSON'),
 ('Premier', 'ORGANIZATION'),
 ('League', 'ORGANIZATION'),
 ('Chennai', 'ORGANIZATION'),
 ('Super', 'ORGANIZATION'),
 ('Kings', 'ORGANIZATION'),
 ('Premier', 'ORGANIZATION'),
 ('League', 'ORGANIZATION'),
 ('Europa', 'ORGANIZATION'),
 ('League', 'ORGANIZATION'),
 ('Burnley', 'ORGANIZATION'),
 ('Southampton', 'ORGANIZATION'),
 ('Everton', 'ORGANIZATION'),
 ('@MrAncelotti', 'ORGANIZATION')]

In [15]:
# Load entity tuples to dataframe df_entities and name the columns “word” and “ner”
df_entities = pd.DataFrame(entities) 
df_entities.columns = ["word","ner"]
df_entities

Unnamed: 0,word,ner
0,Premier,ORGANIZATION
1,League,ORGANIZATION
2,La,ORGANIZATION
3,Liga,ORGANIZATION
4,Brewster,PERSON
...,...,...
186,Spurs,ORGANIZATION
187,Tottenham,ORGANIZATION
188,Hotspur,ORGANIZATION
189,Liverpool,ORGANIZATION


In [27]:
# Extract and count the most frequently mentioned organizations from df_entities

'''
Counter is a class from the collections module that counts the frequency of elements in a collection, 
like a list. It returns a dictionary-like object where keys are the elements, and values are their counts.
'''
from collections import Counter

# Filter df_entities to extract rows with NER = ORGANIZATION.
organizations = df_entities[df_entities['ner'].str.contains("ORGANIZATION")]

# top 10 Organisations: Get the top 10 most mentioned organizations.
cnt = Counter(organizations['word'])  # Creates a dictionary where keys are the words and values are their counts.

'''
Retrieves top 10 most common organizations (based on their frequency of appearance in the dataframe). 
Returns a list of tuples, where each tuple contains an organization and its count (e.g., ("Liverpool", 5)).
'''
cnt.most_common(10)  

[('League', 18),
 ('Premier', 15),
 ('Burnley', 6),
 ('Southampton', 5),
 ('Liverpool', 5),
 ('Tottenham', 4),
 ('United', 3),
 ('La', 2),
 ('Europa', 2),
 ('Sheffield', 2)]

In [25]:
liverpool_tweets = df[df['text'].str.contains('Liverpool')]
print(liverpool_tweets['text'], )

16               Liverpool have had over 60 million combined for solanke,ibe and now Brewster... all little to no premier league experience as well
18     Liverpool won the champions league and premier league back to back, this is after an uphill of consecutive wonderfu… https://t.co/W7kC1g8yzh
21      The extravagance of England's Premier League: Sheffield Utd are paying £23.5m to sign Liverpool's Rhian Brewster -… https://t.co/seadV3jygs
44     Really wanted this kid to make it at Liverpool but that's an amazing fee for a player yet to start in the premier L… https://t.co/gtoa01Qa15
69     This is a great deal. He’s never kicked a ball in the Premier League, but Liverpool are making a hefty sum on him a… https://t.co/7L8uscPCdT
92    Liverpool career: Premier League appearances: 0\nEurope: An all expenses paid trip to a Champions League final and w… https://t.co/XgnxldjLjp
99     Liverpool are on the verge of selling Rhian Brewster, a player who has yet to play a Premier League game,

In [48]:
# Filter df_entities to extract rows with NER = PERSON.
people = df_entities[df_entities['ner'].str.contains("PERSON")]

# top 10 People: Get the top 10 most mentioned people.
cnt = Counter(people['word'])
cnt.most_common(10)

[('Brewster', 10),
 ('Harry', 8),
 ('Kane', 6),
 ('Saliba', 4),
 ('Thiago', 4),
 ('Rhian', 4),
 ('Jose', 4),
 ('Mourinho', 4),
 ('Carlos', 4),
 ('Vinicius', 4)]

In [50]:
# Filter df_entities to extract rows with NER = LOCATION.
locations = df_entities[df_entities['ner'].str.contains("LOCATION")]

# top 5 Locations: Get the top 10 most mentioned locations.
cnt = Counter(locations['word'])
cnt.most_common(5)

[('Swansea', 2), ('Accra', 2), ('Liverpool', 2), ('West', 2), ('Ham', 2)]