### Stemming

In [13]:
import nltk

# If you haven't already, download these packages:
'''
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('wordnet')
'''
from nltk.stem import WordNetLemmatizer

# Import the NLTK stopwords corpus reader (used below to get the English stopword list)
from nltk.corpus import stopwords

text = """May the Force be with you.
    There's no place like home.
    I'm the king of the world!
    Carpe diem. 
    Elementary, my dear Watson.
    It's alive! 
    My mama always said life was like a box of chocolates. 
    I'll be back.
    """

# Tokenize the sentences.
# nltk.sent_tokenize splits a text into a list of individual sentences by
# detecting sentence boundaries; the result is the list printed below.
sentences = nltk.sent_tokenize(text)
print("Original text: \n", sentences)

# Initialize a Porter Stemmer
stemmer = nltk.PorterStemmer()

# Build the stopword set once, before the loop: it does not change between
# tokens, so rebuilding it for every word is wasted work.
# NOTE: the membership test below is case-sensitive, so capitalized tokens
# such as "There", "I", "It" and "My" are not filtered out.
stop_words = set(stopwords.words('english'))

# Stemming: replace each sentence in place with its filtered, stemmed form.
for i, sentence in enumerate(sentences):
    words = nltk.word_tokenize(sentence)  # tokenize the words in each sentence

    # Drop stopwords and stem every remaining token (punctuation tokens are kept).
    stemmed_words = [stemmer.stem(word) for word in words if word not in stop_words]
    sentences[i] = ' '.join(stemmed_words)

print("\n\n Filtered and Stemmed Words: \n", sentences)


Original text: 
 ['May the Force be with you.', "There's no place like home.", "I'm the king of the world!", 'Carpe diem.', 'Elementary, my dear Watson.', "It's alive!", 'My mama always said life was like a box of chocolates.', "I'll be back."]


 Filtered and Stemmed Words: 
 ['may forc .', "there 's place like home .", "i 'm king world !", 'carp diem .', 'elementari , dear watson .', "it 's aliv !", 'my mama alway said life like box chocol .', "i 'll back ."]


### Lemmatization

In [8]:
import nltk

# Lemmatizer
from nltk.stem import WordNetLemmatizer

# Import the NLTK stopwords corpus reader (used below to get the English stopword list)
from nltk.corpus import stopwords

text = """May the Force be with you.
    There's no place like home.
    I'm the king of the world!
    Carpe diem. 
    Elementary, my dear Watson.
    It's alive! 
    My mama always said life was like a box of chocolates. 
    I'll be back."""

# Tokenize the text into a list of sentences
sentences = nltk.sent_tokenize(text)
print("Original text:\n", sentences)

# Initialize lemmatizer
lemmatizer = WordNetLemmatizer()

# Build the stopword set once, before the loop: it does not change between
# tokens, so rebuilding it for every word is wasted work.
# NOTE: the membership test below is case-sensitive, so capitalized tokens
# such as "There", "I", "It" and "My" are not filtered out.
stop_words = set(stopwords.words('english'))

# Lemmatization: replace each sentence in place with its filtered, lemmatized form.
for i, sentence in enumerate(sentences):
    words = nltk.word_tokenize(sentence)  # tokenize the words in each sentence

    # Drop stopwords and lemmatize every remaining token. lemmatize() is called
    # without a part-of-speech argument, so the lemmatizer's default POS applies
    # (noun, per the NLTK documentation).
    lemma_words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]
    sentences[i] = ' '.join(lemma_words)

print("\n\n Filtered and Lemmatized Words: \n", sentences)

Original text:
 ['May the Force be with you.', "There's no place like home.", "I'm the king of the world!", 'Carpe diem.', 'Elementary, my dear Watson.', "It's alive!", 'My mama always said life was like a box of chocolates.', "I'll be back."]


 Filtered and Lemmatized Words: 
 ['May Force .', "There 's place like home .", "I 'm king world !", 'Carpe diem .', 'Elementary , dear Watson .', "It 's alive !", 'My mama always said life like box chocolate .', "I 'll back ."]
