In [3]:
# Download NLTK data if not already downloaded
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\PC\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\PC\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\PC\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
Out[3]:
True

NLP Natural Language Processing¶

1.Tokenization¶

  • Tokens, Keywords, Sentences
In [4]:
#import nltk, word_tokenize
from nltk.tokenize import sent_tokenize, word_tokenize # this line is iport it initializes stuff
data = "All work and no play makes jack a dull boy, all work and no play"
words = word_tokenize(data)

print (words)
['All', 'work', 'and', 'no', 'play', 'makes', 'jack', 'a', 'dull', 'boy', ',', 'all', 'work', 'and', 'no', 'play']
In [5]:
from nltk.tokenize import sent_tokenize, word_tokenize
data = "All work and no play makes jack a dull boy. All work and no play."
words = sent_tokenize(data)
print (words)
['All work and no play makes jack a dull boy.', 'All work and no play.']

2. Model : creating bag of words¶

  • Steps 1. tokenize text, 2. create vocabulary, 3. count number of occurences, 4. Create the model
In [6]:
# 1. libraries
from keras.preprocessing.text import Tokenizer # define four docs

# 2. define corpus
docs = ["It was the best of times",
       "It was the worst of times",
       "It was the age of wisdom",
       "It was the age of foolishness"]

# 3. Tokenize corpus
# create tokenizer model
model = Tokenizer() # made some changes ici lacked - ()

# use model to create model
Tokens = model.fit_on_texts(docs)
print(Tokens)

# 4. Define bag of words
# display words
print (f'key : {list(model.word_index.keys())}')

# display frequency fro each word
rep = model.texts_to_matrix(docs, mode = 'count')
print(rep)
None
key : ['it', 'was', 'the', 'of', 'times', 'age', 'best', 'worst', 'wisdom', 'foolishness']
[[0. 1. 1. 1. 1. 1. 0. 1. 0. 0. 0.]
 [0. 1. 1. 1. 1. 1. 0. 0. 1. 0. 0.]
 [0. 1. 1. 1. 1. 0. 1. 0. 0. 1. 0.]
 [0. 1. 1. 1. 1. 0. 1. 0. 0. 0. 1.]]

stemming¶

  • removes prefixes and suffixes on a word to remain with only a stem, this increases accuracy
In [11]:
# libraries 
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize

# use lib creates stemming obj
ps = PorterStemmer()

# example - words to stem
words = ["programs", "programming", "Terrors", "Terrorists", "Terrorism", "Modelling", "Networking"]

# apply stemming objects to extract base words
for w in words:
    print(w, ":", ps.stem(w))
programs : program
programming : program
Terrors : terror
Terrorists : terrorist
Terrorism : terror
Modelling : model
Networking : network

Lemmatization¶

  • it extracts a word as it would appear in the dictionary from a conjugated word etc
In [8]:
# libraries 
from nltk.stem import WordNetLemmatizer # installed wordnet

# define obj
wl = WordNetLemmatizer()

# use the object to lemmatize words
print("rocks :", wl.lemmatize("rocks"))
print("corpora :", wl.lemmatize("corpora"))
print("studies :", wl.lemmatize("studies")) # extra training word


# a denotes adjective in "pos"
print("better :", wl.lemmatize("better", pos = "a"))
print("worse :", wl.lemmatize("worse", pos = "a")) # extra training adjective
rocks : rock
corpora : corpus
studies : study
better : good
worse : bad

POS - part-of-speech tagging¶

  • defines the meaning of a sentence based on the context, extracts relations between words
In [9]:
# libraries
# input text
text = """Today morning. Arthur felt very good."""

# tokenize text into words 
tokens = nltk.word_tokenize(text)

# apply POS tagging Function
tagged = nltk.pos_tag(tokens)

# print agged tokens 
print(tagged)
[('Today', 'NN'), ('morning', 'NN'), ('.', '.'), ('Arthur', 'NNP'), ('felt', 'VBD'), ('very', 'RB'), ('good', 'JJ'), ('.', '.')]
In [ ]: