# Download NLTK data if not already downloaded
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\PC\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\PC\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\PC\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
True
# import the tokenizers (this import is important: it makes sent_tokenize and word_tokenize available)
from nltk.tokenize import sent_tokenize, word_tokenize
data = "All work and no play makes jack a dull boy, all work and no play"
words = word_tokenize(data)
print(words)
['All', 'work', 'and', 'no', 'play', 'makes', 'jack', 'a', 'dull', 'boy', ',', 'all', 'work', 'and', 'no', 'play']
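# Illustrative addition (not in the original notebook): the token list can
# feed straight into a frequency count with nltk's FreqDist.
from nltk import FreqDist
freq = FreqDist(words)
print(freq.most_common(3))  # three most frequent tokens, e.g. [('work', 2), ('and', 2), ('no', 2)]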
from nltk.tokenize import sent_tokenize, word_tokenize
data = "All work and no play makes jack a dull boy. All work and no play."
sentences = sent_tokenize(data)
print(sentences)
['All work and no play makes jack a dull boy.', 'All work and no play.']
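# Illustrative addition: sentence and word tokenization compose naturally;
# a short sketch tokenizing each sentence into its words.
nested = [word_tokenize(s) for s in sent_tokenize(data)]
print(nested)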
# 1. libraries
from keras.preprocessing.text import Tokenizer
# 2. define corpus (four documents)
docs = ["It was the best of times",
        "It was the worst of times",
        "It was the age of wisdom",
        "It was the age of foolishness"]
# 3. Tokenize corpus
# create the tokenizer object (note: Tokenizer must be instantiated with ())
model = Tokenizer()
# fit the tokenizer on the documents (fit_on_texts returns None, so don't assign or print it)
model.fit_on_texts(docs)
# 4. Define bag of words
# display the learned vocabulary
print(f'key : {list(model.word_index.keys())}')
# display the frequency of each word in each document
rep = model.texts_to_matrix(docs, mode='count')
print(rep)
key : ['it', 'was', 'the', 'of', 'times', 'age', 'best', 'worst', 'wisdom', 'foolishness']
[[0. 1. 1. 1. 1. 1. 0. 1. 0. 0. 0.]
 [0. 1. 1. 1. 1. 1. 0. 0. 1. 0. 0.]
 [0. 1. 1. 1. 1. 0. 1. 0. 0. 1. 0.]
 [0. 1. 1. 1. 1. 0. 1. 0. 0. 0. 1.]]
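# Illustrative addition: the same fitted Tokenizer supports other
# representations. texts_to_matrix also accepts 'binary', 'freq', and 'tfidf'
# modes, and texts_to_sequences maps each document to its word indices.
print(model.texts_to_sequences(docs))              # each doc as a list of word indices
print(model.texts_to_matrix(docs, mode='binary'))  # 1 if the word occurs in the doc, else 0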
# libraries
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize
# create a stemming object
ps = PorterStemmer()
# example - words to stem
words = ["programs", "programming", "Terrors", "Terrorists", "Terrorism", "Modelling", "Networking"]
# apply stemming objects to extract base words
for w in words:
    print(w, ":", ps.stem(w))
programs : program
programming : program
Terrors : terror
Terrorists : terrorist
Terrorism : terror
Modelling : model
Networking : network
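# Illustrative addition: NLTK ships other stemmers; for comparison, a sketch
# running the same words through the English SnowballStemmer ("Porter2").
from nltk.stem import SnowballStemmer
ss = SnowballStemmer("english")
for w in words:
    print(w, ":", ss.stem(w))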
# libraries
from nltk.stem import WordNetLemmatizer  # requires the wordnet corpus downloaded above
# define obj
wl = WordNetLemmatizer()
# use the object to lemmatize words
print("rocks :", wl.lemmatize("rocks"))
print("corpora :", wl.lemmatize("corpora"))
print("studies :", wl.lemmatize("studies"))  # additional example word
# pos="a" tells the lemmatizer to treat the word as an adjective
print("better :", wl.lemmatize("better", pos="a"))
print("worse :", wl.lemmatize("worse", pos="a"))  # additional adjective example
rocks : rock
corpora : corpus
studies : study
better : good
worse : bad
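# Illustrative addition: without a pos argument the lemmatizer assumes a noun,
# so the part of speech can change the result.
print("running :", wl.lemmatize("running"))           # treated as a noun -> 'running'
print("running :", wl.lemmatize("running", pos="v"))  # treated as a verb -> 'run'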
# libraries (nltk was already imported above)
# input text
text = """Today morning. Arthur felt very good."""
# tokenize text into words
tokens = nltk.word_tokenize(text)
# apply POS tagging Function
tagged = nltk.pos_tag(tokens)
# print tagged tokens
print(tagged)
[('Today', 'NN'), ('morning', 'NN'), ('.', '.'), ('Arthur', 'NNP'), ('felt', 'VBD'), ('very', 'RB'), ('good', 'JJ'), ('.', '.')]
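# Illustrative addition: the Penn Treebank tags from pos_tag can drive the
# lemmatizer's pos argument. penn_to_wordnet is a hypothetical helper written
# for this sketch, not an NLTK function; wl and tagged come from the cells above.
from nltk.corpus import wordnet

def penn_to_wordnet(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    if tag.startswith('V'):
        return wordnet.VERB
    if tag.startswith('R'):
        return wordnet.ADV
    return wordnet.NOUN  # default to noun, as the lemmatizer itself does

for word, tag in tagged:
    print(word, ":", wl.lemmatize(word, pos=penn_to_wordnet(tag)))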