Using NLTK for Natural Language Processing

Fun with NLTK

Last update:

This document was compiled from various sources by Tomonori Nagano ([email protected]). Please let me know if you find any errors.

Install NLTK

Installing Python/NLTK

NLTK is written in Python
To use NLTK, you will need Python 3.6 or higher. See https://realpython.com/installing-python/ for more information about Python.
See https://www.nltk.org/install.html for how to install NLTK (and the packages it depends on, such as numpy).
See https://www.nltk.org/data.html for the NLTK data (corpora)


#!/opt/homebrew/bin/python3
# -*- coding: utf-8 -*-
# Description: Checking Python and NLTK
# Date: Saturday, December 25, 2021
# Tomonori Nagano 

import platform

# Ask the platform module which Python version is running this script
python_version = platform.python_version()
print(python_version)
# 3.9.9

Python Basics

Working directory of Python


#!/opt/homebrew/bin/python3
# -*- coding: utf-8 -*-
# Description: Showing paths
# Date: Tuesday, December 21, 2021
# Tomonori Nagano 

import os
import sys

# Two ways of displaying the current working directory
print("Showing the current path:")
current_dir = os.path.abspath('.')
print(current_dir)
# /Users/home
print(os.getcwd())
# /Users/home

# __file__ refers to this script, so its absolute path is the script location
print("Showing the path to the Python code:")
script_path = os.path.abspath(__file__)
print(script_path)
# /Users/home/ScriptDoc

# List the entries of the current working directory
print("Showing the files in the current path:")
entries = os.listdir('.')
print(entries)
# ['Music', 'Pictures', 'Desktop', 'Library', ...]

# Alternatively, add another directory to the module search path
# (change the value below to your own path)
extra_path = "/Volumes/USB/mypath/"
sys.path.append(extra_path)

String operations


#!/opt/homebrew/bin/python3
# -*- coding: utf-8 -*-
# Description: A simple demonstration of methods for string
# Date: Tuesday, December 21, 2021
# Tomonori Nagano 

name = 'Tomonori Nagano'
keyword = 'Tomo'

# Prefix test
if name.startswith(keyword):
    print('Tomonori starts with ' + keyword)
# Tomonori starts with Tomo

# Suffix test: pick the message first, then print it once
suffix_message = 'Tomonori ends with ' if name.endswith(keyword) else 'Tomonori does not end with '
print(suffix_message + keyword)
# Tomonori does not end with Tomo

# Substring test
if keyword in name:
    print(''.join(['Yes, ', name, ' contains the string "', keyword, '"']))
# Yes, Tomonori Nagano contains the string "Tomo"

# Slicing: [:4] keeps indexes 0 to 3, [4:] keeps everything from index 4 on
print(name[:4])
# Tomo
print(name[4:])
# nori Nagano

# Case conversions
print(name.capitalize())  # upper-case the first character only
# Tomonori nagano
print(name.title())       # upper-case the first character of each word
# Tomonori Nagano
print(name.upper())       # upper-case every character
# TOMONORI NAGANO
print(name.lower())       # lower-case every character
# tomonori nagano

# Formatting: %-style first, then str.format with positional and named fields
money = 20
print("My name is %s. I have $%.2f' " % (name, money))
# My name is Tomonori Nagano. I have $20.00' 
print("My name is {0}. I have ${1}' ".format(name, money))
# My name is Tomonori Nagano. I have $20' 
fields = {'name': 'Tomonori Nagano', 'money': 20}
print("My name is {name}. I have ${money}' ".format(**fields))
# My name is Tomonori Nagano. I have $20' 
# "<20s" left-aligns the name in a 20-character field; ".2f" shows two decimals
print("My name is {name:<20s}. I have ${money:.2f}' ".format(**fields))
# My name is Tomonori Nagano     . I have $20.00' 
# ">20s" right-aligns the name; "05d" zero-pads the integer to five digits
print("My name is {name:>20s}. I have ${money:05d}' ".format(**fields))
# My name is      Tomonori Nagano. I have $00020'

NLTK Basics

Using frequency distribution (FreqDist)


#!/opt/homebrew/bin/python3
# -*- coding: utf-8 -*-
# Description: Demonstration of various basic operations in NLTK
# Date: Wednesday, December 22, 2021
# Tomonori Nagano 

from itertools import islice

import nltk
from nltk.corpus import gutenberg, brown
from nltk.probability import FreqDist, ConditionalFreqDist

# Build a frequency distribution over the words of the Brown corpus
thisFD = FreqDist(brown.words())

# Show the first ten samples obtained by iterating the distribution.
# islice stops after ten items instead of copying every key into a list
# just to slice off the first ten.
for word in islice(thisFD, 10):
    print(word, thisFD[word])

# Show the ten most frequent samples with their counts.
# most_common(10) requests exactly ten entries rather than ranking the whole
# vocabulary and then discarding all but ten.
for word, freq in thisFD.most_common(10):
    print(word, freq)

Using conditional frequency (ConditionalFreqDist)


#!/opt/homebrew/bin/python3
# -*- coding: utf-8 -*-
# Description: Demonstration conditional frequency (ConditionalFreqDist)
# Date: Wednesday, December 22, 2021
# Tomonori Nagano 

import nltk
from nltk.corpus import gutenberg,brown
from nltk.probability import FreqDist, ConditionalFreqDist
from operator import itemgetter


# POS analyses (conditional frequency)
thisFD = FreqDist()                 # how often each POS tag occurs overall
thisCondFD = ConditionalFreqDist()  # for each token, how often each tag occurs

# Walk the tagged Brown corpus file by file, sentence by sentence
for fileid in brown.fileids():
    for tagged_sentence in brown.tagged_sents(fileid):
        for word, pos_tag in tagged_sentence:
            thisFD[pos_tag] += 1
            thisCondFD[word][pos_tag] += 1

# Print every tag recorded for the token 'light' together with its count
light_tags = thisCondFD['light']
for pos in light_tags:
    print(pos, light_tags[pos])

Using Porter stemmer


#!/opt/homebrew/bin/python3
# -*- coding: utf-8 -*-
# Description: Demonstrating Porter stemmer
# Date: Friday, December 24, 2021
# Tomonori Nagano 

import nltk

# Porter's stemmer
stemmer = nltk.PorterStemmer()

# Stem a single word. The result is printed because a bare call in a script
# discards the return value and would show nothing.
print(stemmer.stem('appearance'))

# Stem several related word forms so their stems can be compared side by side
verbs = ['appears', 'appear', 'appeared', 'appearing', 'appearance']
print([stemmer.stem(verb) for verb in verbs])

Finding collocations


#!/opt/homebrew/bin/python3
# -*- coding: utf-8 -*-
# Description: Finding collocations (strongly associated word pairs) with NLTK
# Date: Wednesday, December 22, 2021
# Tomonori Nagano 

import nltk
from nltk.corpus import gutenberg,brown
from nltk.probability import FreqDist, ConditionalFreqDist
from operator import itemgetter

# finding collocations
def collocations(words):
    """Return the adjacent word pairs found in *words*, best-scoring first.

    Single words and adjacent pairs are counted with ``nltk.FreqDist``; each
    distinct pair is then ranked by ``score`` and the ``(w1, w2)`` tuples are
    returned as a list in descending score order.
    """
    # Frequency tables for single words and for adjacent pairs
    unigram_counts = nltk.FreqDist(words)
    bigram_counts = nltk.FreqDist(zip(words, words[1:]))

    def pair_score(pair):
        # Rank a pair by its association score under the two tables above
        first, second = pair
        return score(first, second, unigram_counts, bigram_counts)

    return sorted(bigram_counts, key=pair_score, reverse=True)

def score(word1, word2, wfd, pfd, power=3):
    """Score how strongly ``word1`` and ``word2`` go together as a pair.

    The pair's count in ``pfd`` (looked up under the key ``(word1, word2)``)
    is raised to ``power`` and divided by the product of the two words'
    individual counts in ``wfd``. The result is a float.
    """
    individual_product = wfd[word1] * wfd[word2]
    pair_count = pfd[(word1, word2)]
    return pair_count ** power / float(individual_product)

# Lower-case the tokens of the Chesterton text, keeping only those longer than two characters
tokens = gutenberg.words('chesterton-brown.txt')
words = [token.lower() for token in tokens if len(token) > 2]

# Print the fifteen best-scoring pairs, each joined by a single space
top_pairs = collocations(words)[:15]
print([' '.join(pair) for pair in top_pairs])

NLTK Misc

Demonstrating Benford's Law


#!/opt/homebrew/bin/python3
# -*- coding: utf-8 -*-
# Description: Demonstrating Benford's law
# Date: Friday, December 24, 2021
# Tomonori Nagano 
		
import nltk
from nltk.corpus import reuters
import re
# nltk.download('reuters')

# Tally the first non-zero digit of every Reuters token that contains one
text = reuters.words()
thisFD = nltk.FreqDist()

# Compile the pattern once, outside the loop. The raw string keeps "\d" from
# being treated as an (invalid) string escape sequence.
leading_digit = re.compile(r'([1-9])\d*')

for word in text:
    thisRegex = leading_digit.search(word)
    if thisRegex:
        # group(1) is the leftmost digit in the range 1-9 found in the token
        thisFD[thisRegex.group(1)] += 1

# Print each digit with the number of tokens it led
for digit in thisFD:
    print(digit, thisFD[digit])