{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"[nltk_data] Downloading package punkt to\n",
"[nltk_data] C:\Users\ameyp\AppData\Roaming\nltk_data...\n",
"[nltk_data] Package punkt is already up-to-date!\n",
"[nltk_data] Downloading package wordnet to\n",
"[nltk_data] C:\Users\ameyp\AppData\Roaming\nltk_data...\n",
"[nltk_data] Package wordnet is already up-to-date!\n",
"[nltk_data] Downloading package omw-1.4 to\n",
"[nltk_data] C:\Users\ameyp\AppData\Roaming\nltk_data...\n",
"[nltk_data] Package omw-1.4 is already up-to-date!\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"--- Tokenization ---\n",
"Whitespace Tokenization: ['Hello', 'there!', "I'm", 'testing', 'various', 'tokenization', 'methods:', 'whitespace,', 'punctuation-based,', 'treebank,', 'tweet', '&', 'MWE.']\n",
"Punctuation-based Tokenization: ['Hello', 'there', 'I', 'm', 'testing', 'various', 'tokenization', 'methods', 'whitespace', 'punctuation', 'based', 'treebank', 'tweet', 'MWE']\n",
"Treebank Tokenization: ['Hello', 'there', '!', 'I', "'m", 'testing', 'various', 'tokenization', 'methods', ':', 'whitespace', ',', 'punctuation-based', ',', 'treebank', ',', 'tweet', '&', 'MWE', '.']\n",
"Tweet Tokenization: ['Hello', 'there', '!', "I'm", 'testing', 'various', 'tokenization', 'methods', ':', 'whitespace', ',', 'punctuation-based', ',', 'treebank', ',', 'tweet', '&', 'MWE', '.']\n",
"MWE Tokenization: ['Hello', 'there', '!', 'I', "'m", 'testing_various', 'tokenization_methods', ':', 'whitespace', ',', 'punctuation-based', ',', 'treebank', ',', 'tweet', '&', 'MWE', '.']\n",
"\n",
"--- Stemming ---\n",
"Porter Stemming: ['hello', 'there', 'i', 'm', 'test', 'variou', 'token', 'method', 'whitespac', 'punctuat', 'base', 'treebank', 'tweet', 'mwe']\n",
"Snowball Stemming: ['hello', 'there', 'i', 'm', 'test', 'various', 'token', 'method', 'whitespac', 'punctuat', 'base', 'treebank', 'tweet', 'mwe']\n",
"\n",
"--- Lemmatization ---\n",
"Lemmatization: ['Hello', 'there', 'I', 'm', 'testing', 'various', 'tokenization', 'method', 'whitespace', 'punctuation', 'based', 'treebank', 'tweet', 'MWE']\n"
]
}
],
"source": [
"import nltk\n",
"from nltk.tokenize import word_tokenize, TreebankWordTokenizer, TweetTokenizer, MWETokenizer, WhitespaceTokenizer, RegexpTokenizer\n",
"from nltk.stem import PorterStemmer, SnowballStemmer, WordNetLemmatizer\n",
"\n",
"# Download necessary resources\n",
"nltk.download('punkt')\n",
"nltk.download('wordnet')\n",
"nltk.download('omw-1.4')\n",
"\n",
"text = "Hello there! I'm testing various tokenization methods: whitespace, punctuation-based, treebank, tweet & MWE."\n",
"\n",
"# Tokenizers\n",
"tokenizers = {\n",
" "Whitespace": WhitespaceTokenizer(),\n",
" "Punctuation-based": RegexpTokenizer(r'\w+'),\n",
" "Treebank": TreebankWordTokenizer(),\n",
" "Tweet": TweetTokenizer(),\n",
" "MWE": MWETokenizer([('testing', 'various'), ('tokenization', 'methods')])\n",
"}\n",
"\n",
"print("\n--- Tokenization ---")\n",
"tokens = {}\n",
"for name, tokenizer in tokenizers.items():\n",
" if name == "MWE":\n",
" tokens[name] = tokenizer.tokenize(word_tokenize(text))\n",
" else:\n",
" tokens[name] = tokenizer.tokenize(text)\n",
" print(f"{name} Tokenization: {tokens[name]}")\n",
"\n",
"# Use punctuation tokens for stemming and lemmatization\n",
"base_tokens = tokens["Punctuation-based"]\n",
"\n",
"print("\n--- Stemming ---")\n",
"for stemmer_name, stemmer in {\n",
" "Porter": PorterStemmer(),\n",
" "Snowball": SnowballStemmer("english")\n",
"}.items():\n",
" stems = [stemmer.stem(w) for w in base_tokens]\n",
" print(f"{stemmer_name} Stemming: {stems}")\n",
"\n",
"print("\n--- Lemmatization ---")\n",
"lemmatizer = WordNetLemmatizer()\n",
"lemmatized = [lemmatizer.lemmatize(w) for w in base_tokens]\n",
"print(f"Lemmatization: {lemmatized}")\n"
]
},
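{
"cell_type": "markdown",
"metadata": {},
"source": [
"A small follow-up sketch (not part of the original run): `WordNetLemmatizer.lemmatize` treats every word as a noun by default, which is why \"testing\" is left unchanged in the output above. Passing an explicit part-of-speech tag changes that. The cell below reuses the `lemmatizer` instance defined above and is only illustrative."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Sketch: lemmatization with an explicit part-of-speech tag (assumes the\n",
"# `lemmatizer` from the previous cell). With the default pos='n' (noun)\n",
"# the inflected form is kept; pos='v' (verb) reduces it to its base form.\n",
"print(lemmatizer.lemmatize('testing'))           # -> 'testing'\n",
"print(lemmatizer.lemmatize('testing', pos='v'))  # -> 'test'\n"
]
},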
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.11"
}
},
"nbformat": 4,
"nbformat_minor": 2
}