# Load packages
import pymupdf
import requests
from textblob import TextBlob
from textblob.download_corpora import download_lite
# Install the NLTK data TextBlob relies on (sentence tokenizer, WordNet, ...).
# Without it, `.words` and `.lemmatize()` raise MissingCorpusError.
# NOTE: this needs network access; NLTK skips packages that are already up to date.
download_lite()
# Download the data
url = "https://funnyengwish.wordpress.com/wp-content/uploads/2017/05/pratchett_terry_wyrd_sisters_-_royallib_ru.pdf"
# A timeout keeps the script from hanging forever on an unresponsive server
response = requests.get(url, timeout=30)
# Fail early on an HTTP error instead of handing an error page to the PDF parser
response.raise_for_status()
# Extract data from pdf
data = response.content
# The bytes are held in memory, so state the file type explicitly
doc = pymupdf.Document(stream=data, filetype="pdf")
# Create text from first pdf page
page1 = doc[0].get_text()
# Turn text into TextBlob
text = TextBlob(page1)
Text normalization
TextBlob lets you transform text—something very useful when preparing for text analysis.
Case
There are methods to change the case of TextBlob objects.
For example, capitalization (let’s only print the first 1000 characters)
# Title-case the whole text, then show only its first 1000 characters
title_cased = text.title()
print(title_cased[:1000])
Terry Pratchett
Wyrd Sisters
(Starring Three Witches, Also Kings, Daggers, Crowns, Storms, Dwarfs, Cats, Ghosts, Spectres,
Apes, Bandits, Demons, Forests, Heirs, Jesters, Tortures, Trolls, Turntables, General Rejoicing And
Drivers Alarums.)
The Wind Howled. Lightning Stabbed At The Earth Erratically, Like An Inefficient Assassin.
Thunder Rolled Back And Forth Across The Dark, Rain-Lashed Hills.
The Night Was As Black As The Inside Of A Cat. It Was The Kind Of Night, You Could Believe, On
Which Gods Moved Men As Though They Were Pawns On The Chessboard Of Fate. In The Middle Of This
Elemental Storm A Fire Gleamed Among The Dripping Furze Bushes Like The Madness In A Weasel'S Eye.
It Illuminated Three Hunched Figures. As The Cauldron Bubbled An Eldritch Voice Shrieked: 'When
Shall We Three Meet Again?'
There Was A Pause.
Finally Another Voice Said, In Far More Ordinary Tones: 'Well, I Can Do Next Tuesday.'
Through The Fathomless Deeps Of Space Swims The Star Turt
Or transformation to upper case:
# Upper-case the whole text, then show only its first 1000 characters
upper_cased = text.upper()
print(upper_cased[:1000])
TERRY PRATCHETT
WYRD SISTERS
(STARRING THREE WITCHES, ALSO KINGS, DAGGERS, CROWNS, STORMS, DWARFS, CATS, GHOSTS, SPECTRES,
APES, BANDITS, DEMONS, FORESTS, HEIRS, JESTERS, TORTURES, TROLLS, TURNTABLES, GENERAL REJOICING AND
DRIVERS ALARUMS.)
THE WIND HOWLED. LIGHTNING STABBED AT THE EARTH ERRATICALLY, LIKE AN INEFFICIENT ASSASSIN.
THUNDER ROLLED BACK AND FORTH ACROSS THE DARK, RAIN-LASHED HILLS.
THE NIGHT WAS AS BLACK AS THE INSIDE OF A CAT. IT WAS THE KIND OF NIGHT, YOU COULD BELIEVE, ON
WHICH GODS MOVED MEN AS THOUGH THEY WERE PAWNS ON THE CHESSBOARD OF FATE. IN THE MIDDLE OF THIS
ELEMENTAL STORM A FIRE GLEAMED AMONG THE DRIPPING FURZE BUSHES LIKE THE MADNESS IN A WEASEL'S EYE.
IT ILLUMINATED THREE HUNCHED FIGURES. AS THE CAULDRON BUBBLED AN ELDRITCH VOICE SHRIEKED: 'WHEN
SHALL WE THREE MEET AGAIN?'
THERE WAS A PAUSE.
FINALLY ANOTHER VOICE SAID, IN FAR MORE ORDINARY TONES: 'WELL, I CAN DO NEXT TUESDAY.'
THROUGH THE FATHOMLESS DEEPS OF SPACE SWIMS THE STAR TURT
Number
The number (singular/plural) of particular words can also be changed:
print(text.words[6])
print(text.words[6].singularize())
---------------------------------------------------------------------------
[traceback abbreviated]
LookupError: Resource punkt_tab not found. Please use the NLTK Downloader to obtain the resource:
>>> import nltk
>>> nltk.download('punkt_tab')
For more information see: https://www.nltk.org/data.html
Attempted to load tokenizers/punkt_tab/english/

The above exception was the direct cause of the following exception:

MissingCorpusError: Looks like you are missing some required data for this feature.
To download the necessary data, simply run python -m textblob.download_corpora or use the NLTK downloader to download the missing data: http://nltk.org/data.html If this doesn't fix the problem, file an issue at https://github.com/sloria/TextBlob/issues.
print(text.words[42])
print(text.words[42].pluralize())
---------------------------------------------------------------------------
[traceback abbreviated]
LookupError: Resource punkt_tab not found. Please use the NLTK Downloader to obtain the resource:
>>> import nltk
>>> nltk.download('punkt_tab')
For more information see: https://www.nltk.org/data.html
Attempted to load tokenizers/punkt_tab/english/

The above exception was the direct cause of the following exception:

MissingCorpusError: Looks like you are missing some required data for this feature.
To download the necessary data, simply run python -m textblob.download_corpora or use the NLTK downloader to download the missing data: http://nltk.org/data.html If this doesn't fix the problem, file an issue at https://github.com/sloria/TextBlob/issues.
Lemmatization
Lemmatization reduces all words to their lemma (dictionary or canonical form) so that inflected words such as “dog” and “dogs” aren’t counted in separate categories in analyses.
Nouns
The lemmatize method uses as its default argument "n" (for noun):
print(TextBlob("heirs").words[0].lemmatize())
print(TextBlob("daggers").words[0].lemmatize())
---------------------------------------------------------------------------
[traceback abbreviated]
LookupError: Resource punkt_tab not found. Please use the NLTK Downloader to obtain the resource:
>>> import nltk
>>> nltk.download('punkt_tab')
For more information see: https://www.nltk.org/data.html
Attempted to load tokenizers/punkt_tab/english/

The above exception was the direct cause of the following exception:

MissingCorpusError: Looks like you are missing some required data for this feature.
To download the necessary data, simply run python -m textblob.download_corpora or use the NLTK downloader to download the missing data: http://nltk.org/data.html If this doesn't fix the problem, file an issue at https://github.com/sloria/TextBlob/issues.
Be careful: you can’t always trust that TextBlob will work properly. It is a very easy library to use, but it has its limitations.
For instance, I am not sure why this one doesn’t work (one would expect the lemma “man”):
print(TextBlob("men").words[0].lemmatize())
---------------------------------------------------------------------------
[traceback abbreviated]
LookupError: Resource punkt_tab not found. Please use the NLTK Downloader to obtain the resource:
>>> import nltk
>>> nltk.download('punkt_tab')
For more information see: https://www.nltk.org/data.html
Attempted to load tokenizers/punkt_tab/english/

The above exception was the direct cause of the following exception:

MissingCorpusError: Looks like you are missing some required data for this feature.
To download the necessary data, simply run python -m textblob.download_corpora or use the NLTK downloader to download the missing data: http://nltk.org/data.html If this doesn't fix the problem, file an issue at https://github.com/sloria/TextBlob/issues.
While this totally works:
print(TextBlob("policemen").words[0].lemmatize())
---------------------------------------------------------------------------
[traceback abbreviated]
LookupError: Resource punkt_tab not found. Please use the NLTK Downloader to obtain the resource:
>>> import nltk
>>> nltk.download('punkt_tab')
For more information see: https://www.nltk.org/data.html
Attempted to load tokenizers/punkt_tab/english/

The above exception was the direct cause of the following exception:

MissingCorpusError: Looks like you are missing some required data for this feature.
To download the necessary data, simply run python -m textblob.download_corpora or use the NLTK downloader to download the missing data: http://nltk.org/data.html If this doesn't fix the problem, file an issue at https://github.com/sloria/TextBlob/issues.
Using the more complex and more powerful NLTK Python library, you can implement the solution suggested here.
Verbs
To lemmatize verbs, you need to pass "v" (for verbs) to the lemmatize method:
print(TextBlob("seen").words[0].lemmatize("v"))
print(TextBlob("seeing").words[0].lemmatize("v"))
print(TextBlob("sees").words[0].lemmatize("v"))
---------------------------------------------------------------------------
[traceback abbreviated]
LookupError: Resource punkt_tab not found. Please use the NLTK Downloader to obtain the resource:
>>> import nltk
>>> nltk.download('punkt_tab')
For more information see: https://www.nltk.org/data.html
Attempted to load tokenizers/punkt_tab/english/

The above exception was the direct cause of the following exception:

MissingCorpusError: Looks like you are missing some required data for this feature.
To download the necessary data, simply run python -m textblob.download_corpora or use the NLTK downloader to download the missing data: http://nltk.org/data.html If this doesn't fix the problem, file an issue at https://github.com/sloria/TextBlob/issues.
Your turn:
Why is this one not working?
print(TextBlob("saw").words[0].lemmatize("v"))--------------------------------------------------------------------------- LookupError Traceback (most recent call last) File ~/parvus/prog/mint/python/.venv/lib/python3.13/site-packages/textblob/decorators.py:35, in requires_nltk_corpus.<locals>.decorated(*args, **kwargs) 34 try: ---> 35 return func(*args, **kwargs) 36 except LookupError as error: File ~/parvus/prog/mint/python/.venv/lib/python3.13/site-packages/textblob/tokenizers.py:60, in SentenceTokenizer.tokenize(self, text) 59 """Return a list of sentences.""" ---> 60 return nltk.tokenize.sent_tokenize(text) File ~/parvus/prog/mint/python/.venv/lib/python3.13/site-packages/nltk/tokenize/__init__.py:119, in sent_tokenize(text, language) 110 """ 111 Return a sentence-tokenized copy of *text*, 112 using NLTK's recommended sentence tokenizer (...) 117 :param language: the model name in the Punkt corpus 118 """ --> 119 tokenizer = _get_punkt_tokenizer(language) 120 return tokenizer.tokenize(text) File ~/parvus/prog/mint/python/.venv/lib/python3.13/site-packages/nltk/tokenize/__init__.py:105, in _get_punkt_tokenizer(language) 98 """ 99 A constructor for the PunktTokenizer that utilizes 100 a lru cache for performance. (...) 
103 :type language: str 104 """ --> 105 return PunktTokenizer(language) File ~/parvus/prog/mint/python/.venv/lib/python3.13/site-packages/nltk/tokenize/punkt.py:1744, in PunktTokenizer.__init__(self, lang) 1743 PunktSentenceTokenizer.__init__(self) -> 1744 self.load_lang(lang) File ~/parvus/prog/mint/python/.venv/lib/python3.13/site-packages/nltk/tokenize/punkt.py:1749, in PunktTokenizer.load_lang(self, lang) 1747 from nltk.data import find -> 1749 lang_dir = find(f"tokenizers/punkt_tab/{lang}/") 1750 self._params = load_punkt_params(lang_dir) File ~/parvus/prog/mint/python/.venv/lib/python3.13/site-packages/nltk/data.py:579, in find(resource_name, paths) 578 resource_not_found = f"\n{sep}\n{msg}\n{sep}\n" --> 579 raise LookupError(resource_not_found) LookupError: ********************************************************************** Resource punkt_tab not found. Please use the NLTK Downloader to obtain the resource: >>> import nltk >>> nltk.download('punkt_tab') For more information see: https://www.nltk.org/data.html Attempted to load tokenizers/punkt_tab/english/ Searched in: - '/home/marie/nltk_data' - '/home/marie/parvus/prog/mint/python/.venv/nltk_data' - '/home/marie/parvus/prog/mint/python/.venv/share/nltk_data' - '/home/marie/parvus/prog/mint/python/.venv/lib/nltk_data' - '/usr/share/nltk_data' - '/usr/local/share/nltk_data' - '/usr/lib/nltk_data' - '/usr/local/lib/nltk_data' ********************************************************************** The above exception was the direct cause of the following exception: MissingCorpusError Traceback (most recent call last) Cell In[10], line 1 ----> 1 print(TextBlob("saw").words[0].lemmatize("v")) File ~/parvus/prog/mint/python/.venv/lib/python3.13/site-packages/textblob/decorators.py:23, in cached_property.__get__(self, obj, cls) 21 if obj is None: 22 return self ---> 23 value = obj.__dict__[self.func.__name__] = self.func(obj) 24 return value File 
~/parvus/prog/mint/python/.venv/lib/python3.13/site-packages/textblob/blob.py:625, in TextBlob.words(self) 617 @cached_property 618 def words(self): 619 """Return a list of word tokens. This excludes punctuation characters. 620 If you want to include punctuation characters, access the ``tokens`` 621 property. 622 623 :returns: A :class:`WordList <WordList>` of word tokens. 624 """ --> 625 return WordList(word_tokenize(self.raw, include_punc=False)) File ~/parvus/prog/mint/python/.venv/lib/python3.13/site-packages/textblob/tokenizers.py:77, in word_tokenize(text, include_punc, *args, **kwargs) 69 def word_tokenize(text, include_punc=True, *args, **kwargs): 70 """Convenience function for tokenizing text into words. 71 72 NOTE: NLTK's word tokenizer expects sentences as input, so the text will be 73 tokenized to sentences before being tokenized to words. 74 """ 75 words = chain.from_iterable( 76 _word_tokenizer.itokenize(sentence, include_punc, *args, **kwargs) ---> 77 for sentence in sent_tokenize(text) 78 ) 79 return words File ~/parvus/prog/mint/python/.venv/lib/python3.13/site-packages/textblob/base.py:68, in BaseTokenizer.itokenize(self, text, *args, **kwargs) 61 def itokenize(self, text, *args, **kwargs): 62 """Return a generator that generates tokens "on-demand". 63 64 .. versionadded:: 0.6.0 65 66 :rtype: generator 67 """ ---> 68 return (t for t in self.tokenize(text, *args, **kwargs)) File ~/parvus/prog/mint/python/.venv/lib/python3.13/site-packages/textblob/decorators.py:37, in requires_nltk_corpus.<locals>.decorated(*args, **kwargs) 35 return func(*args, **kwargs) 36 except LookupError as error: ---> 37 raise MissingCorpusError() from error MissingCorpusError: Looks like you are missing some required data for this feature. 
To download the necessary data, simply run python -m textblob.download_corpora or use the NLTK downloader to download the missing data: http://nltk.org/data.html If this doesn't fix the problem, file an issue at https://github.com/sloria/TextBlob/issues.
Examples from the text:
print(TextBlob("starring").words[0].lemmatize("v"))
print(TextBlob("stabbed").words[0].lemmatize("v"))
print(TextBlob("howled").words[0].lemmatize("v"))
print(TextBlob("rejoicing").words[0].lemmatize("v"))--------------------------------------------------------------------------- LookupError Traceback (most recent call last) File ~/parvus/prog/mint/python/.venv/lib/python3.13/site-packages/textblob/decorators.py:35, in requires_nltk_corpus.<locals>.decorated(*args, **kwargs) 34 try: ---> 35 return func(*args, **kwargs) 36 except LookupError as error: File ~/parvus/prog/mint/python/.venv/lib/python3.13/site-packages/textblob/tokenizers.py:60, in SentenceTokenizer.tokenize(self, text) 59 """Return a list of sentences.""" ---> 60 return nltk.tokenize.sent_tokenize(text) File ~/parvus/prog/mint/python/.venv/lib/python3.13/site-packages/nltk/tokenize/__init__.py:119, in sent_tokenize(text, language) 110 """ 111 Return a sentence-tokenized copy of *text*, 112 using NLTK's recommended sentence tokenizer (...) 117 :param language: the model name in the Punkt corpus 118 """ --> 119 tokenizer = _get_punkt_tokenizer(language) 120 return tokenizer.tokenize(text) File ~/parvus/prog/mint/python/.venv/lib/python3.13/site-packages/nltk/tokenize/__init__.py:105, in _get_punkt_tokenizer(language) 98 """ 99 A constructor for the PunktTokenizer that utilizes 100 a lru cache for performance. (...) 
103 :type language: str 104 """ --> 105 return PunktTokenizer(language) File ~/parvus/prog/mint/python/.venv/lib/python3.13/site-packages/nltk/tokenize/punkt.py:1744, in PunktTokenizer.__init__(self, lang) 1743 PunktSentenceTokenizer.__init__(self) -> 1744 self.load_lang(lang) File ~/parvus/prog/mint/python/.venv/lib/python3.13/site-packages/nltk/tokenize/punkt.py:1749, in PunktTokenizer.load_lang(self, lang) 1747 from nltk.data import find -> 1749 lang_dir = find(f"tokenizers/punkt_tab/{lang}/") 1750 self._params = load_punkt_params(lang_dir) File ~/parvus/prog/mint/python/.venv/lib/python3.13/site-packages/nltk/data.py:579, in find(resource_name, paths) 578 resource_not_found = f"\n{sep}\n{msg}\n{sep}\n" --> 579 raise LookupError(resource_not_found) LookupError: ********************************************************************** Resource punkt_tab not found. Please use the NLTK Downloader to obtain the resource: >>> import nltk >>> nltk.download('punkt_tab') For more information see: https://www.nltk.org/data.html Attempted to load tokenizers/punkt_tab/english/ Searched in: - '/home/marie/nltk_data' - '/home/marie/parvus/prog/mint/python/.venv/nltk_data' - '/home/marie/parvus/prog/mint/python/.venv/share/nltk_data' - '/home/marie/parvus/prog/mint/python/.venv/lib/nltk_data' - '/usr/share/nltk_data' - '/usr/local/share/nltk_data' - '/usr/lib/nltk_data' - '/usr/local/lib/nltk_data' ********************************************************************** The above exception was the direct cause of the following exception: MissingCorpusError Traceback (most recent call last) Cell In[11], line 1 ----> 1 print(TextBlob("starring").words[0].lemmatize("v")) 2 print(TextBlob("stabbed").words[0].lemmatize("v")) 3 print(TextBlob("howled").words[0].lemmatize("v")) File ~/parvus/prog/mint/python/.venv/lib/python3.13/site-packages/textblob/decorators.py:23, in cached_property.__get__(self, obj, cls) 21 if obj is None: 22 return self ---> 23 value = 
obj.__dict__[self.func.__name__] = self.func(obj) 24 return value File ~/parvus/prog/mint/python/.venv/lib/python3.13/site-packages/textblob/blob.py:625, in TextBlob.words(self) 617 @cached_property 618 def words(self): 619 """Return a list of word tokens. This excludes punctuation characters. 620 If you want to include punctuation characters, access the ``tokens`` 621 property. 622 623 :returns: A :class:`WordList <WordList>` of word tokens. 624 """ --> 625 return WordList(word_tokenize(self.raw, include_punc=False)) File ~/parvus/prog/mint/python/.venv/lib/python3.13/site-packages/textblob/tokenizers.py:77, in word_tokenize(text, include_punc, *args, **kwargs) 69 def word_tokenize(text, include_punc=True, *args, **kwargs): 70 """Convenience function for tokenizing text into words. 71 72 NOTE: NLTK's word tokenizer expects sentences as input, so the text will be 73 tokenized to sentences before being tokenized to words. 74 """ 75 words = chain.from_iterable( 76 _word_tokenizer.itokenize(sentence, include_punc, *args, **kwargs) ---> 77 for sentence in sent_tokenize(text) 78 ) 79 return words File ~/parvus/prog/mint/python/.venv/lib/python3.13/site-packages/textblob/base.py:68, in BaseTokenizer.itokenize(self, text, *args, **kwargs) 61 def itokenize(self, text, *args, **kwargs): 62 """Return a generator that generates tokens "on-demand". 63 64 .. versionadded:: 0.6.0 65 66 :rtype: generator 67 """ ---> 68 return (t for t in self.tokenize(text, *args, **kwargs)) File ~/parvus/prog/mint/python/.venv/lib/python3.13/site-packages/textblob/decorators.py:37, in requires_nltk_corpus.<locals>.decorated(*args, **kwargs) 35 return func(*args, **kwargs) 36 except LookupError as error: ---> 37 raise MissingCorpusError() from error MissingCorpusError: Looks like you are missing some required data for this feature. 
To download the necessary data, simply run python -m textblob.download_corpora or use the NLTK downloader to download the missing data: http://nltk.org/data.html If this doesn't fix the problem, file an issue at https://github.com/sloria/TextBlob/issues.
Adjectives
To lemmatize adjectives, you need to pass "a" (for adjectives) to the lemmatize method:
print(TextBlob("youngest").words[0].lemmatize("a"))--------------------------------------------------------------------------- LookupError Traceback (most recent call last) File ~/parvus/prog/mint/python/.venv/lib/python3.13/site-packages/textblob/decorators.py:35, in requires_nltk_corpus.<locals>.decorated(*args, **kwargs) 34 try: ---> 35 return func(*args, **kwargs) 36 except LookupError as error: File ~/parvus/prog/mint/python/.venv/lib/python3.13/site-packages/textblob/tokenizers.py:60, in SentenceTokenizer.tokenize(self, text) 59 """Return a list of sentences.""" ---> 60 return nltk.tokenize.sent_tokenize(text) File ~/parvus/prog/mint/python/.venv/lib/python3.13/site-packages/nltk/tokenize/__init__.py:119, in sent_tokenize(text, language) 110 """ 111 Return a sentence-tokenized copy of *text*, 112 using NLTK's recommended sentence tokenizer (...) 117 :param language: the model name in the Punkt corpus 118 """ --> 119 tokenizer = _get_punkt_tokenizer(language) 120 return tokenizer.tokenize(text) File ~/parvus/prog/mint/python/.venv/lib/python3.13/site-packages/nltk/tokenize/__init__.py:105, in _get_punkt_tokenizer(language) 98 """ 99 A constructor for the PunktTokenizer that utilizes 100 a lru cache for performance. (...) 
103 :type language: str 104 """ --> 105 return PunktTokenizer(language) File ~/parvus/prog/mint/python/.venv/lib/python3.13/site-packages/nltk/tokenize/punkt.py:1744, in PunktTokenizer.__init__(self, lang) 1743 PunktSentenceTokenizer.__init__(self) -> 1744 self.load_lang(lang) File ~/parvus/prog/mint/python/.venv/lib/python3.13/site-packages/nltk/tokenize/punkt.py:1749, in PunktTokenizer.load_lang(self, lang) 1747 from nltk.data import find -> 1749 lang_dir = find(f"tokenizers/punkt_tab/{lang}/") 1750 self._params = load_punkt_params(lang_dir) File ~/parvus/prog/mint/python/.venv/lib/python3.13/site-packages/nltk/data.py:579, in find(resource_name, paths) 578 resource_not_found = f"\n{sep}\n{msg}\n{sep}\n" --> 579 raise LookupError(resource_not_found) LookupError: ********************************************************************** Resource punkt_tab not found. Please use the NLTK Downloader to obtain the resource: >>> import nltk >>> nltk.download('punkt_tab') For more information see: https://www.nltk.org/data.html Attempted to load tokenizers/punkt_tab/english/ Searched in: - '/home/marie/nltk_data' - '/home/marie/parvus/prog/mint/python/.venv/nltk_data' - '/home/marie/parvus/prog/mint/python/.venv/share/nltk_data' - '/home/marie/parvus/prog/mint/python/.venv/lib/nltk_data' - '/usr/share/nltk_data' - '/usr/local/share/nltk_data' - '/usr/lib/nltk_data' - '/usr/local/lib/nltk_data' ********************************************************************** The above exception was the direct cause of the following exception: MissingCorpusError Traceback (most recent call last) Cell In[12], line 1 ----> 1 print(TextBlob("youngest").words[0].lemmatize("a")) File ~/parvus/prog/mint/python/.venv/lib/python3.13/site-packages/textblob/decorators.py:23, in cached_property.__get__(self, obj, cls) 21 if obj is None: 22 return self ---> 23 value = obj.__dict__[self.func.__name__] = self.func(obj) 24 return value File 
~/parvus/prog/mint/python/.venv/lib/python3.13/site-packages/textblob/blob.py:625, in TextBlob.words(self) 617 @cached_property 618 def words(self): 619 """Return a list of word tokens. This excludes punctuation characters. 620 If you want to include punctuation characters, access the ``tokens`` 621 property. 622 623 :returns: A :class:`WordList <WordList>` of word tokens. 624 """ --> 625 return WordList(word_tokenize(self.raw, include_punc=False)) File ~/parvus/prog/mint/python/.venv/lib/python3.13/site-packages/textblob/tokenizers.py:77, in word_tokenize(text, include_punc, *args, **kwargs) 69 def word_tokenize(text, include_punc=True, *args, **kwargs): 70 """Convenience function for tokenizing text into words. 71 72 NOTE: NLTK's word tokenizer expects sentences as input, so the text will be 73 tokenized to sentences before being tokenized to words. 74 """ 75 words = chain.from_iterable( 76 _word_tokenizer.itokenize(sentence, include_punc, *args, **kwargs) ---> 77 for sentence in sent_tokenize(text) 78 ) 79 return words File ~/parvus/prog/mint/python/.venv/lib/python3.13/site-packages/textblob/base.py:68, in BaseTokenizer.itokenize(self, text, *args, **kwargs) 61 def itokenize(self, text, *args, **kwargs): 62 """Return a generator that generates tokens "on-demand". 63 64 .. versionadded:: 0.6.0 65 66 :rtype: generator 67 """ ---> 68 return (t for t in self.tokenize(text, *args, **kwargs)) File ~/parvus/prog/mint/python/.venv/lib/python3.13/site-packages/textblob/decorators.py:37, in requires_nltk_corpus.<locals>.decorated(*args, **kwargs) 35 return func(*args, **kwargs) 36 except LookupError as error: ---> 37 raise MissingCorpusError() from error MissingCorpusError: Looks like you are missing some required data for this feature. 
To download the necessary data, simply run python -m textblob.download_corpora or use the NLTK downloader to download the missing data: http://nltk.org/data.html If this doesn't fix the problem, file an issue at https://github.com/sloria/TextBlob/issues.
Correction
The correct() method attempts to fix spelling mistakes:
print(TextBlob("Somethingg with speling mystakes").correct())
Something with spelling mistakes
There are limitations, however, since the method is based on a lexicon and isn’t aware of the relationships between words (and thus cannot correct grammatical errors):
print(TextBlob("Some thingg with speling mystake").correct())
Some things with spelling mistake
An even more obvious example:
print(TextBlob("He drink").correct())
He drink