# Load packages
import requests
import pymupdf
# Download the data
url = "https://funnyengwish.wordpress.com/wp-content/uploads/2017/05/pratchett_terry_wyrd_sisters_-_royallib_ru.pdf"
# Without a timeout, requests waits indefinitely on an unresponsive server
response = requests.get(url, timeout=30)
# Stop here on an HTTP error rather than handing an error page to the PDF parser
response.raise_for_status()
# Extract data from pdf
data = response.content
# The document is opened from bytes held in memory, so state its type explicitly
doc = pymupdf.Document(stream=data, filetype="pdf")
# Create text from first pdf page
page1 = doc[0].get_text()
Text processing
In this section, we will use the TextBlob package for part of speech tagging and basic tokenization.
TextBlob
TextBlob is the NLP package that we will use in this course for tagging, tokenization, normalization, and sentiment analysis.
We first need to load it in our session:
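from textblob import TextBlob
TextBlob relies on NLTK data (tokenizers and corpora) that is not installed with the package. If it is missing, the examples below fail with a MissingCorpusError; run python -m textblob.download_corpora once to download it.
Before we can use TextBlob on our text, we need to convert the page1 string into a TextBlob object: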
text = TextBlob(page1)
type(text)
textblob.blob.TextBlob
Part of speech tagging
Part-of-speech tagging assigns a part-of-speech (POS) tag to each word of a text.
You can do this simply by using the tags property on a TextBlob object: text.tags. Because there are a lot of words in the first pdf page, this would create a very long output.
The result is a list:
type(text.tags)--------------------------------------------------------------------------- LookupError Traceback (most recent call last) File ~/parvus/prog/mint/python/.venv/lib/python3.13/site-packages/textblob/decorators.py:35, in requires_nltk_corpus.<locals>.decorated(*args, **kwargs) 34 try: ---> 35 return func(*args, **kwargs) 36 except LookupError as error: File ~/parvus/prog/mint/python/.venv/lib/python3.13/site-packages/textblob/tokenizers.py:60, in SentenceTokenizer.tokenize(self, text) 59 """Return a list of sentences.""" ---> 60 return nltk.tokenize.sent_tokenize(text) File ~/parvus/prog/mint/python/.venv/lib/python3.13/site-packages/nltk/tokenize/__init__.py:119, in sent_tokenize(text, language) 110 """ 111 Return a sentence-tokenized copy of *text*, 112 using NLTK's recommended sentence tokenizer (...) 117 :param language: the model name in the Punkt corpus 118 """ --> 119 tokenizer = _get_punkt_tokenizer(language) 120 return tokenizer.tokenize(text) File ~/parvus/prog/mint/python/.venv/lib/python3.13/site-packages/nltk/tokenize/__init__.py:105, in _get_punkt_tokenizer(language) 98 """ 99 A constructor for the PunktTokenizer that utilizes 100 a lru cache for performance. (...) 
103 :type language: str 104 """ --> 105 return PunktTokenizer(language) File ~/parvus/prog/mint/python/.venv/lib/python3.13/site-packages/nltk/tokenize/punkt.py:1744, in PunktTokenizer.__init__(self, lang) 1743 PunktSentenceTokenizer.__init__(self) -> 1744 self.load_lang(lang) File ~/parvus/prog/mint/python/.venv/lib/python3.13/site-packages/nltk/tokenize/punkt.py:1749, in PunktTokenizer.load_lang(self, lang) 1747 from nltk.data import find -> 1749 lang_dir = find(f"tokenizers/punkt_tab/{lang}/") 1750 self._params = load_punkt_params(lang_dir) File ~/parvus/prog/mint/python/.venv/lib/python3.13/site-packages/nltk/data.py:579, in find(resource_name, paths) 578 resource_not_found = f"\n{sep}\n{msg}\n{sep}\n" --> 579 raise LookupError(resource_not_found) LookupError: ********************************************************************** Resource punkt_tab not found. Please use the NLTK Downloader to obtain the resource: >>> import nltk >>> nltk.download('punkt_tab') For more information see: https://www.nltk.org/data.html Attempted to load tokenizers/punkt_tab/english/ Searched in: - '/home/marie/nltk_data' - '/home/marie/parvus/prog/mint/python/.venv/nltk_data' - '/home/marie/parvus/prog/mint/python/.venv/share/nltk_data' - '/home/marie/parvus/prog/mint/python/.venv/lib/nltk_data' - '/usr/share/nltk_data' - '/usr/local/share/nltk_data' - '/usr/lib/nltk_data' - '/usr/local/lib/nltk_data' ********************************************************************** The above exception was the direct cause of the following exception: MissingCorpusError Traceback (most recent call last) Cell In[4], line 1 ----> 1 type(text.tags) File ~/parvus/prog/mint/python/.venv/lib/python3.13/site-packages/textblob/decorators.py:23, in cached_property.__get__(self, obj, cls) 21 if obj is None: 22 return self ---> 23 value = obj.__dict__[self.func.__name__] = self.func(obj) 24 return value File ~/parvus/prog/mint/python/.venv/lib/python3.13/site-packages/textblob/blob.py:503, in 
BaseBlob.pos_tags(self) 484 """Returns an list of tuples of the form (word, POS tag). 485 486 Example: (...) 498 :rtype: list of tuples 499 """ 500 if isinstance(self, TextBlob): 501 return [ 502 val --> 503 for sublist in [s.pos_tags for s in self.sentences] 504 for val in sublist 505 ] 506 else: 507 return [ 508 (Word(str(word), pos_tag=t), str(t)) 509 for word, t in self.pos_tagger.tag(self) 510 if not PUNCTUATION_REGEX.match(str(t)) 511 ] File ~/parvus/prog/mint/python/.venv/lib/python3.13/site-packages/textblob/decorators.py:23, in cached_property.__get__(self, obj, cls) 21 if obj is None: 22 return self ---> 23 value = obj.__dict__[self.func.__name__] = self.func(obj) 24 return value File ~/parvus/prog/mint/python/.venv/lib/python3.13/site-packages/textblob/blob.py:615, in TextBlob.sentences(self) 612 @cached_property 613 def sentences(self): 614 """Return list of :class:`Sentence <Sentence>` objects.""" --> 615 return self._create_sentence_objects() File ~/parvus/prog/mint/python/.venv/lib/python3.13/site-packages/textblob/blob.py:658, in TextBlob._create_sentence_objects(self) 656 """Returns a list of Sentence objects from the raw text.""" 657 sentence_objects = [] --> 658 sentences = sent_tokenize(self.raw) 659 char_index = 0 # Keeps track of character index within the blob 660 for sent in sentences: 661 # Compute the start and end indices of the sentence 662 # within the blob File ~/parvus/prog/mint/python/.venv/lib/python3.13/site-packages/textblob/base.py:68, in BaseTokenizer.itokenize(self, text, *args, **kwargs) 61 def itokenize(self, text, *args, **kwargs): 62 """Return a generator that generates tokens "on-demand". 63 64 .. 
versionadded:: 0.6.0 65 66 :rtype: generator 67 """ ---> 68 return (t for t in self.tokenize(text, *args, **kwargs)) File ~/parvus/prog/mint/python/.venv/lib/python3.13/site-packages/textblob/decorators.py:37, in requires_nltk_corpus.<locals>.decorated(*args, **kwargs) 35 return func(*args, **kwargs) 36 except LookupError as error: ---> 37 raise MissingCorpusError() from error MissingCorpusError: Looks like you are missing some required data for this feature. To download the necessary data, simply run python -m textblob.download_corpora or use the NLTK downloader to download the missing data: http://nltk.org/data.html If this doesn't fix the problem, file an issue at https://github.com/sloria/TextBlob/issues.
And each element of the list is a tuple:
type(text.tags[0])--------------------------------------------------------------------------- LookupError Traceback (most recent call last) File ~/parvus/prog/mint/python/.venv/lib/python3.13/site-packages/textblob/decorators.py:35, in requires_nltk_corpus.<locals>.decorated(*args, **kwargs) 34 try: ---> 35 return func(*args, **kwargs) 36 except LookupError as error: File ~/parvus/prog/mint/python/.venv/lib/python3.13/site-packages/textblob/tokenizers.py:60, in SentenceTokenizer.tokenize(self, text) 59 """Return a list of sentences.""" ---> 60 return nltk.tokenize.sent_tokenize(text) File ~/parvus/prog/mint/python/.venv/lib/python3.13/site-packages/nltk/tokenize/__init__.py:119, in sent_tokenize(text, language) 110 """ 111 Return a sentence-tokenized copy of *text*, 112 using NLTK's recommended sentence tokenizer (...) 117 :param language: the model name in the Punkt corpus 118 """ --> 119 tokenizer = _get_punkt_tokenizer(language) 120 return tokenizer.tokenize(text) File ~/parvus/prog/mint/python/.venv/lib/python3.13/site-packages/nltk/tokenize/__init__.py:105, in _get_punkt_tokenizer(language) 98 """ 99 A constructor for the PunktTokenizer that utilizes 100 a lru cache for performance. (...) 
103 :type language: str 104 """ --> 105 return PunktTokenizer(language) File ~/parvus/prog/mint/python/.venv/lib/python3.13/site-packages/nltk/tokenize/punkt.py:1744, in PunktTokenizer.__init__(self, lang) 1743 PunktSentenceTokenizer.__init__(self) -> 1744 self.load_lang(lang) File ~/parvus/prog/mint/python/.venv/lib/python3.13/site-packages/nltk/tokenize/punkt.py:1749, in PunktTokenizer.load_lang(self, lang) 1747 from nltk.data import find -> 1749 lang_dir = find(f"tokenizers/punkt_tab/{lang}/") 1750 self._params = load_punkt_params(lang_dir) File ~/parvus/prog/mint/python/.venv/lib/python3.13/site-packages/nltk/data.py:579, in find(resource_name, paths) 578 resource_not_found = f"\n{sep}\n{msg}\n{sep}\n" --> 579 raise LookupError(resource_not_found) LookupError: ********************************************************************** Resource punkt_tab not found. Please use the NLTK Downloader to obtain the resource: >>> import nltk >>> nltk.download('punkt_tab') For more information see: https://www.nltk.org/data.html Attempted to load tokenizers/punkt_tab/english/ Searched in: - '/home/marie/nltk_data' - '/home/marie/parvus/prog/mint/python/.venv/nltk_data' - '/home/marie/parvus/prog/mint/python/.venv/share/nltk_data' - '/home/marie/parvus/prog/mint/python/.venv/lib/nltk_data' - '/usr/share/nltk_data' - '/usr/local/share/nltk_data' - '/usr/lib/nltk_data' - '/usr/local/lib/nltk_data' ********************************************************************** The above exception was the direct cause of the following exception: MissingCorpusError Traceback (most recent call last) Cell In[5], line 1 ----> 1 type(text.tags[0]) File ~/parvus/prog/mint/python/.venv/lib/python3.13/site-packages/textblob/decorators.py:23, in cached_property.__get__(self, obj, cls) 21 if obj is None: 22 return self ---> 23 value = obj.__dict__[self.func.__name__] = self.func(obj) 24 return value File ~/parvus/prog/mint/python/.venv/lib/python3.13/site-packages/textblob/blob.py:503, in 
BaseBlob.pos_tags(self) 484 """Returns an list of tuples of the form (word, POS tag). 485 486 Example: (...) 498 :rtype: list of tuples 499 """ 500 if isinstance(self, TextBlob): 501 return [ 502 val --> 503 for sublist in [s.pos_tags for s in self.sentences] 504 for val in sublist 505 ] 506 else: 507 return [ 508 (Word(str(word), pos_tag=t), str(t)) 509 for word, t in self.pos_tagger.tag(self) 510 if not PUNCTUATION_REGEX.match(str(t)) 511 ] File ~/parvus/prog/mint/python/.venv/lib/python3.13/site-packages/textblob/decorators.py:23, in cached_property.__get__(self, obj, cls) 21 if obj is None: 22 return self ---> 23 value = obj.__dict__[self.func.__name__] = self.func(obj) 24 return value File ~/parvus/prog/mint/python/.venv/lib/python3.13/site-packages/textblob/blob.py:615, in TextBlob.sentences(self) 612 @cached_property 613 def sentences(self): 614 """Return list of :class:`Sentence <Sentence>` objects.""" --> 615 return self._create_sentence_objects() File ~/parvus/prog/mint/python/.venv/lib/python3.13/site-packages/textblob/blob.py:658, in TextBlob._create_sentence_objects(self) 656 """Returns a list of Sentence objects from the raw text.""" 657 sentence_objects = [] --> 658 sentences = sent_tokenize(self.raw) 659 char_index = 0 # Keeps track of character index within the blob 660 for sent in sentences: 661 # Compute the start and end indices of the sentence 662 # within the blob File ~/parvus/prog/mint/python/.venv/lib/python3.13/site-packages/textblob/base.py:68, in BaseTokenizer.itokenize(self, text, *args, **kwargs) 61 def itokenize(self, text, *args, **kwargs): 62 """Return a generator that generates tokens "on-demand". 63 64 .. 
versionadded:: 0.6.0 65 66 :rtype: generator 67 """ ---> 68 return (t for t in self.tokenize(text, *args, **kwargs)) File ~/parvus/prog/mint/python/.venv/lib/python3.13/site-packages/textblob/decorators.py:37, in requires_nltk_corpus.<locals>.decorated(*args, **kwargs) 35 return func(*args, **kwargs) 36 except LookupError as error: ---> 37 raise MissingCorpusError() from error MissingCorpusError: Looks like you are missing some required data for this feature. To download the necessary data, simply run python -m textblob.download_corpora or use the NLTK downloader to download the missing data: http://nltk.org/data.html If this doesn't fix the problem, file an issue at https://github.com/sloria/TextBlob/issues.
We don’t have to print the full list. Let’s only print the first 20 tuples:
text.tags[:20]--------------------------------------------------------------------------- LookupError Traceback (most recent call last) File ~/parvus/prog/mint/python/.venv/lib/python3.13/site-packages/textblob/decorators.py:35, in requires_nltk_corpus.<locals>.decorated(*args, **kwargs) 34 try: ---> 35 return func(*args, **kwargs) 36 except LookupError as error: File ~/parvus/prog/mint/python/.venv/lib/python3.13/site-packages/textblob/tokenizers.py:60, in SentenceTokenizer.tokenize(self, text) 59 """Return a list of sentences.""" ---> 60 return nltk.tokenize.sent_tokenize(text) File ~/parvus/prog/mint/python/.venv/lib/python3.13/site-packages/nltk/tokenize/__init__.py:119, in sent_tokenize(text, language) 110 """ 111 Return a sentence-tokenized copy of *text*, 112 using NLTK's recommended sentence tokenizer (...) 117 :param language: the model name in the Punkt corpus 118 """ --> 119 tokenizer = _get_punkt_tokenizer(language) 120 return tokenizer.tokenize(text) File ~/parvus/prog/mint/python/.venv/lib/python3.13/site-packages/nltk/tokenize/__init__.py:105, in _get_punkt_tokenizer(language) 98 """ 99 A constructor for the PunktTokenizer that utilizes 100 a lru cache for performance. (...) 
103 :type language: str 104 """ --> 105 return PunktTokenizer(language) File ~/parvus/prog/mint/python/.venv/lib/python3.13/site-packages/nltk/tokenize/punkt.py:1744, in PunktTokenizer.__init__(self, lang) 1743 PunktSentenceTokenizer.__init__(self) -> 1744 self.load_lang(lang) File ~/parvus/prog/mint/python/.venv/lib/python3.13/site-packages/nltk/tokenize/punkt.py:1749, in PunktTokenizer.load_lang(self, lang) 1747 from nltk.data import find -> 1749 lang_dir = find(f"tokenizers/punkt_tab/{lang}/") 1750 self._params = load_punkt_params(lang_dir) File ~/parvus/prog/mint/python/.venv/lib/python3.13/site-packages/nltk/data.py:579, in find(resource_name, paths) 578 resource_not_found = f"\n{sep}\n{msg}\n{sep}\n" --> 579 raise LookupError(resource_not_found) LookupError: ********************************************************************** Resource punkt_tab not found. Please use the NLTK Downloader to obtain the resource: >>> import nltk >>> nltk.download('punkt_tab') For more information see: https://www.nltk.org/data.html Attempted to load tokenizers/punkt_tab/english/ Searched in: - '/home/marie/nltk_data' - '/home/marie/parvus/prog/mint/python/.venv/nltk_data' - '/home/marie/parvus/prog/mint/python/.venv/share/nltk_data' - '/home/marie/parvus/prog/mint/python/.venv/lib/nltk_data' - '/usr/share/nltk_data' - '/usr/local/share/nltk_data' - '/usr/lib/nltk_data' - '/usr/local/lib/nltk_data' ********************************************************************** The above exception was the direct cause of the following exception: MissingCorpusError Traceback (most recent call last) Cell In[6], line 1 ----> 1 text.tags[:20] File ~/parvus/prog/mint/python/.venv/lib/python3.13/site-packages/textblob/decorators.py:23, in cached_property.__get__(self, obj, cls) 21 if obj is None: 22 return self ---> 23 value = obj.__dict__[self.func.__name__] = self.func(obj) 24 return value File ~/parvus/prog/mint/python/.venv/lib/python3.13/site-packages/textblob/blob.py:503, in 
BaseBlob.pos_tags(self) 484 """Returns an list of tuples of the form (word, POS tag). 485 486 Example: (...) 498 :rtype: list of tuples 499 """ 500 if isinstance(self, TextBlob): 501 return [ 502 val --> 503 for sublist in [s.pos_tags for s in self.sentences] 504 for val in sublist 505 ] 506 else: 507 return [ 508 (Word(str(word), pos_tag=t), str(t)) 509 for word, t in self.pos_tagger.tag(self) 510 if not PUNCTUATION_REGEX.match(str(t)) 511 ] File ~/parvus/prog/mint/python/.venv/lib/python3.13/site-packages/textblob/decorators.py:23, in cached_property.__get__(self, obj, cls) 21 if obj is None: 22 return self ---> 23 value = obj.__dict__[self.func.__name__] = self.func(obj) 24 return value File ~/parvus/prog/mint/python/.venv/lib/python3.13/site-packages/textblob/blob.py:615, in TextBlob.sentences(self) 612 @cached_property 613 def sentences(self): 614 """Return list of :class:`Sentence <Sentence>` objects.""" --> 615 return self._create_sentence_objects() File ~/parvus/prog/mint/python/.venv/lib/python3.13/site-packages/textblob/blob.py:658, in TextBlob._create_sentence_objects(self) 656 """Returns a list of Sentence objects from the raw text.""" 657 sentence_objects = [] --> 658 sentences = sent_tokenize(self.raw) 659 char_index = 0 # Keeps track of character index within the blob 660 for sent in sentences: 661 # Compute the start and end indices of the sentence 662 # within the blob File ~/parvus/prog/mint/python/.venv/lib/python3.13/site-packages/textblob/base.py:68, in BaseTokenizer.itokenize(self, text, *args, **kwargs) 61 def itokenize(self, text, *args, **kwargs): 62 """Return a generator that generates tokens "on-demand". 63 64 .. 
versionadded:: 0.6.0 65 66 :rtype: generator 67 """ ---> 68 return (t for t in self.tokenize(text, *args, **kwargs)) File ~/parvus/prog/mint/python/.venv/lib/python3.13/site-packages/textblob/decorators.py:37, in requires_nltk_corpus.<locals>.decorated(*args, **kwargs) 35 return func(*args, **kwargs) 36 except LookupError as error: ---> 37 raise MissingCorpusError() from error MissingCorpusError: Looks like you are missing some required data for this feature. To download the necessary data, simply run python -m textblob.download_corpora or use the NLTK downloader to download the missing data: http://nltk.org/data.html If this doesn't fix the problem, file an issue at https://github.com/sloria/TextBlob/issues.
Noun phrase extraction
Noun phrases can be extracted with the noun_phrases property:
print(text.noun_phrases)--------------------------------------------------------------------------- LookupError Traceback (most recent call last) File ~/parvus/prog/mint/python/.venv/lib/python3.13/site-packages/nltk/corpus/util.py:84, in LazyCorpusLoader.__load(self) 83 try: ---> 84 root = nltk.data.find(f"{self.subdir}/{zip_name}") 85 except LookupError: File ~/parvus/prog/mint/python/.venv/lib/python3.13/site-packages/nltk/data.py:579, in find(resource_name, paths) 578 resource_not_found = f"\n{sep}\n{msg}\n{sep}\n" --> 579 raise LookupError(resource_not_found) LookupError: ********************************************************************** Resource brown not found. Please use the NLTK Downloader to obtain the resource: >>> import nltk >>> nltk.download('brown') For more information see: https://www.nltk.org/data.html Attempted to load corpora/brown.zip/brown/ Searched in: - '/home/marie/nltk_data' - '/home/marie/parvus/prog/mint/python/.venv/nltk_data' - '/home/marie/parvus/prog/mint/python/.venv/share/nltk_data' - '/home/marie/parvus/prog/mint/python/.venv/lib/nltk_data' - '/usr/share/nltk_data' - '/usr/local/share/nltk_data' - '/usr/lib/nltk_data' - '/usr/local/lib/nltk_data' ********************************************************************** During handling of the above exception, another exception occurred: LookupError Traceback (most recent call last) File ~/parvus/prog/mint/python/.venv/lib/python3.13/site-packages/textblob/decorators.py:35, in requires_nltk_corpus.<locals>.decorated(*args, **kwargs) 34 try: ---> 35 return func(*args, **kwargs) 36 except LookupError as error: File ~/parvus/prog/mint/python/.venv/lib/python3.13/site-packages/textblob/en/np_extractors.py:113, in FastNPExtractor.train(self) 111 @requires_nltk_corpus 112 def train(self): --> 113 train_data = nltk.corpus.brown.tagged_sents(categories="news") 114 regexp_tagger = nltk.RegexpTagger( 115 [ 116 (r"^-?[0-9]+(.[0-9]+)?$", "CD"), (...) 
128 ] 129 ) File ~/parvus/prog/mint/python/.venv/lib/python3.13/site-packages/nltk/corpus/util.py:120, in LazyCorpusLoader.__getattr__(self, attr) 118 raise AttributeError("LazyCorpusLoader object has no attribute '__bases__'") --> 120 self.__load() 121 # This looks circular, but its not, since __load() changes our 122 # __class__ to something new: File ~/parvus/prog/mint/python/.venv/lib/python3.13/site-packages/nltk/corpus/util.py:86, in LazyCorpusLoader.__load(self) 85 except LookupError: ---> 86 raise e 88 # Load the corpus. File ~/parvus/prog/mint/python/.venv/lib/python3.13/site-packages/nltk/corpus/util.py:81, in LazyCorpusLoader.__load(self) 80 try: ---> 81 root = nltk.data.find(f"{self.subdir}/{self.__name}") 82 except LookupError as e: File ~/parvus/prog/mint/python/.venv/lib/python3.13/site-packages/nltk/data.py:579, in find(resource_name, paths) 578 resource_not_found = f"\n{sep}\n{msg}\n{sep}\n" --> 579 raise LookupError(resource_not_found) LookupError: ********************************************************************** Resource brown not found. 
Please use the NLTK Downloader to obtain the resource: >>> import nltk >>> nltk.download('brown') For more information see: https://www.nltk.org/data.html Attempted to load corpora/brown Searched in: - '/home/marie/nltk_data' - '/home/marie/parvus/prog/mint/python/.venv/nltk_data' - '/home/marie/parvus/prog/mint/python/.venv/share/nltk_data' - '/home/marie/parvus/prog/mint/python/.venv/lib/nltk_data' - '/usr/share/nltk_data' - '/usr/local/share/nltk_data' - '/usr/lib/nltk_data' - '/usr/local/lib/nltk_data' ********************************************************************** The above exception was the direct cause of the following exception: MissingCorpusError Traceback (most recent call last) Cell In[7], line 1 ----> 1 print(text.noun_phrases) File ~/parvus/prog/mint/python/.venv/lib/python3.13/site-packages/textblob/decorators.py:23, in cached_property.__get__(self, obj, cls) 21 if obj is None: 22 return self ---> 23 value = obj.__dict__[self.func.__name__] = self.func(obj) 24 return value File ~/parvus/prog/mint/python/.venv/lib/python3.13/site-packages/textblob/blob.py:477, in BaseBlob.noun_phrases(self) 471 @cached_property 472 def noun_phrases(self): 473 """Returns a list of noun phrases for this blob.""" 474 return WordList( 475 [ 476 phrase.strip().lower() --> 477 for phrase in self.np_extractor.extract(self.raw) 478 if len(phrase) > 1 479 ] 480 ) File ~/parvus/prog/mint/python/.venv/lib/python3.13/site-packages/textblob/en/np_extractors.py:143, in FastNPExtractor.extract(self, sentence) 141 """Return a list of noun phrases (strings) for body of text.""" 142 if not self._trained: --> 143 self.train() 144 tokens = self._tokenize_sentence(sentence) 145 tagged = self.tagger.tag(tokens) File ~/parvus/prog/mint/python/.venv/lib/python3.13/site-packages/textblob/decorators.py:37, in requires_nltk_corpus.<locals>.decorated(*args, **kwargs) 35 return func(*args, **kwargs) 36 except LookupError as error: ---> 37 raise MissingCorpusError() from error 
MissingCorpusError: Looks like you are missing some required data for this feature. To download the necessary data, simply run python -m textblob.download_corpora or use the NLTK downloader to download the missing data: http://nltk.org/data.html If this doesn't fix the problem, file an issue at https://github.com/sloria/TextBlob/issues.
The output is a WordList object:
type(text.noun_phrases)--------------------------------------------------------------------------- LookupError Traceback (most recent call last) File ~/parvus/prog/mint/python/.venv/lib/python3.13/site-packages/nltk/corpus/util.py:84, in LazyCorpusLoader.__load(self) 83 try: ---> 84 root = nltk.data.find(f"{self.subdir}/{zip_name}") 85 except LookupError: File ~/parvus/prog/mint/python/.venv/lib/python3.13/site-packages/nltk/data.py:579, in find(resource_name, paths) 578 resource_not_found = f"\n{sep}\n{msg}\n{sep}\n" --> 579 raise LookupError(resource_not_found) LookupError: ********************************************************************** Resource brown not found. Please use the NLTK Downloader to obtain the resource: >>> import nltk >>> nltk.download('brown') For more information see: https://www.nltk.org/data.html Attempted to load corpora/brown.zip/brown/ Searched in: - '/home/marie/nltk_data' - '/home/marie/parvus/prog/mint/python/.venv/nltk_data' - '/home/marie/parvus/prog/mint/python/.venv/share/nltk_data' - '/home/marie/parvus/prog/mint/python/.venv/lib/nltk_data' - '/usr/share/nltk_data' - '/usr/local/share/nltk_data' - '/usr/lib/nltk_data' - '/usr/local/lib/nltk_data' ********************************************************************** During handling of the above exception, another exception occurred: LookupError Traceback (most recent call last) File ~/parvus/prog/mint/python/.venv/lib/python3.13/site-packages/textblob/decorators.py:35, in requires_nltk_corpus.<locals>.decorated(*args, **kwargs) 34 try: ---> 35 return func(*args, **kwargs) 36 except LookupError as error: File ~/parvus/prog/mint/python/.venv/lib/python3.13/site-packages/textblob/en/np_extractors.py:113, in FastNPExtractor.train(self) 111 @requires_nltk_corpus 112 def train(self): --> 113 train_data = nltk.corpus.brown.tagged_sents(categories="news") 114 regexp_tagger = nltk.RegexpTagger( 115 [ 116 (r"^-?[0-9]+(.[0-9]+)?$", "CD"), (...) 
128 ] 129 ) File ~/parvus/prog/mint/python/.venv/lib/python3.13/site-packages/nltk/corpus/util.py:120, in LazyCorpusLoader.__getattr__(self, attr) 118 raise AttributeError("LazyCorpusLoader object has no attribute '__bases__'") --> 120 self.__load() 121 # This looks circular, but its not, since __load() changes our 122 # __class__ to something new: File ~/parvus/prog/mint/python/.venv/lib/python3.13/site-packages/nltk/corpus/util.py:86, in LazyCorpusLoader.__load(self) 85 except LookupError: ---> 86 raise e 88 # Load the corpus. File ~/parvus/prog/mint/python/.venv/lib/python3.13/site-packages/nltk/corpus/util.py:81, in LazyCorpusLoader.__load(self) 80 try: ---> 81 root = nltk.data.find(f"{self.subdir}/{self.__name}") 82 except LookupError as e: File ~/parvus/prog/mint/python/.venv/lib/python3.13/site-packages/nltk/data.py:579, in find(resource_name, paths) 578 resource_not_found = f"\n{sep}\n{msg}\n{sep}\n" --> 579 raise LookupError(resource_not_found) LookupError: ********************************************************************** Resource brown not found. 
Please use the NLTK Downloader to obtain the resource: >>> import nltk >>> nltk.download('brown') For more information see: https://www.nltk.org/data.html Attempted to load corpora/brown Searched in: - '/home/marie/nltk_data' - '/home/marie/parvus/prog/mint/python/.venv/nltk_data' - '/home/marie/parvus/prog/mint/python/.venv/share/nltk_data' - '/home/marie/parvus/prog/mint/python/.venv/lib/nltk_data' - '/usr/share/nltk_data' - '/usr/local/share/nltk_data' - '/usr/lib/nltk_data' - '/usr/local/lib/nltk_data' ********************************************************************** The above exception was the direct cause of the following exception: MissingCorpusError Traceback (most recent call last) Cell In[8], line 1 ----> 1 type(text.noun_phrases) File ~/parvus/prog/mint/python/.venv/lib/python3.13/site-packages/textblob/decorators.py:23, in cached_property.__get__(self, obj, cls) 21 if obj is None: 22 return self ---> 23 value = obj.__dict__[self.func.__name__] = self.func(obj) 24 return value File ~/parvus/prog/mint/python/.venv/lib/python3.13/site-packages/textblob/blob.py:477, in BaseBlob.noun_phrases(self) 471 @cached_property 472 def noun_phrases(self): 473 """Returns a list of noun phrases for this blob.""" 474 return WordList( 475 [ 476 phrase.strip().lower() --> 477 for phrase in self.np_extractor.extract(self.raw) 478 if len(phrase) > 1 479 ] 480 ) File ~/parvus/prog/mint/python/.venv/lib/python3.13/site-packages/textblob/en/np_extractors.py:143, in FastNPExtractor.extract(self, sentence) 141 """Return a list of noun phrases (strings) for body of text.""" 142 if not self._trained: --> 143 self.train() 144 tokens = self._tokenize_sentence(sentence) 145 tagged = self.tagger.tag(tokens) File ~/parvus/prog/mint/python/.venv/lib/python3.13/site-packages/textblob/decorators.py:37, in requires_nltk_corpus.<locals>.decorated(*args, **kwargs) 35 return func(*args, **kwargs) 36 except LookupError as error: ---> 37 raise MissingCorpusError() from error 
MissingCorpusError: Looks like you are missing some required data for this feature. To download the necessary data, simply run python -m textblob.download_corpora or use the NLTK downloader to download the missing data: http://nltk.org/data.html If this doesn't fix the problem, file an issue at https://github.com/sloria/TextBlob/issues.
Tokenization
Words
TextBlob makes it easy to extract words with the words property:
print(text.words)--------------------------------------------------------------------------- LookupError Traceback (most recent call last) File ~/parvus/prog/mint/python/.venv/lib/python3.13/site-packages/textblob/decorators.py:35, in requires_nltk_corpus.<locals>.decorated(*args, **kwargs) 34 try: ---> 35 return func(*args, **kwargs) 36 except LookupError as error: File ~/parvus/prog/mint/python/.venv/lib/python3.13/site-packages/textblob/tokenizers.py:60, in SentenceTokenizer.tokenize(self, text) 59 """Return a list of sentences.""" ---> 60 return nltk.tokenize.sent_tokenize(text) File ~/parvus/prog/mint/python/.venv/lib/python3.13/site-packages/nltk/tokenize/__init__.py:119, in sent_tokenize(text, language) 110 """ 111 Return a sentence-tokenized copy of *text*, 112 using NLTK's recommended sentence tokenizer (...) 117 :param language: the model name in the Punkt corpus 118 """ --> 119 tokenizer = _get_punkt_tokenizer(language) 120 return tokenizer.tokenize(text) File ~/parvus/prog/mint/python/.venv/lib/python3.13/site-packages/nltk/tokenize/__init__.py:105, in _get_punkt_tokenizer(language) 98 """ 99 A constructor for the PunktTokenizer that utilizes 100 a lru cache for performance. (...) 
103 :type language: str 104 """ --> 105 return PunktTokenizer(language) File ~/parvus/prog/mint/python/.venv/lib/python3.13/site-packages/nltk/tokenize/punkt.py:1744, in PunktTokenizer.__init__(self, lang) 1743 PunktSentenceTokenizer.__init__(self) -> 1744 self.load_lang(lang) File ~/parvus/prog/mint/python/.venv/lib/python3.13/site-packages/nltk/tokenize/punkt.py:1749, in PunktTokenizer.load_lang(self, lang) 1747 from nltk.data import find -> 1749 lang_dir = find(f"tokenizers/punkt_tab/{lang}/") 1750 self._params = load_punkt_params(lang_dir) File ~/parvus/prog/mint/python/.venv/lib/python3.13/site-packages/nltk/data.py:579, in find(resource_name, paths) 578 resource_not_found = f"\n{sep}\n{msg}\n{sep}\n" --> 579 raise LookupError(resource_not_found) LookupError: ********************************************************************** Resource punkt_tab not found. Please use the NLTK Downloader to obtain the resource: >>> import nltk >>> nltk.download('punkt_tab') For more information see: https://www.nltk.org/data.html Attempted to load tokenizers/punkt_tab/english/ Searched in: - '/home/marie/nltk_data' - '/home/marie/parvus/prog/mint/python/.venv/nltk_data' - '/home/marie/parvus/prog/mint/python/.venv/share/nltk_data' - '/home/marie/parvus/prog/mint/python/.venv/lib/nltk_data' - '/usr/share/nltk_data' - '/usr/local/share/nltk_data' - '/usr/lib/nltk_data' - '/usr/local/lib/nltk_data' ********************************************************************** The above exception was the direct cause of the following exception: MissingCorpusError Traceback (most recent call last) Cell In[9], line 1 ----> 1 print(text.words) File ~/parvus/prog/mint/python/.venv/lib/python3.13/site-packages/textblob/decorators.py:23, in cached_property.__get__(self, obj, cls) 21 if obj is None: 22 return self ---> 23 value = obj.__dict__[self.func.__name__] = self.func(obj) 24 return value File ~/parvus/prog/mint/python/.venv/lib/python3.13/site-packages/textblob/blob.py:625, in 
TextBlob.words(self) 617 @cached_property 618 def words(self): 619 """Return a list of word tokens. This excludes punctuation characters. 620 If you want to include punctuation characters, access the ``tokens`` 621 property. 622 623 :returns: A :class:`WordList <WordList>` of word tokens. 624 """ --> 625 return WordList(word_tokenize(self.raw, include_punc=False)) File ~/parvus/prog/mint/python/.venv/lib/python3.13/site-packages/textblob/tokenizers.py:77, in word_tokenize(text, include_punc, *args, **kwargs) 69 def word_tokenize(text, include_punc=True, *args, **kwargs): 70 """Convenience function for tokenizing text into words. 71 72 NOTE: NLTK's word tokenizer expects sentences as input, so the text will be 73 tokenized to sentences before being tokenized to words. 74 """ 75 words = chain.from_iterable( 76 _word_tokenizer.itokenize(sentence, include_punc, *args, **kwargs) ---> 77 for sentence in sent_tokenize(text) 78 ) 79 return words File ~/parvus/prog/mint/python/.venv/lib/python3.13/site-packages/textblob/base.py:68, in BaseTokenizer.itokenize(self, text, *args, **kwargs) 61 def itokenize(self, text, *args, **kwargs): 62 """Return a generator that generates tokens "on-demand". 63 64 .. versionadded:: 0.6.0 65 66 :rtype: generator 67 """ ---> 68 return (t for t in self.tokenize(text, *args, **kwargs)) File ~/parvus/prog/mint/python/.venv/lib/python3.13/site-packages/textblob/decorators.py:37, in requires_nltk_corpus.<locals>.decorated(*args, **kwargs) 35 return func(*args, **kwargs) 36 except LookupError as error: ---> 37 raise MissingCorpusError() from error MissingCorpusError: Looks like you are missing some required data for this feature. To download the necessary data, simply run python -m textblob.download_corpora or use the NLTK downloader to download the missing data: http://nltk.org/data.html If this doesn't fix the problem, file an issue at https://github.com/sloria/TextBlob/issues.
Your turn:
How many words are there in the first pdf page of Wyrd Sisters?
Sentences
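Extracting sentences is just as easy with the sentences attribute. Like words, it relies on NLTK's sentence tokenizer data; if you get a MissingCorpusError, run python -m textblob.download_corpora once and try again.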
Let’s extract the first 10 sentences:
text.sentences[:10]--------------------------------------------------------------------------- LookupError Traceback (most recent call last) File ~/parvus/prog/mint/python/.venv/lib/python3.13/site-packages/textblob/decorators.py:35, in requires_nltk_corpus.<locals>.decorated(*args, **kwargs) 34 try: ---> 35 return func(*args, **kwargs) 36 except LookupError as error: File ~/parvus/prog/mint/python/.venv/lib/python3.13/site-packages/textblob/tokenizers.py:60, in SentenceTokenizer.tokenize(self, text) 59 """Return a list of sentences.""" ---> 60 return nltk.tokenize.sent_tokenize(text) File ~/parvus/prog/mint/python/.venv/lib/python3.13/site-packages/nltk/tokenize/__init__.py:119, in sent_tokenize(text, language) 110 """ 111 Return a sentence-tokenized copy of *text*, 112 using NLTK's recommended sentence tokenizer (...) 117 :param language: the model name in the Punkt corpus 118 """ --> 119 tokenizer = _get_punkt_tokenizer(language) 120 return tokenizer.tokenize(text) File ~/parvus/prog/mint/python/.venv/lib/python3.13/site-packages/nltk/tokenize/__init__.py:105, in _get_punkt_tokenizer(language) 98 """ 99 A constructor for the PunktTokenizer that utilizes 100 a lru cache for performance. (...) 
103 :type language: str 104 """ --> 105 return PunktTokenizer(language) File ~/parvus/prog/mint/python/.venv/lib/python3.13/site-packages/nltk/tokenize/punkt.py:1744, in PunktTokenizer.__init__(self, lang) 1743 PunktSentenceTokenizer.__init__(self) -> 1744 self.load_lang(lang) File ~/parvus/prog/mint/python/.venv/lib/python3.13/site-packages/nltk/tokenize/punkt.py:1749, in PunktTokenizer.load_lang(self, lang) 1747 from nltk.data import find -> 1749 lang_dir = find(f"tokenizers/punkt_tab/{lang}/") 1750 self._params = load_punkt_params(lang_dir) File ~/parvus/prog/mint/python/.venv/lib/python3.13/site-packages/nltk/data.py:579, in find(resource_name, paths) 578 resource_not_found = f"\n{sep}\n{msg}\n{sep}\n" --> 579 raise LookupError(resource_not_found) LookupError: ********************************************************************** Resource punkt_tab not found. Please use the NLTK Downloader to obtain the resource: >>> import nltk >>> nltk.download('punkt_tab') For more information see: https://www.nltk.org/data.html Attempted to load tokenizers/punkt_tab/english/ Searched in: - '/home/marie/nltk_data' - '/home/marie/parvus/prog/mint/python/.venv/nltk_data' - '/home/marie/parvus/prog/mint/python/.venv/share/nltk_data' - '/home/marie/parvus/prog/mint/python/.venv/lib/nltk_data' - '/usr/share/nltk_data' - '/usr/local/share/nltk_data' - '/usr/lib/nltk_data' - '/usr/local/lib/nltk_data' ********************************************************************** The above exception was the direct cause of the following exception: MissingCorpusError Traceback (most recent call last) Cell In[10], line 1 ----> 1 text.sentences[:10] File ~/parvus/prog/mint/python/.venv/lib/python3.13/site-packages/textblob/decorators.py:23, in cached_property.__get__(self, obj, cls) 21 if obj is None: 22 return self ---> 23 value = obj.__dict__[self.func.__name__] = self.func(obj) 24 return value File ~/parvus/prog/mint/python/.venv/lib/python3.13/site-packages/textblob/blob.py:615, in 
TextBlob.sentences(self) 612 @cached_property 613 def sentences(self): 614 """Return list of :class:`Sentence <Sentence>` objects.""" --> 615 return self._create_sentence_objects() File ~/parvus/prog/mint/python/.venv/lib/python3.13/site-packages/textblob/blob.py:658, in TextBlob._create_sentence_objects(self) 656 """Returns a list of Sentence objects from the raw text.""" 657 sentence_objects = [] --> 658 sentences = sent_tokenize(self.raw) 659 char_index = 0 # Keeps track of character index within the blob 660 for sent in sentences: 661 # Compute the start and end indices of the sentence 662 # within the blob File ~/parvus/prog/mint/python/.venv/lib/python3.13/site-packages/textblob/base.py:68, in BaseTokenizer.itokenize(self, text, *args, **kwargs) 61 def itokenize(self, text, *args, **kwargs): 62 """Return a generator that generates tokens "on-demand". 63 64 .. versionadded:: 0.6.0 65 66 :rtype: generator 67 """ ---> 68 return (t for t in self.tokenize(text, *args, **kwargs)) File ~/parvus/prog/mint/python/.venv/lib/python3.13/site-packages/textblob/decorators.py:37, in requires_nltk_corpus.<locals>.decorated(*args, **kwargs) 35 return func(*args, **kwargs) 36 except LookupError as error: ---> 37 raise MissingCorpusError() from error MissingCorpusError: Looks like you are missing some required data for this feature. To download the necessary data, simply run python -m textblob.download_corpora or use the NLTK downloader to download the missing data: http://nltk.org/data.html If this doesn't fix the problem, file an issue at https://github.com/sloria/TextBlob/issues.
The output is, however, quite ugly. We could make it a lot more readable by printing each sentence separated by a blank line:
for s in text.sentences[:10]:
print(s)
print("\n")--------------------------------------------------------------------------- LookupError Traceback (most recent call last) File ~/parvus/prog/mint/python/.venv/lib/python3.13/site-packages/textblob/decorators.py:35, in requires_nltk_corpus.<locals>.decorated(*args, **kwargs) 34 try: ---> 35 return func(*args, **kwargs) 36 except LookupError as error: File ~/parvus/prog/mint/python/.venv/lib/python3.13/site-packages/textblob/tokenizers.py:60, in SentenceTokenizer.tokenize(self, text) 59 """Return a list of sentences.""" ---> 60 return nltk.tokenize.sent_tokenize(text) File ~/parvus/prog/mint/python/.venv/lib/python3.13/site-packages/nltk/tokenize/__init__.py:119, in sent_tokenize(text, language) 110 """ 111 Return a sentence-tokenized copy of *text*, 112 using NLTK's recommended sentence tokenizer (...) 117 :param language: the model name in the Punkt corpus 118 """ --> 119 tokenizer = _get_punkt_tokenizer(language) 120 return tokenizer.tokenize(text) File ~/parvus/prog/mint/python/.venv/lib/python3.13/site-packages/nltk/tokenize/__init__.py:105, in _get_punkt_tokenizer(language) 98 """ 99 A constructor for the PunktTokenizer that utilizes 100 a lru cache for performance. (...) 
103 :type language: str 104 """ --> 105 return PunktTokenizer(language) File ~/parvus/prog/mint/python/.venv/lib/python3.13/site-packages/nltk/tokenize/punkt.py:1744, in PunktTokenizer.__init__(self, lang) 1743 PunktSentenceTokenizer.__init__(self) -> 1744 self.load_lang(lang) File ~/parvus/prog/mint/python/.venv/lib/python3.13/site-packages/nltk/tokenize/punkt.py:1749, in PunktTokenizer.load_lang(self, lang) 1747 from nltk.data import find -> 1749 lang_dir = find(f"tokenizers/punkt_tab/{lang}/") 1750 self._params = load_punkt_params(lang_dir) File ~/parvus/prog/mint/python/.venv/lib/python3.13/site-packages/nltk/data.py:579, in find(resource_name, paths) 578 resource_not_found = f"\n{sep}\n{msg}\n{sep}\n" --> 579 raise LookupError(resource_not_found) LookupError: ********************************************************************** Resource punkt_tab not found. Please use the NLTK Downloader to obtain the resource: >>> import nltk >>> nltk.download('punkt_tab') For more information see: https://www.nltk.org/data.html Attempted to load tokenizers/punkt_tab/english/ Searched in: - '/home/marie/nltk_data' - '/home/marie/parvus/prog/mint/python/.venv/nltk_data' - '/home/marie/parvus/prog/mint/python/.venv/share/nltk_data' - '/home/marie/parvus/prog/mint/python/.venv/lib/nltk_data' - '/usr/share/nltk_data' - '/usr/local/share/nltk_data' - '/usr/lib/nltk_data' - '/usr/local/lib/nltk_data' ********************************************************************** The above exception was the direct cause of the following exception: MissingCorpusError Traceback (most recent call last) Cell In[11], line 1 ----> 1 for s in text.sentences[:10]: 2 print(s) 3 print("\n") File ~/parvus/prog/mint/python/.venv/lib/python3.13/site-packages/textblob/decorators.py:23, in cached_property.__get__(self, obj, cls) 21 if obj is None: 22 return self ---> 23 value = obj.__dict__[self.func.__name__] = self.func(obj) 24 return value File 
~/parvus/prog/mint/python/.venv/lib/python3.13/site-packages/textblob/blob.py:615, in TextBlob.sentences(self) 612 @cached_property 613 def sentences(self): 614 """Return list of :class:`Sentence <Sentence>` objects.""" --> 615 return self._create_sentence_objects() File ~/parvus/prog/mint/python/.venv/lib/python3.13/site-packages/textblob/blob.py:658, in TextBlob._create_sentence_objects(self) 656 """Returns a list of Sentence objects from the raw text.""" 657 sentence_objects = [] --> 658 sentences = sent_tokenize(self.raw) 659 char_index = 0 # Keeps track of character index within the blob 660 for sent in sentences: 661 # Compute the start and end indices of the sentence 662 # within the blob File ~/parvus/prog/mint/python/.venv/lib/python3.13/site-packages/textblob/base.py:68, in BaseTokenizer.itokenize(self, text, *args, **kwargs) 61 def itokenize(self, text, *args, **kwargs): 62 """Return a generator that generates tokens "on-demand". 63 64 .. versionadded:: 0.6.0 65 66 :rtype: generator 67 """ ---> 68 return (t for t in self.tokenize(text, *args, **kwargs)) File ~/parvus/prog/mint/python/.venv/lib/python3.13/site-packages/textblob/decorators.py:37, in requires_nltk_corpus.<locals>.decorated(*args, **kwargs) 35 return func(*args, **kwargs) 36 except LookupError as error: ---> 37 raise MissingCorpusError() from error MissingCorpusError: Looks like you are missing some required data for this feature. To download the necessary data, simply run python -m textblob.download_corpora or use the NLTK downloader to download the missing data: http://nltk.org/data.html If this doesn't fix the problem, file an issue at https://github.com/sloria/TextBlob/issues.
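In Python strings (as in many other languages), "\n" represents a new line. Because print() already ends its output with a new line, print("\n") actually leaves two blank lines after each sentence; a bare print() would leave exactly one.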
Or you could add lines of hyphens between the sentences:
for s in text.sentences[:10]:
print(s)
print("-" * 100)--------------------------------------------------------------------------- LookupError Traceback (most recent call last) File ~/parvus/prog/mint/python/.venv/lib/python3.13/site-packages/textblob/decorators.py:35, in requires_nltk_corpus.<locals>.decorated(*args, **kwargs) 34 try: ---> 35 return func(*args, **kwargs) 36 except LookupError as error: File ~/parvus/prog/mint/python/.venv/lib/python3.13/site-packages/textblob/tokenizers.py:60, in SentenceTokenizer.tokenize(self, text) 59 """Return a list of sentences.""" ---> 60 return nltk.tokenize.sent_tokenize(text) File ~/parvus/prog/mint/python/.venv/lib/python3.13/site-packages/nltk/tokenize/__init__.py:119, in sent_tokenize(text, language) 110 """ 111 Return a sentence-tokenized copy of *text*, 112 using NLTK's recommended sentence tokenizer (...) 117 :param language: the model name in the Punkt corpus 118 """ --> 119 tokenizer = _get_punkt_tokenizer(language) 120 return tokenizer.tokenize(text) File ~/parvus/prog/mint/python/.venv/lib/python3.13/site-packages/nltk/tokenize/__init__.py:105, in _get_punkt_tokenizer(language) 98 """ 99 A constructor for the PunktTokenizer that utilizes 100 a lru cache for performance. (...) 
103 :type language: str 104 """ --> 105 return PunktTokenizer(language) File ~/parvus/prog/mint/python/.venv/lib/python3.13/site-packages/nltk/tokenize/punkt.py:1744, in PunktTokenizer.__init__(self, lang) 1743 PunktSentenceTokenizer.__init__(self) -> 1744 self.load_lang(lang) File ~/parvus/prog/mint/python/.venv/lib/python3.13/site-packages/nltk/tokenize/punkt.py:1749, in PunktTokenizer.load_lang(self, lang) 1747 from nltk.data import find -> 1749 lang_dir = find(f"tokenizers/punkt_tab/{lang}/") 1750 self._params = load_punkt_params(lang_dir) File ~/parvus/prog/mint/python/.venv/lib/python3.13/site-packages/nltk/data.py:579, in find(resource_name, paths) 578 resource_not_found = f"\n{sep}\n{msg}\n{sep}\n" --> 579 raise LookupError(resource_not_found) LookupError: ********************************************************************** Resource punkt_tab not found. Please use the NLTK Downloader to obtain the resource: >>> import nltk >>> nltk.download('punkt_tab') For more information see: https://www.nltk.org/data.html Attempted to load tokenizers/punkt_tab/english/ Searched in: - '/home/marie/nltk_data' - '/home/marie/parvus/prog/mint/python/.venv/nltk_data' - '/home/marie/parvus/prog/mint/python/.venv/share/nltk_data' - '/home/marie/parvus/prog/mint/python/.venv/lib/nltk_data' - '/usr/share/nltk_data' - '/usr/local/share/nltk_data' - '/usr/lib/nltk_data' - '/usr/local/lib/nltk_data' ********************************************************************** The above exception was the direct cause of the following exception: MissingCorpusError Traceback (most recent call last) Cell In[12], line 1 ----> 1 for s in text.sentences[:10]: 2 print(s) 3 print("-" * 100) File ~/parvus/prog/mint/python/.venv/lib/python3.13/site-packages/textblob/decorators.py:23, in cached_property.__get__(self, obj, cls) 21 if obj is None: 22 return self ---> 23 value = obj.__dict__[self.func.__name__] = self.func(obj) 24 return value File 
~/parvus/prog/mint/python/.venv/lib/python3.13/site-packages/textblob/blob.py:615, in TextBlob.sentences(self) 612 @cached_property 613 def sentences(self): 614 """Return list of :class:`Sentence <Sentence>` objects.""" --> 615 return self._create_sentence_objects() File ~/parvus/prog/mint/python/.venv/lib/python3.13/site-packages/textblob/blob.py:658, in TextBlob._create_sentence_objects(self) 656 """Returns a list of Sentence objects from the raw text.""" 657 sentence_objects = [] --> 658 sentences = sent_tokenize(self.raw) 659 char_index = 0 # Keeps track of character index within the blob 660 for sent in sentences: 661 # Compute the start and end indices of the sentence 662 # within the blob File ~/parvus/prog/mint/python/.venv/lib/python3.13/site-packages/textblob/base.py:68, in BaseTokenizer.itokenize(self, text, *args, **kwargs) 61 def itokenize(self, text, *args, **kwargs): 62 """Return a generator that generates tokens "on-demand". 63 64 .. versionadded:: 0.6.0 65 66 :rtype: generator 67 """ ---> 68 return (t for t in self.tokenize(text, *args, **kwargs)) File ~/parvus/prog/mint/python/.venv/lib/python3.13/site-packages/textblob/decorators.py:37, in requires_nltk_corpus.<locals>.decorated(*args, **kwargs) 35 return func(*args, **kwargs) 36 except LookupError as error: ---> 37 raise MissingCorpusError() from error MissingCorpusError: Looks like you are missing some required data for this feature. To download the necessary data, simply run python -m textblob.download_corpora or use the NLTK downloader to download the missing data: http://nltk.org/data.html If this doesn't fix the problem, file an issue at https://github.com/sloria/TextBlob/issues.
Your turn:
- What is the type of text.sentences?
- Could you print just the 5th sentence?
- Just the last sentence?
Word counts
We already saw that we can extract words with the words attribute. Now, we can add the count method to get the frequency of specific words.
text.words.count("gods")--------------------------------------------------------------------------- LookupError Traceback (most recent call last) File ~/parvus/prog/mint/python/.venv/lib/python3.13/site-packages/textblob/decorators.py:35, in requires_nltk_corpus.<locals>.decorated(*args, **kwargs) 34 try: ---> 35 return func(*args, **kwargs) 36 except LookupError as error: File ~/parvus/prog/mint/python/.venv/lib/python3.13/site-packages/textblob/tokenizers.py:60, in SentenceTokenizer.tokenize(self, text) 59 """Return a list of sentences.""" ---> 60 return nltk.tokenize.sent_tokenize(text) File ~/parvus/prog/mint/python/.venv/lib/python3.13/site-packages/nltk/tokenize/__init__.py:119, in sent_tokenize(text, language) 110 """ 111 Return a sentence-tokenized copy of *text*, 112 using NLTK's recommended sentence tokenizer (...) 117 :param language: the model name in the Punkt corpus 118 """ --> 119 tokenizer = _get_punkt_tokenizer(language) 120 return tokenizer.tokenize(text) File ~/parvus/prog/mint/python/.venv/lib/python3.13/site-packages/nltk/tokenize/__init__.py:105, in _get_punkt_tokenizer(language) 98 """ 99 A constructor for the PunktTokenizer that utilizes 100 a lru cache for performance. (...) 
103 :type language: str 104 """ --> 105 return PunktTokenizer(language) File ~/parvus/prog/mint/python/.venv/lib/python3.13/site-packages/nltk/tokenize/punkt.py:1744, in PunktTokenizer.__init__(self, lang) 1743 PunktSentenceTokenizer.__init__(self) -> 1744 self.load_lang(lang) File ~/parvus/prog/mint/python/.venv/lib/python3.13/site-packages/nltk/tokenize/punkt.py:1749, in PunktTokenizer.load_lang(self, lang) 1747 from nltk.data import find -> 1749 lang_dir = find(f"tokenizers/punkt_tab/{lang}/") 1750 self._params = load_punkt_params(lang_dir) File ~/parvus/prog/mint/python/.venv/lib/python3.13/site-packages/nltk/data.py:579, in find(resource_name, paths) 578 resource_not_found = f"\n{sep}\n{msg}\n{sep}\n" --> 579 raise LookupError(resource_not_found) LookupError: ********************************************************************** Resource punkt_tab not found. Please use the NLTK Downloader to obtain the resource: >>> import nltk >>> nltk.download('punkt_tab') For more information see: https://www.nltk.org/data.html Attempted to load tokenizers/punkt_tab/english/ Searched in: - '/home/marie/nltk_data' - '/home/marie/parvus/prog/mint/python/.venv/nltk_data' - '/home/marie/parvus/prog/mint/python/.venv/share/nltk_data' - '/home/marie/parvus/prog/mint/python/.venv/lib/nltk_data' - '/usr/share/nltk_data' - '/usr/local/share/nltk_data' - '/usr/lib/nltk_data' - '/usr/local/lib/nltk_data' ********************************************************************** The above exception was the direct cause of the following exception: MissingCorpusError Traceback (most recent call last) Cell In[13], line 1 ----> 1 text.words.count("gods") File ~/parvus/prog/mint/python/.venv/lib/python3.13/site-packages/textblob/decorators.py:23, in cached_property.__get__(self, obj, cls) 21 if obj is None: 22 return self ---> 23 value = obj.__dict__[self.func.__name__] = self.func(obj) 24 return value File ~/parvus/prog/mint/python/.venv/lib/python3.13/site-packages/textblob/blob.py:625, in 
TextBlob.words(self) 617 @cached_property 618 def words(self): 619 """Return a list of word tokens. This excludes punctuation characters. 620 If you want to include punctuation characters, access the ``tokens`` 621 property. 622 623 :returns: A :class:`WordList <WordList>` of word tokens. 624 """ --> 625 return WordList(word_tokenize(self.raw, include_punc=False)) File ~/parvus/prog/mint/python/.venv/lib/python3.13/site-packages/textblob/tokenizers.py:77, in word_tokenize(text, include_punc, *args, **kwargs) 69 def word_tokenize(text, include_punc=True, *args, **kwargs): 70 """Convenience function for tokenizing text into words. 71 72 NOTE: NLTK's word tokenizer expects sentences as input, so the text will be 73 tokenized to sentences before being tokenized to words. 74 """ 75 words = chain.from_iterable( 76 _word_tokenizer.itokenize(sentence, include_punc, *args, **kwargs) ---> 77 for sentence in sent_tokenize(text) 78 ) 79 return words File ~/parvus/prog/mint/python/.venv/lib/python3.13/site-packages/textblob/base.py:68, in BaseTokenizer.itokenize(self, text, *args, **kwargs) 61 def itokenize(self, text, *args, **kwargs): 62 """Return a generator that generates tokens "on-demand". 63 64 .. versionadded:: 0.6.0 65 66 :rtype: generator 67 """ ---> 68 return (t for t in self.tokenize(text, *args, **kwargs)) File ~/parvus/prog/mint/python/.venv/lib/python3.13/site-packages/textblob/decorators.py:37, in requires_nltk_corpus.<locals>.decorated(*args, **kwargs) 35 return func(*args, **kwargs) 36 except LookupError as error: ---> 37 raise MissingCorpusError() from error MissingCorpusError: Looks like you are missing some required data for this feature. To download the necessary data, simply run python -m textblob.download_corpora or use the NLTK downloader to download the missing data: http://nltk.org/data.html If this doesn't fix the problem, file an issue at https://github.com/sloria/TextBlob/issues.