Module ktrain.text.kw.core
Source code
import warnings
from collections import Counter
from ... import imports as I
from .. import textutils as TU
try:
    import textblob
    TEXTBLOB_INSTALLED = True
except ImportError:
    TEXTBLOB_INSTALLED = False
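
# Maps 2-character ISO 639-1 codes to the corresponding NLTK stopword language names.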
SUPPORTED_LANGS = {
    "en": "english",
    "ar": "arabic",
    "az": "azerbaijani",
    "da": "danish",
    "nl": "dutch",
    "fi": "finnish",
    "fr": "french",
    "de": "german",
    "el": "greek",
    "hu": "hungarian",
    "id": "indonesian",
    "it": "italian",
    "kk": "kazakh",
    "ne": "nepali",
    "no": "norwegian",
    "pt": "portuguese",
    "ro": "romanian",
    "ru": "russian",
    "sl": "slovene",
    "es": "spanish",
    "sv": "swedish",
    "tg": "tajik",
    "tr": "turkish",
    "zh": "chinese",
}

class KeywordExtractor:
    """
    Keyphrase Extraction
    """

    def __init__(
        self,
        lang="en",
        custom_stopwords=["et al", "et", "al", "n't", "did", "does", "lt", "gt", "br"],
    ):
        """
        ```
        Keyphrase Extraction
        Args:
          lang(str): 2-character ISO 639-1 language code
          custom_stopwords(list): list of custom stopwords to ignore
        ```
        """
        # error checks
        if not TEXTBLOB_INSTALLED:
            raise Exception(
                "The textblob package is required for keyphrase extraction: pip install textblob; python -m textblob.download_corpora"
            )
        if lang not in SUPPORTED_LANGS:
            raise ValueError(
                f'lang="{lang}" is not supported. Supported 2-character ISO 639-1 language codes are: {SUPPORTED_LANGS}'
            )
        self.lang = lang

        # build blacklist
        from nltk.corpus import stopwords as nltk_stopwords
        from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

        if lang == "en":
            stopwords = list(ENGLISH_STOP_WORDS) + custom_stopwords
        elif lang == "zh":
            stopwords = TU.chinese_stopwords() + custom_stopwords
        elif lang in SUPPORTED_LANGS:
            stopwords = nltk_stopwords.words(SUPPORTED_LANGS[lang])
        else:
            stopwords = []
        blacklist = stopwords + custom_stopwords
        self.blacklist = blacklist

    def extract_keywords(
        self,
        text,
        ngram_range=(1, 3),
        top_n=10,
        n_candidates=50,
        omit_scores=False,
        candidate_generator="ngrams",
        constrain_unigram_case=True,
        exclude_unigrams=False,
        maxlen=64,
        minchars=3,
        truncate_to=5000,
        score_by="freqpos",
    ):
        """
        ```
        simple keyword extraction
        This is a simplified TextBlob implementation of the KERA algorithm from:
        https://arxiv.org/pdf/1308.2359.pdf
        Args:
          text(str): the text as a unicode string
          ngram_range(tuple): the ngram range. Example: (1,3) considers unigrams, bigrams, and trigrams as candidates
          top_n(int): number of keyphrases to return
          n_candidates(int): number of candidates considered when ranking
          omit_scores(bool): If True, no scores are returned.
          candidate_generator(str): Either 'noun_phrases' or 'ngrams'.
                                    The default 'ngrams' method is faster.
          constrain_unigram_case(bool): Only applies if candidate_generator=='ngrams'.
                                        If True, only uppercase unigrams (e.g., LDA, SVM, NASA) are returned.
                                        True is recommended. Has no effect if exclude_unigrams=True.
          exclude_unigrams(bool): If True, unigrams are excluded from results.
                                  Convenience parameter that is functionally equivalent to raising the lower bound of ngram_range above 1.
          maxlen(int): maximum number of characters in a keyphrase (default: 64)
          minchars(int): minimum number of characters in a keyphrase (default: 3)
          truncate_to(int): Truncate input to this many words (default: 5000, i.e., first 5K words).
                            If None, no truncation is performed.
          score_by(str): one of:
                         'freqpos': average of frequency and position scores
                         'freq': frequency of occurrence
                         'pos': position of first occurrence
                         Default is 'freqpos'
        Returns:
          list
        ```
        """
        if candidate_generator not in ["noun_phrases", "ngrams"]:
            raise ValueError(
                'candidate_generator must be one of {"noun_phrases", "ngrams"}'
            )
        if self.lang == "zh":
            text = " ".join(I.jieba.cut(text, HMM=False))
        if candidate_generator == "noun_phrases" and self.lang != "en":
            warnings.warn(
                f'lang={self.lang} but candidate_generator="noun_phrases" is not supported. '
                + 'Falling back to candidate_generator="ngrams"'
            )
            candidate_generator = "ngrams"
text = " ".join(text.split()[:truncate_to]) if truncate_to is not None else text
blob = textblob.TextBlob(text)
candidates = []
min_n, max_n = ngram_range
ngram_lens = list(range(min_n, max_n + 1))
# generate ngrams or noun phrases
ngrams = {}
if candidate_generator == "ngrams":
for n in ngram_lens:
ngrams[n] = blob.ngrams(n=n)
else:
noun_phrases = blob.noun_phrases
for np in noun_phrases:
words = np.split()
n = len(words)
if n not in ngram_lens:
continue
if (
not exclude_unigrams
and n == 1
and text.count(" " + words[0].upper() + " ") > 1
):
words[0] = words[0].upper()
lst = ngrams.get(n, [])
lst.append(words)
ngrams[n] = lst
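
        # Candidate filtering rules applied below:
        #   - any n-gram containing a blacklisted (stop)word is dropped
        #   - unigrams are kept only when exclude_unigrams is False and, if
        #     constrain_unigram_case is True, only when fully uppercase (e.g., acronyms)
        #   - n-grams of length > 1 must contain more than one distinct word, and their
        #     first two words must each be longer than one character
        #   - every candidate must have an alphabetic character among its first three
        #     characters, at least `minchars` non-space characters (hyphens and periods
        #     not counted), start and end with an alphanumeric character, and contain
        #     no '@', '.', or apostrophe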
        # generate candidates
        for n in range(min_n, max_n + 1):
            if n == 1:
                grams = [
                    k[0].lower()
                    for k in ngrams.get(n, [])
                    if not any(w.lower() in self.blacklist for w in k)
                    and (
                        not constrain_unigram_case
                        and not exclude_unigrams
                        or (
                            constrain_unigram_case
                            and not exclude_unigrams
                            and k[0].isupper()
                        )
                        # or (
                        #     candidate_generator == "noun_phrases"
                        #     and constrain_unigram_case
                        #     and k[0].upper() in text
                        # )
                    )
                ]
            else:
                grams = [
                    " ".join(k).lower()
                    for k in ngrams.get(n, [])
                    if not any(w.lower() in self.blacklist for w in k)
                    and len(set(k)) != 1
                    and len(k[0]) > 1
                    and len(k[1]) > 1
                ]
            candidates.extend(
                [
                    kw
                    for kw in grams
                    if any([c.isalpha() for c in kw[:3]])
                    and len([w for w in kw if not w.isspace() and w not in ["-", "."]])
                    >= minchars
                    and kw[-1].isalnum()
                    and kw[0].isalnum()
                    and "@" not in kw
                    and "." not in kw
                    and "'" not in kw
                ]
            )
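
        # Ranking: candidates are counted and the `n_candidates` most frequent are kept;
        # single-word candidates are dropped unless their uppercase, space-surrounded form
        # occurs more than once in the text; frequencies are normalized to sum to 1.
        # For 'freqpos', each normalized frequency is averaged with a position score of
        # 1 - first_occurrence / num_chars; for 'pos', the position score is used alone.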
        cnt = Counter(candidates)
        tups = cnt.most_common(n_candidates)

        # normalize and return
        tups = [
            tup
            for tup in tups
            if len(tup[0].split()) > 1 or text.count(" " + tup[0].upper() + " ") > 1
        ]
        keywords = [tup[0] for tup in tups if len(tup[0]) <= maxlen]
        scores = [tup[1] for tup in tups if len(tup[0]) <= maxlen]
        scores = [float(i) / sum(scores) for i in scores]
        result = list(zip(keywords, scores))
        result = result[:top_n]
        if score_by in ["freqpos", "pos"]:
            text = text.lower()
            num_chars = len(text)
            result_final = []
            for r in result:
                first_see = text.find(r[0])
                first_see = num_chars - 1 if first_see < 0 else first_see
                pos_score = 1 - float(first_see) / num_chars
                score = pos_score if score_by == "pos" else (r[1] + pos_score) / 2
                result_final.append((r[0], score))
            result = result_final
        result.sort(key=lambda y: y[1], reverse=True)
        return [r[0] for r in result] if omit_scores else result
Classes
class KeywordExtractor (lang='en', custom_stopwords=['et al', 'et', 'al', "n't", 'did', 'does', 'lt', 'gt', 'br'])
-
Keyphrase Extraction
Args:
  lang(str): 2-character ISO 639-1 language code
  custom_stopwords(list): list of custom stopwords to ignore
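A minimal usage sketch (not part of the ktrain source): it assumes ktrain and textblob, along with the textblob corpora, are installed, and the custom stopwords shown are arbitrary illustrative values.

```python
# Illustrative sketch only: requires `pip install textblob` and
# `python -m textblob.download_corpora`, as noted in the constructor's error message.
from ktrain.text.kw.core import KeywordExtractor

# "figure" and "table" are arbitrary example stopwords, not defaults.
kwe = KeywordExtractor(lang="en", custom_stopwords=["figure", "table"])
```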
Methods
def extract_keywords(self, text, ngram_range=(1, 3), top_n=10, n_candidates=50, omit_scores=False, candidate_generator='ngrams', constrain_unigram_case=True, exclude_unigrams=False, maxlen=64, minchars=3, truncate_to=5000, score_by='freqpos')
-
simple keyword extraction

This is a simplified TextBlob implementation of the KERA algorithm from: <https://arxiv.org/pdf/1308.2359.pdf>

Args:
  text(str): the text as a unicode string
  ngram_range(tuple): the ngram range. Example: (1,3) considers unigrams, bigrams, and trigrams as candidates
  top_n(int): number of keyphrases to return
  n_candidates(int): number of candidates considered when ranking
  omit_scores(bool): If True, no scores are returned.
  candidate_generator(str): Either 'noun_phrases' or 'ngrams'. The default 'ngrams' method is faster.
  constrain_unigram_case(bool): Only applies if candidate_generator=='ngrams'. If True, only uppercase unigrams (e.g., LDA, SVM, NASA) are returned. True is recommended. Has no effect if exclude_unigrams=True.
  exclude_unigrams(bool): If True, unigrams are excluded from results. Convenience parameter that is functionally equivalent to raising the lower bound of ngram_range above 1.
  maxlen(int): maximum number of characters in a keyphrase (default: 64)
  minchars(int): minimum number of characters in a keyphrase (default: 3)
  truncate_to(int): Truncate input to this many words (default: 5000, i.e., first 5K words). If None, no truncation is performed.
  score_by(str): one of 'freqpos' (average of frequency and position scores), 'freq' (frequency of occurrence), or 'pos' (position of first occurrence). Default is 'freqpos'.

Returns:
  list
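A hedged usage sketch based on the signature above; the input text is invented, and the actual keyphrases returned depend entirely on the input.

```python
# Illustrative sketch only; the sample text is made up for demonstration.
from ktrain.text.kw.core import KeywordExtractor

kwe = KeywordExtractor(lang="en")
text = (
    "Latent Dirichlet Allocation (LDA) is a generative probabilistic model. "
    "LDA is widely used for topic modeling of large document collections."
)

# Returns a list of (keyphrase, score) tuples sorted by descending score;
# with omit_scores=True, only the keyphrase strings are returned.
results = kwe.extract_keywords(text, ngram_range=(1, 3), top_n=5, score_by="freqpos")
for keyphrase, score in results:
    print(f"{keyphrase}\t{score:.3f}")
```

With score_by='freqpos', each candidate's normalized frequency is averaged with a position score of 1 - (index of first occurrence / number of characters), so phrases that appear early and often rank highest.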