import numpy as np
import pandas as pd
import pickle
from pathlib import Path
import seaborn as sns
import matplotlib.pyplot as plt

sns.set_style("whitegrid")
Text Classification (Reuters-21578)
Prepare Dataset
Download
from ds_tut import download_from_url

archive_name = "reuters21578.tar.gz"
training_data_url = "http://www.daviddlewis.com/resources/testcollections/reuters21578/{}".format(archive_name)
data_root = Path.cwd() / "data" / "tmp"
data_root.mkdir(parents=True, exist_ok=True)
training_data_path = data_root / archive_name
reuters_dir = data_root / archive_name.split(".")[0]
data_size = download_from_url(training_data_url, training_data_path)
Unpack
import tarfile

with tarfile.open(str(training_data_path)) as tar:
    tar.extractall(path=str(reuters_dir))
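Note that extractall trusts whatever is in the archive. On Python 3.12 or newer (an assumption about your interpreter; older versions lack the argument) you can pass an extraction filter to reject absolute paths and path traversal:

with tarfile.open(str(training_data_path)) as tar:
    # "data" rejects absolute paths, links pointing outside the target
    # directory, and other suspicious members (Python 3.12+ only).
    tar.extractall(path=str(reuters_dir), filter="data")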
Parse
from ds_tut.datasets import ReutersParser, ReutersCorpus

documents = []
rp = ReutersParser()
for sgml_path in reuters_dir.glob("*.sgm"):
    for doc in rp.parse_sgml(str(sgml_path)):
        doc["filename"] = sgml_path
        documents.append(doc)

pickle_path = reuters_dir / "documents.pkl"
with open(str(pickle_path), "wb") as f:
    pickle.dump(documents, f)

reuters = ReutersCorpus(documents)

pickle_path = reuters_dir / "corpus.pkl"
with open(str(pickle_path), "wb") as f:
    pickle.dump(reuters, f)
Explore Reuters-21578
Load dataset
from ds_tut.datasets import ReutersCorpus

data_root = Path.cwd() / "data" / "tmp"
reuters_documents_path = data_root / "reuters21578" / "documents.pkl"
reuters_corpus_path = data_root / "reuters21578" / "corpus.pkl"

documents = pickle.load(open(reuters_documents_path, "rb"))
reuters = pickle.load(open(reuters_corpus_path, "rb"))
df = reuters.build_dataframe(pd=pd)
train, test, top_ten_ids, train_labels, test_labels = reuters.split_modapte()
Get some simple stats
number_of_samples = reuters.number_of_samples
number_of_classes = reuters.number_of_classes
number_of_samples_per_class = int(np.average([tc for tc in reuters.topic_counts.values() if tc > 1]))
number_of_words_per_sample = int(np.median([len(d["text"].split()) for d in reuters.docs]))
samples_to_words_per_sample_ratio = int(number_of_samples / number_of_words_per_sample)

nchars = 52
print("Number of samples:".ljust(nchars), number_of_samples)
print("Number of classes:".ljust(nchars), number_of_classes)
print("Number of samples per class:".ljust(nchars), number_of_samples_per_class)
print("Number of words per sample:".ljust(nchars), number_of_words_per_sample)
print("Number of samples/number of words per sample ratio:".ljust(nchars), samples_to_words_per_sample_ratio)
Number of samples:                                   10789
Number of classes:                                   119
Number of samples per class:                         148
Number of words per sample:                          89
Number of samples/number of words per sample ratio:  121
Distribution of sample length for reuters21578
fig, ax = plt.subplots(figsize=(15, 10))
sns.histplot([len(d["text"]) for d in reuters.docs], kde=True, ax=ax)
ax.set_title('Sample length distribution')
ax.set_xlabel('Length of a sample')
ax.set_xlim(0, 8000)
_ = ax.set_ylabel('Number of samples')
Word frequency distribution
from sklearn.feature_extraction.text import CountVectorizer

kwargs = {
    'ngram_range': (1, 1),
    'dtype': 'int32',
    'strip_accents': 'unicode',
    'decode_error': 'replace',
    'analyzer': 'word',  # Split text into word tokens.
}
vectorizer = CountVectorizer(**kwargs)
vectorized_texts = vectorizer.fit_transform(reuters.texts)
all_ngrams = list(vectorizer.get_feature_names_out())
all_counts = vectorized_texts.sum(axis=0).tolist()[0]
all_counts, all_ngrams = zip(*[(c, n) for c, n in sorted(
    zip(all_counts, all_ngrams), reverse=True)])
num_ngrams = 50
ngrams = list(all_ngrams)[:num_ngrams]
counts = list(all_counts)[:num_ngrams]
idx = np.arange(num_ngrams)
fig, ax = plt.subplots(figsize=(15, 10))
sns.barplot(x=ngrams, y=counts, ax=ax, color="#95BCD9")
plt.xlabel('N-grams')
_ = plt.ylabel('Frequencies')
_ = plt.title('Frequency distribution of n-grams')
_ = plt.xticks(idx, ngrams, rotation=45)
Choose a model flowchart
from IPython.display import display, HTML

url = "https://developers.google.com/machine-learning/guides/text-classification/images/TextClassificationFlowchart.png"
html_code = f"""
<div>
    <p>Number of samples/number of words per sample ratio: {samples_to_words_per_sample_ratio}</p>
    <img src="{url}" style="width: 100%">
</div>
"""
display(HTML(html_code))
Number of samples/number of words per sample ratio: 121

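The flowchart's first branch is driven by exactly this number-of-samples to words-per-sample (S/W) ratio: per the Google text classification guide, ratios below roughly 1500 favor bag-of-n-grams models (tf-idf plus a simple classifier or MLP) over sequence models. A minimal sketch of that decision rule (the helper name is hypothetical; the 1500 threshold comes from the guide, not from this notebook):

def suggest_model_family(num_samples, words_per_sample, threshold=1500):
    # Heuristic from the Google text classification guide: below the
    # threshold, n-gram models tend to win; above it, sequence models
    # (e.g. sepCNN) are recommended.
    ratio = num_samples / words_per_sample
    family = "n-gram model" if ratio < threshold else "sequence model"
    return ratio, family

ratio, family = suggest_model_family(10789, 89)
print(f"S/W = {ratio:.0f} -> {family}")  # S/W = 121 -> n-gram model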
Simple Linear Model
OK, pretty impressive image. How about trying just a very simple linear model?
Get text and labels
from sklearn.preprocessing import MultiLabelBinarizer

train_docs, test_docs = reuters.split_modapte()
print(len(train_docs), len(test_docs))

train = [d["text"] for d in train_docs]
test = [d["text"] for d in test_docs]

train_labels = reuters.get_labels(train_docs)
test_labels = reuters.get_labels(test_docs)

# Fit the binarizer on the training labels only, so that train and test
# share the same label columns.
mlb = MultiLabelBinarizer()
y_train = mlb.fit_transform(train_labels)
y_test = mlb.transform(test_labels)
7770 3019
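To make the binarization concrete, here is what MultiLabelBinarizer does to a few made-up label sets (toy data, not from the corpus):

toy_labels = [["earn"], ["acq", "earn"], ["grain"]]
mlb_demo = MultiLabelBinarizer()
print(mlb_demo.fit_transform(toy_labels))
# [[0 1 0]
#  [1 1 0]
#  [0 0 1]]
print(mlb_demo.classes_)  # ['acq' 'earn' 'grain']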
Vectorize Texts
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()
vectorizer.fit(train)
X_train = vectorizer.transform(train)
X_test = vectorizer.transform(test)
Evaluate Models
from sklearn.metrics import classification_report
from sklearn.multiclass import OneVsRestClassifier
top_ten_ids, top_ten_names = reuters.top_n(n=10)
Logistic Regression
This is usually one of the first models to try. Simple, robust, fast, elegant. One of the best baseline methods.
from sklearn.linear_model import LogisticRegression

# model = OneVsRestClassifier(LogisticRegression(C=100, solver="liblinear", multi_class="ovr"))
model = OneVsRestClassifier(LogisticRegression(solver="liblinear", multi_class="ovr"))
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred, target_names=top_ten_names, labels=top_ten_ids, zero_division=0.0))
              precision    recall  f1-score   support

        earn       0.99      0.97      0.98      1087
         acq       0.98      0.92      0.95       719
    money-fx       0.78      0.51      0.62       179
       grain       0.99      0.60      0.75       149
       crude       0.96      0.57      0.72       189
       trade       0.93      0.54      0.68       117
    interest       0.91      0.47      0.62       131
        ship       1.00      0.13      0.24        89
       wheat       0.97      0.51      0.67        71
        corn       0.95      0.32      0.48        56

   micro avg       0.97      0.79      0.87      2787
   macro avg       0.95      0.56      0.67      2787
weighted avg       0.97      0.79      0.85      2787
 samples avg       0.70      0.69      0.69      2787
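The commented-out C=100 above hints that the regularization strength is worth tuning. A minimal sketch with GridSearchCV (the grid values are illustrative, not a recommendation from this notebook):

from sklearn.model_selection import GridSearchCV

# "estimator__C" reaches through the OneVsRestClassifier wrapper to the
# inner LogisticRegression.
param_grid = {"estimator__C": [0.1, 1, 10, 100]}
search = GridSearchCV(
    OneVsRestClassifier(LogisticRegression(solver="liblinear", multi_class="ovr")),
    param_grid=param_grid,
    scoring="f1_micro",
    cv=3,
)
search.fit(X_train, y_train)
print(search.best_params_, search.best_score_)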
Linear Support Vector Machine
from sklearn.svm import LinearSVC

model = OneVsRestClassifier(LinearSVC(dual=True))
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred, target_names=top_ten_names, labels=top_ten_ids, digits=3, zero_division=0.0))
              precision    recall  f1-score   support

        earn      0.991     0.980     0.985      1087
         acq      0.984     0.950     0.967       719
    money-fx      0.810     0.788     0.799       179
       grain      0.975     0.799     0.878       149
       crude      0.906     0.868     0.886       189
       trade      0.830     0.709     0.765       117
    interest      0.870     0.664     0.753       131
        ship      0.924     0.685     0.787        89
       wheat      0.929     0.732     0.819        71
        corn      0.955     0.750     0.840        56

   micro avg      0.956     0.896     0.925      2787
   macro avg      0.917     0.793     0.848      2787
weighted avg      0.954     0.896     0.922      2787
 samples avg      0.771     0.769     0.767      2787
Precision/Recall Curve
One way to visualize the performance of a classifier.
df_train = df.query("modapte == 'train'")
df_test = df.query("modapte == 'test'")

mlb = MultiLabelBinarizer()
y_train = mlb.fit_transform(df_train.label)
y_test = mlb.transform(df_test.label)
print(df_train.shape, df_test.shape)

cache_dir = reuters_dir / "cache"
(7770, 9) (3019, 9)
from sklearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion
from sklearn.preprocessing import StandardScaler
from sklearn.base import BaseEstimator, TransformerMixin

class EmptyFitMixin:
    def fit(self, x, y=None):
        return self

class TextStats(BaseEstimator, EmptyFitMixin, TransformerMixin):
    """Extract features from each document"""

    def transform(self, col):
        tc = col.str
        features = [
            tc.len(),  # character count
            tc.count(r"\n"),  # line count
            tc.count(r"\."),  # sentence count
            tc.split().apply(lambda x: len(x) if x is not None else 0),  # word count
        ]
        features = np.concatenate([f.values.reshape(-1, 1) for f in features], axis=1)
        where_are_NaNs = np.isnan(features)
        features[where_are_NaNs] = 0
        return features.astype(np.float64)
class TextFromPandasColumns(EmptyFitMixin, BaseEstimator, TransformerMixin):
    """Extract the text from a list of columns in a single pass.

    Takes a pandas dataframe and produces a series of texts
    from joined columns defined in `text_cols`.
    """
    text_cols = ["title", "body"]

    def transform(self, df):
        def join(items, axis=None):
            return " ".join([str(item) for item in items])

        data = df[self.text_cols].apply(lambda x: "" if x.iloc[0] is None else x, axis=1)
        texts = data.apply(join, axis=1)
        return texts
class ColumnSelector(EmptyFitMixin, BaseEstimator, TransformerMixin):
    def __init__(self, column, filter_none=True):
        self.column = column
        self.filter_none = filter_none

    def transform(self, df):
        col = df[self.column]
        if self.filter_none:
            col = col.apply(lambda x: "" if x is None else x)
        return col
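A quick look at what these transformers produce, on a made-up two-column frame (illustrative only):

toy = pd.DataFrame({
    "title": ["Oil prices rise"],
    "body": ["Crude oil rose.\nMarkets reacted."],
})
print(ColumnSelector("title").transform(toy))  # the raw title column
print(TextStats().transform(ColumnSelector("body").transform(toy)))  # [[chars, lines, periods, words]]
print(TextFromPandasColumns().transform(toy))  # title and body joined into one text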
pipeline = Pipeline(
    memory=str(cache_dir),
    steps=[
        ("union", FeatureUnion(n_jobs=1, transformer_list=[
            ("title_stats", Pipeline([
                ("column", ColumnSelector("title")),
                ("stats", TextStats()),
                ("scaled", StandardScaler()),
            ])),
            ("body_stats", Pipeline([
                ("column", ColumnSelector("body")),
                ("stats", TextStats()),
                ("scaled", StandardScaler()),
            ])),
            ("combined_text", Pipeline([
                ("column", TextFromPandasColumns()),
                ("tfidf", TfidfVectorizer()),
            ])),
        ])),
        ("clf", OneVsRestClassifier(LinearSVC(C=1, dual=True, max_iter=20000))),
    ])

pipeline.fit(df_train, y_train)
y_pred = pipeline.predict(df_test)
print(classification_report(y_test, y_pred, target_names=top_ten_names, labels=top_ten_ids, digits=3, zero_division=0.0))

y_score = pipeline.decision_function(df_test)
from sklearn.preprocessing import label_binarize

y_test_bin = label_binarize(y_test, classes=list(range(y_score.shape[1])))
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import average_precision_score

# For each class
precision, recall, average_precision = {}, {}, {}
for i in range(y_score.shape[1]):
    precision[i], recall[i], _ = precision_recall_curve(y_test_bin[:, i], y_score[:, i])
    average_precision[i] = average_precision_score(y_test_bin[:, i], y_score[:, i])

# A "micro-average": quantifying score on all classes jointly
precision["micro"], recall["micro"], _ = precision_recall_curve(y_test_bin.ravel(), y_score.ravel())
average_precision["micro"] = average_precision_score(y_test_bin, y_score, average="micro")
print('Average precision score, micro-averaged over all classes: {0:0.2f}'
      .format(average_precision["micro"]))
# precision recall breakeven point
prbp = None
for num, (p, r) in enumerate(zip(precision["micro"], recall["micro"])):
    if p == r:
        print(num, p, r)
        prbp = p
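Exact equality of precision and recall may never occur on a real curve, so the loop above can leave prbp as None. A more robust variant (an illustrative alternative, not from the original notebook) takes the point where the two values are closest:

diffs = np.abs(precision["micro"] - recall["micro"])
idx_bep = int(np.argmin(diffs))  # index where |precision - recall| is smallest
prbp = precision["micro"][idx_bep]
print(f"Approximate precision/recall breakeven: {prbp:.3f}")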
plt.figure(figsize=(15, 10))
plt.step(recall['micro'], precision['micro'], color='b', alpha=0.2,
         where='post')
plt.fill_between(recall["micro"], precision["micro"], step='post', alpha=0.2,
                 color='b')

plt.xlabel('Recall')
plt.ylabel('Precision')
plt.ylim([0.0, 1.05])
plt.xlim([0.0, 1.0])
plt.title('Average precision score, micro-averaged over all classes: AP={0:0.2f}'
          .format(average_precision["micro"]))
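Since per-class curves were already computed above, it is a small step to overlay the ten largest topics (a sketch, assuming top_ten_ids index the binarized label columns the same way as in the classification reports):

plt.figure(figsize=(15, 10))
for class_id, name in zip(top_ten_ids, top_ten_names):
    plt.plot(recall[class_id], precision[class_id], lw=1,
             label="{} (AP={:0.2f})".format(name, average_precision[class_id]))
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.legend()
_ = plt.title('Per-class precision/recall curves, ten most frequent topics')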