I am doing news classification on the TagMyNews dataset, which has the structure shown below:
court agrees to expedite n.f.l.'s appeal the decision means a ruling could be made nearly two months before the regular season begins, time for the sides to work out a deal without delaying the season. sport
From this I extracted the headline, brief description and category, and created an array like this:
import pandas
import glob
# Names of the seven news categories in the dataset.
# NOTE(review): a category's position here presumably doubles as its integer
# label ('flag') — confirm against the code that builds training_data.
category_list = ["sport", "world", "us", "business", "health", "entertainment", "sci_tech"]

# One glob pattern per category, in the same order as category_list.
directory_list = [
    "data/sport/*.txt",
    "data/world/*.txt",
    "data/us/*.txt",
    "data/business/*.txt",
    "data/health/*.txt",
    "data/entertainment/*.txt",
    "data/sci_tech/*.txt",
]

# text_files[i] is the list of paths matched by directory_list[i]
# (an empty list when the pattern matches nothing).
text_files = [glob.glob(pattern) for pattern in directory_list]
#print(text_files)
text_files =
'data': "court agrees to expedite n.f.l.'s appeal the decision means a ruling could be made nearly two months before the regular season begins, time for the sides to work out a deal without delaying the season.", 'flag': 0
Then I save them in a CSV file as shown below:
training_data = pandas.DataFrame(training_data, columns=['data', 'flag'])
training_data.to_csv("train_data.csv", sep=',', encoding='utf-8')
Now I want to classify the data into seven categories, which I have done successfully, but I can't plot the ROC curve for each class with this data:
import pickle
from itertools import cycle

import matplotlib.pyplot as plt
import numpy as np
# NOTE(review): `scipy.interp` is not available in every SciPy release;
# if this import fails, `numpy.interp` is the likely replacement — confirm.
from scipy import interp
from sklearn import svm, datasets
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import label_binarize

print(__doc__)

# Raw documents and their integer class labels (0-6).
X = np.array(training_data["data"])
y = np.array(training_data["flag"])

# Binarize the output: one indicator column per class, so that column i can
# later be compared against the classifier's score for class i.
y = label_binarize(y, classes=[0, 1, 2, 3, 4, 5, 6])
n_classes = y.shape[1]

# GET VECTOR COUNT
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(training_data["data"])
#print(count_vect.get_feature_names())

# SAVE WORD VECTOR (context manager so the file handle is closed and flushed)
with open("count_vector.pkl", "wb") as vocab_file:
    pickle.dump(count_vect.vocabulary_, vocab_file)

# TRANSFORM WORD VECTOR TO TF-IDF
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
#print(X_train_tfidf)

# SAVE TF-IDF
with open("tfidf.pkl", "wb") as tfidf_file:
    pickle.dump(tfidf_transformer, tfidf_file)

# Split on the *binarized* labels `y`, not on `training_data.flag`.
# Splitting the raw pandas Series makes `y_test` a 1-D Series, and the
# `y_test[:, i]` lookup below then fails with
# "ValueError: Can only tuple-index with a MultiIndex".
X_train, X_test, y_train, y_test = train_test_split(
    X_train_tfidf, y, test_size=0.20, random_state=42
)

# Learn to predict each class against the others.
classifier = OneVsRestClassifier(
    svm.SVC(kernel='linear', probability=True, random_state=42)
)
y_score = classifier.fit(X_train, y_train).decision_function(X_test)

# Compute ROC curve and ROC area for each class, keyed by class index.
fpr = dict()
tpr = dict()
roc_auc = dict()
for i in range(n_classes):
    fpr[i], tpr[i], _ = roc_curve(y_test[:, i], y_score[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])

# Compute micro-average ROC curve and ROC area: flatten every
# (sample, class) pair into a single binary problem.
fpr["micro"], tpr["micro"], _ = roc_curve(y_test.ravel(), y_score.ravel())
roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])
Automatically created module for IPython interactive environment
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
in
55 roc_auc = dict()
56 for i in range(n_classes):
---> 57 fpr[i], tpr[i], _ = roc_curve(y_test[:, i], y_score[:, i])
58 roc_auc[i] = auc(fpr[i], tpr[i])
59
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\series.py in __getitem__(self, key)
909 key = check_bool_indexer(self.index, key)
910
--> 911 return self._get_with(key)
912
913 def _get_with(self, key):
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\series.py in _get_with(self, key)
921 elif isinstance(key, tuple):
922 try:
--> 923 return self._get_values_tuple(key)
924 except Exception:
925 if len(key) == 1:
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\series.py in _get_values_tuple(self, key)
966
967 if not isinstance(self.index, MultiIndex):
--> 968 raise ValueError('Can only tuple-index with a MultiIndex')
969
970 # If key is contained, would have returned by now
ValueError: Can only tuple-index with a MultiIndex