Wednesday, October 14, 2015

Machine Learning - SGD with scikit-learn

**************Work in Progress********** I will update this post with comments and the training file to use.

import numpy as np
import ast
import copy
from numpy import *
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.multiclass import OneVsRestClassifier
from sklearn import preprocessing
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import confusion_matrix


# Scratch lists that nothing in this script reads; kept so the module-level
# names remain available.
temp_list = []
another_temp_list = []

# Comma-separated train file: each usable row is "<text>,<label>".
with open('train.csv', 'r') as fp:
    lines = fp.readlines()

# Ignore rows whose stripped content is at most one character, then keep only
# rows that split into exactly two comma-separated fields.
candidate_rows = [raw.split(',') for raw in lines if len(raw.strip()) > 1]
two_field_rows = [row for row in candidate_rows if len(row) == 2]

x_temp_list = [row[0].strip() for row in two_field_rows]
y_temp_list = [row[1].strip() for row in two_field_rows]

# Training inputs and their text labels, as parallel NumPy arrays.
X_train = np.array(x_temp_list)
y_train_text = np.array(y_temp_list)

#X_test = np.array(['Cashier Colestru Value Foods'])


# Disabled alternative label encoding, kept for reference:
#   mlb = preprocessing.LabelBinarizer()
#   Y = mlb.fit_transform(y_train_text)

# Turn the label strings into token counts and then into TF-IDF weights.
# NOTE(review): ``Y`` is not passed to the pipeline fit further down, which
# receives ``y_train_text`` directly.
count_vect = CountVectorizer()
tfidf_transformer = TfidfTransformer()

y_train_counts = count_vect.fit_transform(y_train_text)
Y = tfidf_transformer.fit_transform(y_train_counts)

# Linear SVM (hinge loss, L2 penalty) trained with SGD for a fixed 50 epochs.
#
# The ``n_iter`` argument was deprecated in scikit-learn 0.19 and removed in
# 0.21, where passing it raises TypeError. ``max_iter=50`` together with
# ``tol=None`` (no early stopping) requests the same fixed number of passes.
# Releases older than 0.19 do not know ``max_iter`` and reject it with a
# TypeError, so fall back to the legacy spelling there.
try:
    sgd_clf = SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3,
                            max_iter=50, tol=None)
except TypeError:
    sgd_clf = SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3,
                            n_iter=50)

# Text -> token counts -> TF-IDF weights -> linear classifier.
classifier = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', sgd_clf),
])

# Fit on the raw text samples and their text labels.
classifier.fit(X_train, y_train_text)


# Predictions are made on the same samples the classifier was fitted on, so
# the matrix printed below describes the fit to the training data.
print("Predticing and creating confusion matrix")

y_pred = classifier.predict(X_train)

# Computed from the true labels and the predictions, in that argument order.
training_confusion = confusion_matrix(y_train_text, y_pred, labels=None)
print(training_confusion)

No comments:

Post a Comment