First, I had to install Virtual Box and figure out how that worked. Then I had to install Sage in the Virtual Box and figure out how that worked. Then I had to figure out how to run an SVM in Sage.

But I've done all that and I want to post the results and the code. Later on, I'll do a post on Virtual Box.

I used the data from Problem set #1 since it is a small set and easy to use. In order to use it in the SVM code, I had to do two things: first I had to combine the y data with the x data into one CSV file. Then I had to rearrange the data. The original data was sorted so that all the y=0 data comes first followed by the y=1 data. Since I wanted to use only 70% of the data for the analysis and 30% of the data to test accuracy, I had to resort the data so that the y=0 and y=1 data were interspersed evenly.

Below are the results and the graph. The first part of the solution is the output from the convex optimization done by the computer. The verbiage under "Optimal solution found" is my output from the analysis. I have three outputs because I used three different penalty constants: 0.1, 10, and 100.

pcost dcost gap pres dres 0: -7.0774e+00 -1.3421e+01 5e+02 2e+01 1e-14 1: -1.2084e+00 -1.2358e+01 3e+01 8e-01 1e-14 2: -5.3246e-01 -4.0847e+00 4e+00 3e-02 2e-15 3: -6.3668e-01 -1.2676e+00 7e-01 5e-03 9e-16 4: -7.7552e-01 -9.4394e-01 2e-01 1e-03 9e-16 5: -8.1966e-01 -8.4799e-01 3e-02 6e-05 8e-16 6: -8.2948e-01 -8.3229e-01 3e-03 5e-06 1e-15 7: -8.3049e-01 -8.3053e-01 4e-05 6e-08 1e-15 8: -8.3051e-01 -8.3051e-01 4e-07 6e-10 1e-15 Optimal solution found. The percent of support vectors for C=.1 (note: this is underfitting) is 17 percent and there are 12 support vectors. The weight matrix is [ 0.59580336 0.59525422] The model accuracy is 0.896551724138

pcost dcost gap pres dres 0: 1.0143e+02 -3.0514e+05 7e+05 5e-01 7e-13 1: 1.3516e+03 -5.6065e+04 9e+04 4e-02 6e-13 2: 9.5288e+02 -1.1776e+04 2e+04 7e-03 4e-13 3: 1.2074e+01 -3.0995e+03 3e+03 3e-15 3e-13 4: -2.3429e+02 -4.7539e+02 2e+02 2e-15 2e-13 5: -2.5664e+02 -4.1207e+02 2e+02 2e-15 3e-13 6: -3.2509e+02 -4.1261e+02 9e+01 8e-15 4e-13 7: -3.4242e+02 -3.4631e+02 4e+00 5e-15 3e-13 8: -3.4389e+02 -3.4393e+02 4e-02 3e-15 4e-13 9: -3.4390e+02 -3.4390e+02 4e-04 3e-15 5e-13 10: -3.4390e+02 -3.4390e+02 4e-06 4e-15 5e-13 Optimal solution found. The percent of support vectors for C=100 (note: this is overfitting) is 5 percent and there are 4 support vectors. The weight matrix is [ 2.02253604 1.20665256] The model accuracy is 0.862068965517

pcost dcost gap pres dres 0: -6.2666e+01 -3.9389e+03 1e+04 7e-01 9e-14 1: -2.0139e+01 -8.5498e+02 1e+03 5e-02 9e-14 2: -1.2994e+01 -1.5479e+02 2e+02 6e-03 3e-14 3: -2.2810e+01 -6.2679e+01 4e+01 1e-15 3e-14 4: -2.8171e+01 -4.6504e+01 2e+01 8e-16 2e-14 5: -3.3784e+01 -4.1343e+01 8e+00 1e-15 3e-14 6: -3.2401e+01 -3.9318e+01 7e+00 5e-16 2e-14 7: -3.4709e+01 -3.6954e+01 2e+00 1e-15 4e-14 8: -3.5487e+01 -3.6151e+01 7e-01 6e-16 4e-14 9: -3.5741e+01 -3.5894e+01 2e-01 1e-15 4e-14 10: -3.5787e+01 -3.5789e+01 2e-03 1e-15 4e-14 11: -3.5788e+01 -3.5788e+01 2e-05 2e-16 4e-14 Optimal solution found. The percent of support vectors for C=10 ) is 7 percent and there are 5 support vectors. The weight matrix is [ 0.95881111 0.84382082] The model accuracy is 0.896551724138

Here is the code:

import csv as csv

import numpy as np

csv_file_object=csv.reader(open(DATA+'reps1.csv'))

header=csv_file_object.next()

records=[]

for row in csv_file_object:records.append(row)

records=np.array(records)

data=records.astype(np.float)

data[:,0][data[:,0]==0]=-1

#Linear model

m=np.size(data[:,0])

testm=int(.7*m)

datatr=data[0:testm,:]

datatest=data[testm+1:m,:]

from numpy import linalg

import cvxopt

import cvxopt.solvers

trx=datatr[:,1:]

trw=datatr[:,0]

y=trw.reshape(testm,1)

n=np.size(trx,1)

K=np.zeros((testm,testm))

for i in range(testm):

for j in range(testm):

K[i,j]=np.dot(trx[i],trx[j])

P=cvxopt.matrix(np.outer(trw,trw)*K)

q=cvxopt.matrix(np.ones(testm)*-1)

b=cvxopt.matrix(0.0)

A=cvxopt.matrix(np.array(y),(1,testm))

G1=cvxopt.matrix(np.diag(np.ones(testm)*-1))

G2=cvxopt.matrix(np.eye(testm))

G=cvxopt.matrix(np.vstack((G1,G2)))

h1=cvxopt.matrix(np.zeros(testm))

h2=cvxopt.matrix(np.ones(testm)*10)

h=cvxopt.matrix(np.vstack((h1,h2)))

solution=cvxopt.solvers.qp(P,q,G,h,A,b)

a=np.ravel(solution['x'])

sv=a[a>.00001]

p=np.size(sv)*100/testm

print 'The percent of support vectors for C=10 ) is',p,"percent and there are ", np.size(sv), "support vectors."

v=np.append(datatr,a.reshape(testm,1),axis=1)

n=np.size(v,axis=1)

z=v[v[:,n-1]>.00001]

ls=np.size(z[:,0])

w=np.zeros(np.size(trx, axis=1))

for t in range(ls):

w=w+z[t,n-1]*z[t,0]*z[t,1:n-1]

print "The weight matrix is ",w

yp=0

bp2=0

for i in range(ls):

for j in range(ls):

bp2=bp2+z[j,n-1]*z[j,0]*np.dot(z[i,1:n-1],z[j,1:n-1])

yp=yp+z[i,0]

bf=(yp-bp2)/ls

ac=0

bc=0

cc=0

dc=0

l=np.size(datatest[:,0])

y_p=np.zeros(l)

for k in range(l):

y_p[k]=np.dot(w,datatest[k,1:].reshape(np.size(trx,1),1))+bf

if y_p[k]<0:

s=-1

if datatest[k,0]==s:

ac=ac+1

else:

bc=bc+1

else:

s=1

if datatest[k,0]==s:

dc=dc+1

else:

dc=dc+1

acc=float(ac+dc)/float(l)

print "The model accuracy is ",acc

u1=v[v[:,0]==-1]

u2=v[v[:,0]==1]

from pylab import *

xvar=np.linspace(trx.min(),trx.max(),10)

clf()

x20=(w[0]*xvar)/(-1*w[1])+bf/(w[1]*-1)

x21=(w[0]*xvar+bf-1)/w[1]*-1

x2n1=(w[0]*xvar+bf+1)/w[1]*-1

plot(xvar,x20)

plot(xvar,x21)

plot(xvar,x2n1)

plot(u1[:,1],u1[:,2],'rx')

plot(u2[:,1],u2[:,2],'bx')

plot(z[:,1],z[:,2],'go')

savefig('sageplt.png')

import numpy as np

csv_file_object=csv.reader(open(DATA+'reps1.csv'))

header=csv_file_object.next()

records=[]

for row in csv_file_object:records.append(row)

records=np.array(records)

data=records.astype(np.float)

data[:,0][data[:,0]==0]=-1

#Linear model

m=np.size(data[:,0])

testm=int(.7*m)

datatr=data[0:testm,:]

datatest=data[testm+1:m,:]

from numpy import linalg

import cvxopt

import cvxopt.solvers

trx=datatr[:,1:]

trw=datatr[:,0]

y=trw.reshape(testm,1)

n=np.size(trx,1)

K=np.zeros((testm,testm))

for i in range(testm):

for j in range(testm):

K[i,j]=np.dot(trx[i],trx[j])

P=cvxopt.matrix(np.outer(trw,trw)*K)

q=cvxopt.matrix(np.ones(testm)*-1)

b=cvxopt.matrix(0.0)

A=cvxopt.matrix(np.array(y),(1,testm))

G1=cvxopt.matrix(np.diag(np.ones(testm)*-1))

G2=cvxopt.matrix(np.eye(testm))

G=cvxopt.matrix(np.vstack((G1,G2)))

h1=cvxopt.matrix(np.zeros(testm))

h2=cvxopt.matrix(np.ones(testm)*10)

h=cvxopt.matrix(np.vstack((h1,h2)))

solution=cvxopt.solvers.qp(P,q,G,h,A,b)

a=np.ravel(solution['x'])

sv=a[a>.00001]

p=np.size(sv)*100/testm

print 'The percent of support vectors for C=10 ) is',p,"percent and there are ", np.size(sv), "support vectors."

v=np.append(datatr,a.reshape(testm,1),axis=1)

n=np.size(v,axis=1)

z=v[v[:,n-1]>.00001]

ls=np.size(z[:,0])

w=np.zeros(np.size(trx, axis=1))

for t in range(ls):

w=w+z[t,n-1]*z[t,0]*z[t,1:n-1]

print "The weight matrix is ",w

yp=0

bp2=0

for i in range(ls):

for j in range(ls):

bp2=bp2+z[j,n-1]*z[j,0]*np.dot(z[i,1:n-1],z[j,1:n-1])

yp=yp+z[i,0]

bf=(yp-bp2)/ls

ac=0

bc=0

cc=0

dc=0

l=np.size(datatest[:,0])

y_p=np.zeros(l)

for k in range(l):

y_p[k]=np.dot(w,datatest[k,1:].reshape(np.size(trx,1),1))+bf

if y_p[k]<0:

s=-1

if datatest[k,0]==s:

ac=ac+1

else:

bc=bc+1

else:

s=1

if datatest[k,0]==s:

dc=dc+1

else:

dc=dc+1

acc=float(ac+dc)/float(l)

print "The model accuracy is ",acc

u1=v[v[:,0]==-1]

u2=v[v[:,0]==1]

from pylab import *

xvar=np.linspace(trx.min(),trx.max(),10)

clf()

x20=(w[0]*xvar)/(-1*w[1])+bf/(w[1]*-1)

x21=(w[0]*xvar+bf-1)/w[1]*-1

x2n1=(w[0]*xvar+bf+1)/w[1]*-1

plot(xvar,x20)

plot(xvar,x21)

plot(xvar,x2n1)

plot(u1[:,1],u1[:,2],'rx')

plot(u2[:,1],u2[:,2],'bx')

plot(z[:,1],z[:,2],'go')

savefig('sageplt.png')

It's not all good with Sage. I had some major problems with the CVXOPT module and some other annoying things that sent me back to IPython. I'll detail those in a later post.

## No comments:

## Post a Comment