机器学习 1 linear regression 作业

话说学机器学习，不写代码就太扯淡了。好了，接着上一次的线性回归作业。

hw1作业的链接在这： http://speech.ee.ntu.edu.tw/~tlkagk/courses/ML_2016/Lecture/hw1.pdf

作业是预测台湾的PM2.5指数，既然是回归问题，用的肯定是上一节课的线性回归了。

以上数据我传到https://pan.baidu.com/s/1dFhwT13 上面了，供有兴趣的人做做。

实际上，上述数据分为训练数据和测试数据，都是CSV格式的，而且只有PM2.5这一项有用，其他的没什么用。同时通过测试数据才知道，

其实就是用前9个小时的PM2.5数据作为特征，来预测第10个小时的数据，将第10个小时的数据保存为csv格式，作为预测结果。

好了，不多说，上代码。我的开发环境还是win7+pycharm4.0

第一步，读取train.csv. 获取PM2.5的训练数据，一共240个训练数据，将前9个小时的数据作为特征，将第10个小时的数据作为标签

View Code

第二步，利用梯度下降来训练权值和偏置。

View Code

第三步，测试训练集。这一步可以不要，只是我在调试过程中用来看看对训练集的预测精度怎么样。

View Code

第四步，运行测试集，并保存测试结果。

首先读取测试集的数据，和训练集一样

View Code

保存预测结果到csv文件中：

View Code

完整的程序：

  1 # -*- coding:UTF-8 -*-
  2 __author__ = 'tao'
  3 
  4 import csv
  5 import cv2
  6 import sys
  7 import numpy as np
  8 import math
  9 
 10 filename = 'F:/台湾机器学习/data/train.csv'
 11 ufilename = unicode(filename , "utf8") #这一块主要是因为汉字路径 也就是python调用open打开文件时，其路径必须要转换为utf-8格式
 12 list=[]
 13 result=[]
 14 row=0
 15 colum=0;
 16 with open(ufilename, 'r') as f:
 17     data = f.readlines()  #dat中所有字符串读入data
 18     for line in data:
 19         odom = line.split(',')        #将单个数据分隔开存好
 20         colum=len(odom)
 21         if 'PM2.5'in odom:
 22             lists= map(int, odom[3:12])#第三个开始开始数据  一直取9个数
 23             results= map(int, odom[12:13])#取第10个数
 24             list.append(lists)
 25             result.append(results)
 26             # print odom
 27         row=row+1
 28 
 29 #print("原始数据是：{0}行 ：{1}列 的数据".format(row, colum))
 30 print("有{0}个训练数据".format(len(list)))
 31 
 32 
 33 #y=w0*x0+w1*x1+w2*x2+w3*x3+w4*x4+w5*x5+w6*x6+w7*x7+w8*x8+b0
 34 #
 35 
 36 alpha=0.0001
 37 b_0=np.random.rand(1,1)
 38 th_0 = np.random.rand(1,1);
 39 th_1 = np.random.rand(1,1);
 40 th_2 = np.random.rand(1,1);
 41 th_3 = np.random.rand(1,1);
 42 th_4=  np.random.rand(1,1);
 43 th_5 = np.random.rand(1,1);
 44 th_6 = np.random.rand(1,1);
 45 th_7 = np.random.rand(1,1);
 46 th_8 = np.random.rand(1,1);
 47 for k in range(1000):
 48     length = len(list)
 49     jtheta = 0
 50     total = 0
 51     sum_total = 0
 52     for id in range(length):
 53         # print("当前序号{0}训练数据".format(id))
 54         xset= np.array(list[id]) #一行 X数值
 55         yset= np.array(result[id]) # 要估计值
 56         total = total + b_0 + th_0 * xset[0]+ th_1 * xset[1]+ th_2 * xset[2]+ th_3 * xset[3]+ th_4 * xset[4]+ th_5 * xset[5]+ th_6 * xset[6]+ th_7 * xset[7]+ th_8 * xset[8]- yset
 57         # print( "当前误差{0}".format(b_0 + th_0 * xset[0]+ th_1 * xset[1]+ th_2 * xset[2]+ th_3 * xset[3]+ th_4 * xset[4]+ th_5 * xset[5]+ th_6 * xset[6]+ th_7 * xset[7]+ th_8 * xset[8]- yset))
 58         tmpb0 = b_0  - alpha/length*(total)
 59         tmp0 = th_0  -  alpha/length*(total)*xset[0]
 60         tmp1 = th_1  -  alpha/length*(total)*xset[1]
 61         tmp2 = th_2  -  alpha/length*(total)*xset[2]
 62         tmp3 = th_3  -  alpha/length*(total)*xset[3]
 63         tmp4 = th_4  -  alpha/length*(total)*xset[4]
 64         tmp5 = th_5  -  alpha/length*(total)*xset[5]
 65         tmp6 = th_6  -  alpha/length*(total)*xset[6]
 66         tmp7 = th_7  -  alpha/length*(total)*xset[7]
 67         tmp8 = th_8  -  alpha/length*(total)*xset[8]
 68         b_0 = tmpb0
 69         th_0 = tmp0
 70         th_1 = tmp1
 71         th_2 = tmp2
 72         th_3 = tmp3
 73         th_4 = tmp4
 74         th_5 = tmp5
 75         th_6 = tmp6
 76         th_7 = tmp7
 77         th_8 = tmp8
 78         sum_total = sum_total + b_0 + th_0 * xset[0]+ th_1 * xset[1]+ th_2 * xset[2]+ th_3 * xset[3]+ th_4 * xset[4]+ th_5 * xset[5]+ th_6 * xset[6]+ th_7 * xset[7]+ th_8 * xset[8] - yset
 79         jtheta_1 = 0.5 * length * math.pow(sum_total,2)
 80         comp = math.fabs(jtheta_1 - jtheta)
 81         if id==length-1:
 82                 print "%10.5f   %10.5f  %10.5f  %10.5f %10.5f   %10.5f  %10.5f  %10.5f %10.5f   %10.5f  %10.5f  %10.5f \n" %(comp,jtheta * dgree,b_0,th_0,th_1,th_2,th_3,th_4,th_5,th_6,th_7,th_8)
 83         jtheta = jtheta_1
 84 #
 85 print("-训练得到的权值如下--")
 86 print " %10.5f %10.5f  %10.5f %10.5f   %10.5f  %10.5f  %10.5f %10.5f   %10.5f  %10.5f \n" %(b_0,th_0,th_1,th_2,th_3,th_4,th_5,th_6,th_7,th_8)
 87 
 88 #测试训练集
 89 for k in range(len(list)):
 90     xset = np.array(list[k])
 91     nptresult= np.array(result[k])
 92     # print(xset)
 93     # print("预测数据{0}".format( b_0 + th_0 * xset[0]+ th_1 * xset[1]+ th_2 * xset[2]+ th_3 * xset[3]+ th_4 * xset[4]+ th_5 * xset[5]+ th_6 * xset[6]+ th_7 * xset[7]+ th_8 * xset[8]))
 94     # print("真实数据{0}".format(nptresult))
 95     error= b_0 + th_0 * xset[0]+ th_1 * xset[1]+ th_2 * xset[2]+ th_3 * xset[3]+ th_4 * xset[4]+ th_5 * xset[5]+ th_6 * xset[6]+ th_7 * xset[7]+ th_8 * xset[8]-nptresult
 96     print("训练集的实际误差{0}".format(error))
 97 
 98 #读取测试集数据
 99 testfilename = 'F:/台湾机器学习/data/test_X.csv'
100 utestfilename = unicode(testfilename , "utf8") #这一块主要是因为汉字路径 也就是python调用open打开文件时，其路径必须要转换为utf-8格式
101 testlist=[]
102 testrow=0
103 testcolum=0;
104 with open(utestfilename, 'r') as f:
105     data = f.readlines()  #dat中所有字符串读入data
106     for line in data:
107         odom = line.split(',')        #将单个数据分隔开存好
108         colum=len(odom)
109         if 'PM2.5'in odom:
110             testlists= map(int, odom[2:11])#第三个开始开始数据  一直取9个数
111             testlist.append(testlists)
112             # print odom
113         testrow=row+1
114 
115 print("测试数据是：{0}行 ：{1}列 的数据".format(testrow, testcolum))
116 print("有{0}个测试数据".format(len(testlist)))
117 print(testlist)
118 
119 #输出最后的测试结果
120 csvfile = file('d:\\csv_result.csv', 'wb')
121 writer = csv.writer(csvfile)
122 writer.writerow(['id', 'value'])
123 for k in range(len(testlist)):
124     id_list=[]
125     xset = np.array(testlist[k])
126     result= b_0 + th_0 * xset[0]+ th_1 * xset[1]+ th_2 * xset[2]+ th_3 * xset[3]+ th_4 * xset[4]+ th_5 * xset[5]+ th_6 * xset[6]+ th_7 * xset[7]+ th_8 * xset[8]
127     int_result = int(result)
128     if(int_result<0):
129         int_result=0
130     id_list = [('id_{0}'.format(k), '{0}'.format(int_result))]
131     print(id_list)
132     writer.writerows(id_list)
133 csvfile.close()

又试了试 mini-batch gradient descent（小批量梯度下降），貌似没发现什么新的东西

# Mini-batch gradient descent for the model
#   y = w0*x0 + w1*x1 + ... + w8*x8 + b0
# Reads the module-level training data `list` (features) and `result`
# (labels) and leaves the trained parameters in `b_0` (shape (1, 1)) and
# `th` (shape (1, 9)).
alpha = 0.0001  # NOTE(review): value kept from the original; re-tune if training diverges
b_0 = np.random.rand(1, 1)
th = np.random.rand(1, 9)
batch = 20
# Only whole batches are used; a trailing partial batch is ignored.
num_batches = len(list) // batch
for k in range(5000):
    # The original reset its batch counter at the top of every epoch, so it
    # only ever trained on the first `batch` samples. Every epoch now walks
    # through all the batches.
    for count in range(num_batches):
        grad_b = 0
        grad_th = 0
        for j in range(batch):
            xset = np.array(list[j + count * batch])    # nine feature readings
            yset = np.array(result[j + count * batch])  # label
            err = b_0 + np.dot(th, xset) - yset
            grad_b = grad_b + err
            # Each residual is weighted by its own sample's features. The
            # original multiplied the summed residual by the last sample's
            # features only, which is not the batch gradient.
            grad_th = grad_th + err * xset
        b_0 = b_0 - alpha / batch * grad_b
        th = th - alpha / batch * grad_th
    # Progress line, once per epoch.
    print(" %10.5f  %10.5f %10.5f   %10.5f  %10.5f  %10.5f %10.5f   %10.5f  %10.5f  %10.5f \n" % ((b_0.item(),) + tuple(th[0])))

#
print("-训练得到的权值如下--")
print(" %10.5f %10.5f  %10.5f %10.5f   %10.5f  %10.5f  %10.5f %10.5f   %10.5f  %10.5f \n" % ((b_0.item(),) + tuple(th[0])))

posted on 2017-04-18 03:15 xxxxxxxx1x2xxxxxxx 阅读(233) 评论(0) 收藏举报

刷新页面返回顶部

yyyyyyyyyyyyyyyyyyyy

公告

机器学习 1 linear regression 作业