机器学习(1):线性回归和逻辑回归

线性回归,逻辑回归

纪要:

逻辑回归实现代码(C++):

/*********

logistic回归(c++) by 姜富春

**********/

#include<cmath>
#include<cstdlib>
#include<fstream>
#include<iostream>
#include<sstream>
#include<string>
#include<vector>

using namespace std;

// One sample read from a data file: its integer feature values and the class
// label that followed them on the same row.
struct Data{
    std::vector<int> features;  // feature values, in file column order
    int cls;                    // class label of this sample

    // Builds a sample from the feature vector `f` and the class label `c`.
    Data(std::vector<int> f, int c)
        : cls(c)
    {
        // `f` is already a private copy (taken by value), so adopt its
        // contents directly.
        features.swap(f);
    }
};

// Model parameters: one weight per feature (`w`) plus the bias term (`d`).
struct Param{
    std::vector<double> w;  // feature weights
    double d;               // bias (intercept)

    // Empty weight vector and a bias of 0.0.
    Param()
        : w(), d(0.0)
    {}

    // Takes the weight vector `w1` and the bias `d1`.
    Param(std::vector<double> w1, double d1)
        : w(), d(d1)
    {
        // `w1` is a by-value copy; adopt its storage.
        w.swap(w1);
    }
};

class Logistic{

public:

    Logistic(){

        //载入traindata文件构造dataSet;

        loadDataSet(dataSet);

        //初始化Param,w的长度与数据特征的长度相同,初值为0.0。d的初值也为0.0

        vector<double> pw(dataSet[0].features.size(), 0.0);

        Param pt(pw, 0.0);

        param = pt;



    };

    void loadDataSet(vector<Data>& ds, string dataFile = "./traindata.txt"){

        ifstream fin(dataFile.c_str());

        if (!fin){

            cout << "文件打开失败" << endl;

            exit(0);

        }

        while (fin){

            string line;

            getline(fin, line);

            if (line.size()>3){

                stringstream sin(line);

                int t;

                sin >> t;

                vector<int> fea;

                while (sin){

                    char c = sin.peek();

                    if (int(c) != -1){

                        sin >> t;

                        fea.push_back(t);

                    }



                }

                int cl = fea.back();

                fea.pop_back();

                ds.push_back(Data(fea, cl));

            }

        }

    }



    void displayDataSet(){

        for (int i = 0; i<dataSet.size(); i++){

            for (int j = 0; j<dataSet[i].features.size(); j++){

                cout << dataSet[i].features[j] << " ";

            }

            cout << " 分类:" << dataSet[i].cls;

            cout << endl;

        }

    }

    void logisticRegression(){

        //由目标函数为最大似然,因此最终求得的是目标函数的最大值,

        //因此迭代过程是梯度上升,而非梯度下降

        double lamda = 0.1;//梯度下降的步长

        double delta = 0.0001;//结束迭代的阈值

        //目标函数的值

        double objLw = Lw(param);

        //cout<<objLw<<endl;

        Param tpa(param.w, param.d);

        gradient(lamda);

        double newObjLw = Lw(param);

        int iter = 0;

        cout << "初始:" << endl;

        displayIterProcess(iter, objLw, newObjLw, 1);

        while (fabs(newObjLw - objLw)>delta || !samewb(tpa, param, delta)){

            objLw = newObjLw;

            tpa = Param(param.w, param.d);

            gradient(lamda);

            newObjLw = Lw(param);

            ++iter;

            displayIterProcess(iter, objLw, newObjLw, 5);

        }

        cout << "迭代结束共迭代" << iter << "" << endl;

        displayIterProcess(iter, objLw, newObjLw, 1);



    }

    bool samewb(const Param &tparam, const Param& param, double delta){

        for (int i = 0; i<tparam.w.size(); i++){

            if (fabs(tparam.w[i] - param.w[i])>delta){

                return false;

            }

        }

        if (fabs(tparam.d - param.d)>delta){

            return false;

        }

        return true;

    }

    void displayIterProcess(int iter, double objLw, double newObjLw, int mod){

        //每mod步打印一次迭代过程

        if (iter%mod == 0){

            cout << "迭代" << iter << ":目标函数值【" << newObjLw << "】,两次迭代目标函数差值【 " << (newObjLw - objLw) << "" << endl;

            cout << "模型参数:";

            for (int i = 0; i<param.w.size(); i++){

                cout << param.w[i] << " ";

            }

            cout << param.d << endl << endl;

        }



    }

    //梯度上升更新w和b

    void gradient(double lam){

        for (int i = 0; i<param.w.size(); i++){

            double tmp = 0.0L;//保存梯度上升过程的中间值

            for (int j = 0; j<dataSet.size(); j++){

                tmp += (dataSet[j].cls - logiFun(param, dataSet[j]))*dataSet[j].features[i] * lam;

            }

            param.w[i] += (tmp);

        }

        double tmp = 0.0L;

        for (int j = 0; j<dataSet.size(); j++){

            tmp += (dataSet[j].cls - logiFun(param, dataSet[j]))*lam;

        }

        param.d += tmp;



    }

    //计算logistic函数的值,即f(x)=exp(wx)/(1+exp(wx)),该表达式在求解梯度过程中出现,

    //因此计算这个值是为了辅助梯度上升计算过程

    inline double logiFun(const Param &p, const Data &d){

        double inner = innerWX(p, d);

        double le = exp(inner) / (1 + exp(inner));

        return le;

    }

    //计算对数似然函数的值

    double Lw(Param p){

        double l = 0.0L;

        for (int i = 0; i<dataSet.size(); i++){

            double inner = innerWX(p, dataSet[i]);

            l += (dataSet[i].cls*inner - (log10(1 + exp(inner))));

            //cout<<"l="<<l<<endl;

        }



        return l;

    }

    //计算wx+b的值

    inline double innerWX(const Param &p, const Data &data){

        if (p.w.size() != data.features.size()){

            cout << "参数与实例的维度不匹配,不能进行内积计算" << endl;

            exit(0);

        }

        double innerP = 0.0L;

        for (int i = 0; i<p.w.size(); i++){

            innerP += (p.w[i] * data.features[i]);

        }

        innerP += p.d;

        return innerP;

    }

    //给定测试集,预测分类

    void predictClass(){

        vector<Data> testDataSet;

        loadDataSet(testDataSet, "./testdata.txt");

        /*******************

        分别计算

        P(Y=1|x)=exp(w.x)/(1+exp(w.x))

        和

        P(Y=0|x)=1/(1+exp(w.x))

        然后取值大的作为x的分类

        *******************/

        cout << endl << "预测分类:" << endl;

        for (int i = 0; i<testDataSet.size(); i++){

            double py1 = 0.0L;

            double py0 = 0.0L;

            double inner = innerWX(param, testDataSet[i]);

            py1 = exp(inner) / (1 + exp(inner));

            py0 = 1 - py1;

            cout << "实例: ";

            for (int j = 0; j<testDataSet[i].features.size(); j++){

                cout << testDataSet[i].features[j] << " ";

            }

            cout << "标记分类【" << testDataSet[i].cls << "】,";

            if (py1 >= py0){



                cout << "预测分类【" << 1 << "" << endl;

            }
            else{

                cout << "预测分类【" << 0 << "" << endl;

            }

        }

    }

private:

    vector<Data> dataSet;

    Param param;

};

// Program entry point: builds the classifier (which loads the training data),
// fits it, and prints the predictions for the test data.
int main(){
    Logistic logist;
    //logist.displayDataSet();
    logist.logisticRegression();
    logist.predictClass();
#ifdef _WIN32
    // "pause" is a Windows shell command that keeps the console window open;
    // on other platforms the call would only produce a "command not found"
    // error, so it is compiled in for Windows builds only.
    std::system("pause");
#endif
    return 0;
}

其中:testdata.txt 保存测试数据;

traindata.txt 保存训练数据;

logistic.cpp 是代码源文件。三个文件保存在同一目录下。

数据的格式如下:

   10009 1 0 0 1 0 1  

   10025  0 0 1 2 0 0  

   20035  0 0 1 0 0 1  

     20053  1 0 0 0 0 0

每行有7个列值,第一列是一个ID号,在具体操作中,忽略该列。之后的5列,每一个都表示一个特征的取值;最后一列是分类标记(0或1)。

 

 

只有实践才能深刻理解。(实践出真知)

 参考:

http://www.cnblogs.com/tornadomeet/p/3395593.html

http://www.cnblogs.com/jfcspring/p/3512356.html

 

posted @ 2016-11-01 22:21  静悟生慧  阅读(439)  评论(0)  编辑  收藏  举报