李宏毅机器学习作业2

尚未完成,先保存代码备用。

# 判断工资是否大于50k
import numpy as np
import pandas as pd


# Raw label text marking an income above 50k (note the leading space).
_HIGH_INCOME_LABEL = ' 50000+.'

# Ordered (substring, code) replacement tables, keyed by the column index
# inside the feature slice ``data[:, 1:-1]``.  The replacements are applied
# sequentially with ``str.replace``, so the order of each table matters and
# any surrounding whitespace in the raw value is left in place.
_CATEGORY_REPLACEMENTS = {
    # Class of worker
    1: (
        ("Private", '0'),
        ("Not in universe", '1'),
        ('Self-employed-not incorporated', '2'),
        ('Local government', '3'),
        ('Self-employed-incorporated', '4'),
        ('State government', '5'),
        ('Federal government', '6'),
        ('Never worked', '7'),
        ('Without pay', '8'),
    ),
    # Education level
    4: (
        ('High school graduate', '0'),
        ('Some college but no degree', '1'),
        ('Bachelors degree(BA AB BS)', '2'),
        ('Masters degree(MA MS MEng MEd MSW MBA)', '3'),
        ('Children', '4'),
        ('10th grade', '5'),
        ('7th and 8th grade', '6'),
        ('11th grade', '7'),
        ('Associates degree-occup /vocational', '8'),
        ('Associates degree-academic program', '9'),
        ('9th grade', '10'),
        ('Prof school degree (MD DDS DVM LLB JD)', '11'),
        ('5th or 6th grade', '12'),
        ('Doctorate degree(PhD EdD)', '13'),
        ('12th grade no diploma', '14'),
        ('1st 2nd 3rd or 4th grade', '15'),
        ('Less than 1st grade', '16'),
    ),
    # Marital status
    7: (
        ('Married-civilian spouse present', '0'),
        ('Never married', '1'),
        ('Divorced', '2'),
        ('Widowed', '3'),
        ('Separated', '4'),
        ('Married-spouse absent', '5'),
        ('Married-A F spouse present', '6'),
    ),
    # Industry
    8: (
        ("Not in universe or children", '0'),
        ('Retail trade', '1'),
        ('Manufacturing-durable goods', '2'),
        ('Education', '3'),
        ('Manufacturing-nondurable goods', '4'),
        ('Finance insurance and real estate', '5'),
        ('Construction', '6'),
        ('Business and repair services', '7'),
        ('Other professional services', '8'),
        ('Public administration', '9'),
        ('Medical except hospital', '10'),
        ('Transportation', '11'),
        ('Hospital services', '12'),
        ('Wholesale trade', '13'),
        ('Agriculture', '14'),
        ('Personal services except private HH', '15'),
        ('Social services', '16'),
        ('Entertainment', '17'),
        ('Communications', '18'),
        ('Utilities and sanitary services', '19'),
        ('Private household services', '20'),
        ('Mining', '21'),
        ('Forestry and fisheries', '22'),
        ('Armed Forces', '23'),
    ),
    # Occupation
    9: (
        ('Not in universe', '0'),
        ('Professional specialty', '1'),
        ('Executive admin and managerial', '2'),
        ('Adm support including clerical', '3'),
        ('Sales', '4'),
        ('Other service', '5'),
        ('Precision production craft & repair', '6'),
        ('Machine operators assmblrs & inspctrs', '7'),
        ('Transportation and material moving', '8'),
        ('Handlers equip cleaners etc', '9'),
        ('Technicians and related support', '10'),
        ('Farming forestry and fishing', '11'),
        ('Protective services', '12'),
        ('Private household services', '13'),
        ('Armed Forces', '14'),
    ),
    # Race
    10: (
        ('White', '0'),
        ('Black', '1'),
        ('Asian or Pacific Islander', '2'),
        ('Other', '3'),
        ('Amer Indian Aleut or Eskimo', '4'),
    ),
    # Sex
    12: (
        ('Male', '0'),
        ('Female', '1'),
    ),
}

# Columns of the feature slice that are copied, in this order, into the
# output feature matrix.
_SELECTED_COLUMNS = (0, 1, 4, 7, 8, 9, 10, 12, 16, 17, 18, 38)


def _encode_category(value, replacements):
    """Apply the ordered substring replacements to one raw category string."""
    for old, new in replacements:
        value = value.replace(old, new)
    return value


def data_process(data):
    """Turn the raw training table into a numeric feature matrix and labels.

    The first column of ``data`` is skipped, the last column is the income
    label, and the columns in between are the raw features.  Seven
    categorical feature columns are converted to integer codes; together
    with five columns that are copied unchanged they form a 12-column
    feature matrix.

    Args:
        data: Table (e.g. a pandas DataFrame) convertible with ``np.array``.
            It needs at least 41 columns, since feature index 38 is read.

    Returns:
        A tuple ``(x_train_now, y_train, number_more_50k, number_less_50k)``:
        the float feature matrix of shape ``(n_rows, 12)``, the label vector
        (1 where the label equals ``' 50000+.'``, otherwise 0), and the
        number of rows labelled 1 and 0 respectively.

    Raises:
        ValueError: If a selected value cannot be converted to a number,
            e.g. a category that is missing from the replacement tables.
    """
    print(data.shape)
    data = np.array(data)

    # Label encoding: more than 50k -> 1, everything else -> 0.
    y_train = np.array(
        [1 if label == _HIGH_INCOME_LABEL else 0 for label in data[:, -1]],
        dtype=int,
    )
    number_more_50k = int(y_train.sum())
    number_less_50k = int(y_train.shape[0]) - number_more_50k

    x_train = data[:, 1:-1]
    # Size the output from the actual input instead of a fixed row count, and
    # allocate it as float so the intended float conversion really happens.
    x_train_now = np.empty((x_train.shape[0], len(_SELECTED_COLUMNS)), dtype=float)
    for out_col, src_col in enumerate(_SELECTED_COLUMNS):
        column = x_train[:, src_col]
        replacements = _CATEGORY_REPLACEMENTS.get(src_col)
        if replacements is not None:
            column = [_encode_category(value, replacements) for value in column]
        # float() tolerates the whitespace that the replacements leave behind.
        x_train_now[:, out_col] = [float(value) for value in column]

    print(x_train_now)
    return x_train_now, y_train, number_more_50k, number_less_50k


def train(x_train, y_train, epoch, number_more_50k, number_less_50k):
    """Set up the training state; the training loop itself is not written yet.

    Only local initialisation happens here: nothing is returned, and
    ``x_train``, ``y_train`` and ``epoch`` are not used so far.

    Raises:
        ZeroDivisionError: If both class counts are zero.
    """
    # Hyper-parameters and accumulators.
    learning_rate = 0.001
    loss = 0
    bias = 0
    # NOTE(review): the weight shape is fixed at (54256, 12) and does not
    # follow the shape of x_train -- confirm the intended shape.
    weight = np.zeros((54256, 12))

    # Class priors estimated from the label counts.
    total = number_less_50k + number_more_50k
    pc_1 = number_more_50k / total  # probability of income above 50k
    pc_2 = number_less_50k / total  # probability of income below 50k


def valid(x_train_valid, y_train_valid):
    """Placeholder for validation; neither argument is used yet."""
    loss = 0  # initial loss value; nothing is computed or returned so far
    return None


def main():
    """Load the training CSV, preprocess it, split it, then train and validate."""
    epoch = 2000
    split_at = 48228  # first 48228 rows train, the remainder validates

    raw = pd.read_csv("E:/BaiduNetdiskDownload/cs_data/hw2/hw2/train.csv")
    x_data, y_data, number_more_50k, number_less_50k = data_process(raw)
    print(x_data.shape)  # currently 12 input features per sample

    # Split features and labels at the same row.
    x_train, x_valid = x_data[:split_at], x_data[split_at:]
    y_train, y_valid = y_data[:split_at], y_data[split_at:]

    train(x_train, y_train, epoch, number_more_50k, number_less_50k)
    valid(x_valid, y_valid)


# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()
posted @ 2021-09-23 20:16  WangSir_Code  阅读(193)  评论(0)    收藏  举报