sklearn Feature Extraction
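
Working notes on feature extraction with scikit-learn: splitting the iris dataset with train_test_split, one-hot encoding dicts with DictVectorizer, and extracting text features with CountVectorizer and TfidfVectorizer, using jieba to segment the Chinese examples first.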

from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split  # train/test splitting
from sklearn.feature_extraction import DictVectorizer  # dict feature extraction
from sklearn.feature_extraction.text import CountVectorizer  # text features from term counts
from sklearn.feature_extraction.text import TfidfVectorizer  # text features from tf-idf weights

import numpy as np
import pandas as pd
import jieba  # Chinese word segmentation
def datasets_demo():
    iris = load_iris()
    x_train, x_test, y_train, y_test = train_test_split(iris.data, iris.target, test_size=0.2, random_state=22)
    print(x_train, x_train.shape)
    # print(iris)
    # print(iris['DESCR'])
    return None
17 
def dic_demo():
    data = [{"city": '北京', 'temp': 100},
            {"city": '上海', 'temp': 100},
            {"city": '南京', 'temp': 100}]
    transfer = DictVectorizer(sparse=False)  # sparse=False returns a dense ndarray
    data_new = transfer.fit_transform(data)
    print(data_new)
    print(transfer.get_feature_names())  # renamed get_feature_names_out() in scikit-learn >= 1.0
    return None
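
# Expected dic_demo output (column order follows the sorted feature names
# ['city=上海', 'city=北京', 'city=南京', 'temp']):
# [[  0.   1.   0. 100.]
#  [  1.   0.   0. 100.]
#  [  0.   0.   1. 100.]]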
def text_demo():
    text = ["life is short,i like like python", "life is too long,i dislike python"]
    transfer = CountVectorizer()
    text_new = transfer.fit_transform(text)
    print(transfer.get_feature_names())  # get_feature_names_out() in scikit-learn >= 1.0
    print(text_new.toarray())  # fit_transform returns a sparse matrix; densify to display
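
# Note: CountVectorizer's default token_pattern keeps only tokens of two or
# more word characters, so the single letter "i" is dropped; "like" occurs
# twice in the first sentence, so its count in that row is 2.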
def cut_word(text):
    # jieba.cut returns a generator of tokens; join them with spaces so the
    # vectorizers can split the text on whitespace
    a = " ".join(jieba.cut(text))
    print(type(a))
    return a
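
# CountVectorizer/TfidfVectorizer tokenize on whitespace and punctuation, so
# Chinese text must be pre-segmented into space-separated words (here with
# jieba) before it can be vectorized.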
def count_chinese_demo2():
    data = ["一种还是一种今天很残酷,明天更残酷,后天很美好",
            "我们看到的从很远星系来的光是在几百万年之前发出的",
            "如果只从一种方式了解某样事物,你就不会真正了解它"]
    text = []
    # segment each sentence into space-separated words
    for sent in data:
        sent_cut = " ".join(jieba.cut(sent))  # renamed from str to avoid shadowing the builtin
        text.append(sent_cut)
    # count term frequencies, excluding the listed stop words
    transfer = CountVectorizer(stop_words=["一种", "不会"])
    text_new = transfer.fit_transform(text)
    print(transfer.get_feature_names())
    print(text_new.toarray())

    print(text)
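
# stop_words drops the listed tokens from the learned vocabulary; every other
# segmented word becomes a feature whose value is its raw count per sentence.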
def tfidf_demo():
    data = ["一种还是一种今天很残酷,明天更残酷,后天很美好",
            "我们看到的从很远星系来的光是在几百万年之前发出的",
            "如果只从一种方式了解某样事物,你就不会真正了解它"]
    text = []
    # segment each sentence into space-separated words
    for sent in data:
        text.append(" ".join(jieba.cut(sent)))
    # weight terms by tf-idf instead of raw counts
    transfer = TfidfVectorizer(stop_words=["一种", "不会"])
    text_new = transfer.fit_transform(text)
    print(transfer.get_feature_names())
    print(text_new.toarray())
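
# With default settings, TfidfVectorizer computes tf(t, d) * idf(t) with
# idf(t) = ln((1 + n) / (1 + df(t))) + 1 (smooth_idf=True), where n is the
# number of documents and df(t) the number containing t, then L2-normalizes
# each row.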
67 
if __name__ == '__main__':
    # uncomment the demo to run
    # datasets_demo()
    # dic_demo()
    # text_demo()
    # print(cut_word("我爱北京天安门"))
    # count_chinese_demo2()
    tfidf_demo()
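
As a sanity check (not part of the original snippets), the sketch below reproduces scikit-learn's default tf-idf weighting by hand on a small English corpus and compares it against TfidfVectorizer; the corpus and variable names are made up for illustration.

import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

corpus = ["life is short i like python", "life is too long i dislike python"]

# raw term counts with the same default tokenizer TfidfVectorizer uses
counts = CountVectorizer().fit_transform(corpus).toarray().astype(float)

n_docs = counts.shape[0]
df = (counts > 0).sum(axis=0)              # document frequency per term
idf = np.log((1 + n_docs) / (1 + df)) + 1  # smooth_idf=True default
tfidf = counts * idf                       # term frequency times idf
tfidf /= np.linalg.norm(tfidf, axis=1, keepdims=True)  # L2-normalize each row

# should print True: matches TfidfVectorizer's default output
print(np.allclose(tfidf, TfidfVectorizer().fit_transform(corpus).toarray()))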

 
