sklearn Feature Extraction
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split           # train/test split
from sklearn.feature_extraction import DictVectorizer          # feature extraction from dicts
from sklearn.feature_extraction.text import CountVectorizer    # text features from word counts
from sklearn.feature_extraction.text import TfidfVectorizer    # text features from tf-idf weights

import numpy as np
import pandas as pd
import jieba  # Chinese word segmentation


def datasets_demo():
    iris = load_iris()
    x_train, x_test, y_train, y_test = train_test_split(
        iris.data, iris.target, test_size=0.2, random_state=22)
    print(x_train, x_train.shape)
    # print(iris)
    # print(iris['DESCR'])
    return None


def dic_demo():
    data = [{"city": '北京', 'temp': 100},
            {"city": '上海', 'temp': 100},
            {"city": '南京', 'temp': 100}]
    # sparse=False returns a dense ndarray; the string field 'city' is
    # one-hot encoded while the numeric field 'temp' passes through as-is
    transfer = DictVectorizer(sparse=False)
    data_new = transfer.fit_transform(data)
    print(data_new)
    # get_feature_names() was removed in scikit-learn 1.2;
    # get_feature_names_out() is the current replacement
    print(transfer.get_feature_names_out())
    return None


def text_demo():
    text = ["life is short,i like like python",
            "life is too long,i dislike python"]
    transfer = CountVectorizer()
    text_new = transfer.fit_transform(text)  # sparse matrix of word counts
    print(transfer.get_feature_names_out())
    print(text_new.toarray())


def cut_word(text):
    # Join jieba's tokens with spaces so the vectorizers can split on whitespace
    a = " ".join(jieba.cut(text))
    print(type(a))
    return a


def count_chinese_demo2():
    data = ["一种还是一种今天很残酷,明天更残酷,后天很美好",
            "我们看到的从很远星系来的光是在几百万年之前发出的",
            "如果只从一种方式了解某样事物,你就不会真正了解它"]
    # Segment each sentence first; CountVectorizer splits on spaces
    text = []
    for sent in data:
        text.append(" ".join(jieba.cut(sent)))  # (was assigned to `str`, which shadowed the built-in)
    # Count tokens, dropping the listed stop words
    transfer = CountVectorizer(stop_words=["一种", "不会"])
    text_new = transfer.fit_transform(text)
    print(transfer.get_feature_names_out())
    print(text_new.toarray())
    print(text)


def tfidf_demo():
    data = ["一种还是一种今天很残酷,明天更残酷,后天很美好",
            "我们看到的从很远星系来的光是在几百万年之前发出的",
            "如果只从一种方式了解某样事物,你就不会真正了解它"]
    # Segment each sentence first, as in count_chinese_demo2
    text = []
    for sent in data:
        text.append(" ".join(jieba.cut(sent)))
    # Weight tokens by tf-idf instead of raw counts
    transfer = TfidfVectorizer(stop_words=["一种", "不会"])
    text_new = transfer.fit_transform(text)
    print(transfer.get_feature_names_out())
    print(text_new.toarray())


if __name__ == '__main__':
    # Uncomment one demo at a time to run it
    # datasets_demo()
    # dic_demo()
    # text_demo()
    # print(cut_word("我爱北京天安门"))
    # count_chinese_demo2()
    # tfidf_demo()
    pass  # needed so the fully commented-out block is still valid Python
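As a side note, the space-joining pass in count_chinese_demo2 and tfidf_demo can be skipped entirely by handing jieba to the vectorizer as a tokenizer. A minimal sketch under the same jieba dependency (jieba.lcut returns a plain token list; tokenizer and token_pattern are standard scikit-learn vectorizer parameters):

from sklearn.feature_extraction.text import TfidfVectorizer
import jieba

data = ["一种还是一种今天很残酷,明天更残酷,后天很美好",
        "我们看到的从很远星系来的光是在几百万年之前发出的",
        "如果只从一种方式了解某样事物,你就不会真正了解它"]

# tokenizer replaces the default regex tokenization; token_pattern=None
# silences the warning that the unused default pattern would otherwise trigger
transfer = TfidfVectorizer(tokenizer=jieba.lcut, token_pattern=None,
                           stop_words=["一种", "不会"])
text_new = transfer.fit_transform(data)
print(transfer.get_feature_names_out())
print(text_new.toarray())

This keeps the raw sentences intact and moves segmentation inside the vectorizer, so the same fitted transfer object can later be applied to new, unsegmented text via transform().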