from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split  # dataset splitting
from sklearn.feature_extraction import DictVectorizer  # dictionary feature extraction
from sklearn.feature_extraction.text import CountVectorizer  # text features from word counts
from sklearn.feature_extraction.text import TfidfVectorizer  # text features from tf-idf weights

import numpy as np
import pandas as pd
import jieba  # Chinese word segmentation

def datasets_demo():
    """Load the iris dataset and split it into training and test sets."""
    iris = load_iris()
    # 80% training data, 20% test data; random_state fixes the shuffle for reproducibility
    x_train, x_test, y_train, y_test = train_test_split(
        iris.data, iris.target, test_size=0.2, random_state=22)
    print(x_train, x_train.shape)
    # print(iris)
    # print(iris['DESCR'])
    return None
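
# A small follow-up sketch (not part of the original demos): verifying the 80/20 split
# from datasets_demo with numpy, which is already imported above. The iris dataset has
# 150 samples and 4 features, so test_size=0.2 should leave 120 training and 30 test rows.
def split_check_demo():
    iris = load_iris()
    x_train, x_test, y_train, y_test = train_test_split(
        iris.data, iris.target, test_size=0.2, random_state=22)
    print(x_train.shape, x_test.shape)  # expected: (120, 4) (30, 4)
    print(np.bincount(y_train))         # per-class counts in the training labels
    return None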

def dic_demo():
    """Vectorize a list of dicts; the categorical 'city' key becomes one-hot columns."""
    data = [{"city": '北京', 'temp': 100},
            {"city": '上海', 'temp': 100},
            {"city": '南京', 'temp': 100}]
    transfer = DictVectorizer(sparse=False)  # return a dense ndarray instead of a sparse matrix
    data_new = transfer.fit_transform(data)
    print(data_new)
    print(transfer.get_feature_names_out())
    return None
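
# A minimal sketch (added here, not part of the original demos) showing DictVectorizer's
# default sparse=True behaviour: fit_transform returns a scipy sparse matrix, which can
# be densified with .toarray() when a plain ndarray is more convenient.
def dict_sparse_demo():
    data = [{"city": '北京', 'temp': 100},
            {"city": '上海', 'temp': 100},
            {"city": '南京', 'temp': 100}]
    transfer = DictVectorizer()             # sparse=True is the default
    data_sparse = transfer.fit_transform(data)
    print(data_sparse)                      # printed in "(row, col)  value" form
    print(data_sparse.toarray())            # same values as with sparse=False
    print(transfer.get_feature_names_out())
    return None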

def text_demo():
    """Count word occurrences in English sentences with CountVectorizer."""
    text = ["life is short,i like like python", "life is too long,i dislike python"]
    transfer = CountVectorizer()
    text_new = transfer.fit_transform(text)
    print(transfer.get_feature_names_out())
    print(text_new.toarray())  # dense word-count matrix, one row per sentence


def cut_word(text):
    """Segment a Chinese sentence with jieba and join the tokens with spaces."""
    a = " ".join(list(jieba.cut(text)))
    print(type(a))
    return a

def count_chinese_demo2():
    """Count word occurrences in Chinese sentences after jieba segmentation."""
    data = ["一种还是一种今天很残酷,明天更残酷,后天很美好",
            "我们看到的从很远星系来的光是在几百万年之前发出的",
            "如果只从一种方式了解某样事物,你就不会真正了解它"]
    text = []
    # word segmentation
    for sent in data:
        sentence = " ".join(jieba.cut(sent))
        text.append(sentence)
    # word counting, ignoring the listed stop words
    transfer = CountVectorizer(stop_words=["一种", "不会"])
    text_new = transfer.fit_transform(text)
    print(transfer.get_feature_names_out())
    print(text_new.toarray())

    print(text)

def tfidf_demo():
    """Weight words by tf-idf instead of raw counts for the same Chinese sentences."""
    data = ["一种还是一种今天很残酷,明天更残酷,后天很美好",
            "我们看到的从很远星系来的光是在几百万年之前发出的",
            "如果只从一种方式了解某样事物,你就不会真正了解它"]
    text = []
    # word segmentation
    for sent in data:
        sentence = " ".join(jieba.cut(sent))
        text.append(sentence)
    # tf-idf weighting, ignoring the listed stop words
    transfer = TfidfVectorizer(stop_words=["一种", "不会"])
    text_new = transfer.fit_transform(text)
    print(transfer.get_feature_names_out())
    print(text_new.toarray())
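
# A minimal sketch (uses the pandas import above; not part of the original demos) that
# wraps the tf-idf matrix in a DataFrame so each column is labelled with its word,
# which makes the per-sentence weights easier to read than a bare ndarray.
def tfidf_dataframe_demo():
    data = ["一种还是一种今天很残酷,明天更残酷,后天很美好",
            "我们看到的从很远星系来的光是在几百万年之前发出的",
            "如果只从一种方式了解某样事物,你就不会真正了解它"]
    text = [" ".join(jieba.cut(sent)) for sent in data]
    transfer = TfidfVectorizer(stop_words=["一种", "不会"])
    text_new = transfer.fit_transform(text)
    df = pd.DataFrame(text_new.toarray(), columns=transfer.get_feature_names_out())
    print(df)
    return None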


if __name__ == '__main__':
    # uncomment a demo to run it
    # datasets_demo()
    # dic_demo()
    # text_demo()
    # print(cut_word("我爱北京天安门"))
    # count_chinese_demo2()
    # tfidf_demo()
    pass