Python数据分析8-----网页文本处理
1、去除网页的标签,如<br/>
# Fixed typo: the class is BeautifulSoup, not "BeautifulrSoup"
# (the original line would raise ImportError).
from bs4 import BeautifulSoup

# Parse the HTML and keep only the visible text, dropping tags like <br/>.
# `data` is the raw HTML string defined earlier in the tutorial.
preData = BeautifulSoup(data, 'html.parser').get_text()
2、将标点符号等去掉,用正则表达式。
import re
#表示将data中的除了大小写字母之外的符号换成空格 preData=re.sub(r'[^a-zA-Z]',' ',data)
去除特殊符号:
# Two common ways to clean data.
def cleaner(word):
    """Strip punctuation, digits and stray Latin-1 symbols from one
    token, collapse '-', '\\' and '_' to spaces, trim a single leading
    and trailing space, and return the result lowercased."""
    word = re.sub(r'\#\.', '', word)
    word = re.sub(r'\n', '', word)
    word = re.sub(r',', '', word)
    word = re.sub(r'\-', ' ', word)
    word = re.sub(r'\.', '', word)
    word = re.sub(r'\\', ' ', word)
    word = re.sub(r'\\x\.+', '', word)
    word = re.sub(r'\d', '', word)
    word = re.sub(r'^_.', '', word)
    word = re.sub(r'_', ' ', word)
    word = re.sub(r'^ ', '', word)
    word = re.sub(r' $', '', word)
    word = re.sub(r'\?', '', word)
    word = re.sub(r'é', '', word)
    word = re.sub(r'§', '', word)
    word = re.sub(r'¦', '', word)
    word = re.sub(r'æ', '', word)
    word = re.sub(r'\d+', '', word)
    # NOTE(review): digits were already removed above, so this pattern can
    # never match here; kept for fidelity. Made a raw string to avoid the
    # invalid-escape SyntaxWarning on modern Python (behavior unchanged).
    word = re.sub(r'(.*?)\d+(.*?)', '', word)
    return word.lower()


def hashing(word):
    """Fold spelling variants of a word onto a shared key by rewriting
    suffixes, collapsing repeated letters and vowel runs (u -> o,
    ee -> i, k -> q, ...)."""
    word = re.sub(r'ain$', r'ein', word)
    word = re.sub(r'ai', r'ae', word)
    word = re.sub(r'ay$', r'e', word)
    word = re.sub(r'ey$', r'e', word)
    word = re.sub(r'ie$', r'y', word)
    word = re.sub(r'^es', r'is', word)
    word = re.sub(r'a+', r'a', word)
    word = re.sub(r'j+', r'j', word)
    word = re.sub(r'd+', r'd', word)
    word = re.sub(r'u', r'o', word)
    word = re.sub(r'o+', r'o', word)
    word = re.sub(r'ee+', r'i', word)
    if not re.match(r'ar', word):
        word = re.sub(r'ar', r'r', word)
    word = re.sub(r'iy+', r'i', word)
    word = re.sub(r'ih+', r'eh', word)
    word = re.sub(r's+', r's', word)
    # BUG FIX: the original searched the string literal 'word' (always
    # False), so this rewrite never fired. Search the variable instead.
    if re.search(r'[rst]y', word) and word[-1] != 'y':
        word = re.sub(r'y', r'i', word)
    if re.search(r'[bcdefghijklmnopqrtuvwxyz]i', word):
        word = re.sub(r'i$', r'y', word)
    if re.search(r'[acefghijlmnoqrstuvwxyz]h', word):
        word = re.sub(r'h', '', word)
    word = re.sub(r'k', r'q', word)
    return word


def array_cleaner(array):
    """Run cleaner() over every word of every sentence in `array` and
    return a new list; each cleaned sentence keeps a leading space
    because words are joined with ' ' onto an empty accumulator."""
    X = []
    for sentence in array:
        clean_sentence = ''
        words = sentence.split(' ')
        for word in words:
            clean_sentence = clean_sentence + ' ' + cleaner(word)
        X.append(clean_sentence)
    return X


if __name__ == '__main__':
    # Guarded so importing this module has no side effects.
    # X_train (the raw review texts) is defined elsewhere in the tutorial.
    X_train = array_cleaner(X_train)
3、将文本中的单词小写化,并将data用空格分开
# Lowercase the text and split it on whitespace into a list of tokens.
words=data.lower().split()
4、去掉停用词
#可以自己下载停用词 #nltk.download() words_notstop=[w for w in words if w not in stopwords]
5、将所有的词连接成一个句子
# Join the tokens back into a single space-separated sentence.
sentence=' '.join(words)
6、把空格前缀去除
# Strip whitespace from each review (note: .str.strip() removes both
# leading AND trailing whitespace, not just the prefix).
train_data['review'] = train_data['review'].str.strip()
7、删除短词,删除句子中词语长度不超过3(即len(w) <= 3)的词,如haa,hi等无意义的词
##删除短单词 train_data['review'] = train_data['review'].apply(lambda x:' '.join([w for w in x.split() if len(w) > 3]))
8、分词
##分词 train_data['review'] = train_data['review'].str.split()
9、提取词干
## Stemming: rule-based removal of suffixes, so that play, player,
## played, plays and playing all reduce to the stem "play".
# Import the class explicitly instead of the original wildcard
# `from nltk.stem.porter import *` (avoids polluting the namespace).
from nltk.stem.porter import PorterStemmer

stemmer = PorterStemmer()
# Each review is already a list of tokens (see the tokenize step); stem each token.
train_data['review'] = train_data['review'].apply(lambda x: [stemmer.stem(i) for i in x])