import nltk
from nltk.corpus import stopwords
# Optional stemming step (currently disabled):
# from nltk.stem.lancaster import LancasterStemmer
# ls = LancasterStemmer(); ls.stem(word)
from db_process import MyProcess
# Punctuation tokens to discard after tokenization.
english_punctuations = [',', '.', ':', ';', '?', '(', ')', '[', ']', '&', '!', '*', '@', '#', '$', '%']

# Sample sentence used to exercise the tokenize-and-filter pipeline.
s = 'attention window eyes users: if you are using internet explorer 9 or 10, you may not be able to log in to the chase site or other internet sites., I went to facebook with my students.'

words = nltk.word_tokenize(s)  # split the sentence into word/punctuation tokens
# tags = nltk.pos_tag(words)  # show part-of-speech tags

# Build the exclusion set once: calling stopwords.words('english') inside the
# predicate would rebuild the stopword list for every token and scan it
# linearly. A set gives constant-time membership checks.
_excluded_tokens = set(english_punctuations) | set(stopwords.words('english'))

# Keep only tokens that are neither punctuation nor English stopwords.
# `filter` is kept (rather than a comprehension) so `filter_words` has the same
# type as before: a list on Python 2, a lazy iterator on Python 3.
filter_words = filter(lambda token: token not in _excluded_tokens, words)