Python (PhantomJS)

#Author:Mini
#!/usr/bin/env python
from selenium import webdriver
import time
import re
from lxml import etree
# Weibo search-result page to scrape, and where the debugging artifacts go.
PAGE_URL = "http://s.weibo.com/weibo/Galaxy%2520Macau?Refer=sina_index"
SCREENSHOT_PATH = "E:/m/others/text.png"
HTML_DUMP_PATH = "E:/m/others/test.html"
# Pause after navigation before the page is captured.
RENDER_WAIT_SECONDS = 3

# Poster nickname, stored in a nick-name="..." attribute.
NICKNAME_RE = re.compile(r'nick-name="(.*?)"')
# Keyword highlight wrapper.  The original pattern was
# '<em class="red">*?</em>', which quantifies the literal ">" and therefore
# only matched an *empty* highlight tag; real highlights were never removed.
HIGHLIGHT_RE = re.compile(r'<em class="red">(.*?)</em>')
# Inline images (emoticons etc.) embedded in the post text.
IMG_RE = re.compile(r'<img.*?>')
# Post body: everything between the opening comment_txt <p> tag and the
# first following "<a".  re.S lets the body span several lines.
CONTENT_RE = re.compile(r'<p class="comment_txt" .*?>(.*?)<a', re.S)

# NOTE: XPath also works on the page source obtained from PhantomJS (or
# urllib): convert the HTML string into a tree first, then query it, e.g.
#     tree = etree.HTML(page)
#     title = tree.xpath("/html/head/title/text()")
# The regex equivalent for the title is "<title>(.*?)</title>".


def extract_nicknames(html: str) -> list:
    """Return the value of every ``nick-name="..."`` attribute in *html*.

    The result is a list of strings in document order; it is empty when the
    attribute does not occur.
    """
    return NICKNAME_RE.findall(html)


def strip_markup(html: str) -> str:
    """Return *html* with highlight tags unwrapped and ``<img>`` tags removed.

    ``<em class="red">text</em>`` is replaced by ``text`` so that the
    highlighted keyword stays part of the post text; ``<img ...>`` tags are
    dropped entirely.
    """
    unwrapped = HIGHLIGHT_RE.sub(r"\1", html)
    return IMG_RE.sub("", unwrapped)


def extract_contents(html: str) -> list:
    """Return the text of every ``<p class="comment_txt" ...>`` post in *html*.

    The markup is cleaned with :func:`strip_markup` before matching.  Each
    item is the raw text between the opening ``<p>`` tag and the next ``<a``;
    the list is empty when no post is found.
    """
    return CONTENT_RE.findall(strip_markup(html))


def main():
    """Fetch the search page with PhantomJS, save it, and print the results.

    Side effects: writes a screenshot to ``SCREENSHOT_PATH`` and the page
    source (UTF-8) to ``HTML_DUMP_PATH``, then prints the list of poster
    nicknames followed by the list of post contents.
    """
    browser = webdriver.PhantomJS()
    try:
        browser.get(PAGE_URL)
        # The original slept *before* get(), which waits on nothing.
        # Presumably the pause is meant to let the page finish rendering
        # before it is captured, so it now follows the navigation.
        time.sleep(RENDER_WAIT_SECONDS)
        browser.get_screenshot_as_file(SCREENSHOT_PATH)
        page = browser.page_source
    finally:
        # Always shut the PhantomJS process down, even if loading failed.
        browser.quit()

    # "with" guarantees the dump file is closed even if the write fails.
    with open(HTML_DUMP_PATH, "wb") as dump:
        dump.write(page.encode("utf-8"))

    print(extract_nicknames(page))
    print(extract_contents(page))


# TODO: text processing (sentiment analysis) / text classification:
#   1. load the text
#   2. convert the text into a feature matrix
#   3. build the algorithm
#   4. split the data into training and test sets
#   5. train on the data
#   6. predict on the data

if __name__ == "__main__":
    main()
posted @ 2017-10-31 13:32  兔子的尾巴_Mini  阅读(1174)  评论(0)    收藏  举报