解析内容为 table 的 HTML 页面
#html
#内容解析
from lxml import etree
import pandas as pd
import re
def get_data(html):
    """Parse the torrent listing table of an HTML page into a DataFrame.

    The first ``<table>`` whose ``class`` attribute contains ``"torrents"``
    is located.  Every direct ``<tr>`` child becomes one record: the
    concatenated text of each direct ``<td>`` cell, followed by the numeric
    id captured from the first link in the row whose ``href`` matches
    ``php?id=<digits>`` (an empty string when no link matches).

    Args:
        html: HTML source of the page, passed to ``etree.HTML``.

    Returns:
        A ``pandas.DataFrame`` with the columns listed in ``labels``.  The
        first parsed row (the table's header row) is dropped, so the index
        starts at 1.  When the page contains no matching table an empty
        frame with the same columns is returned.
    """
    labels = ['类型', '标题', '评论数', '存活时间', '大小', '种子数', '下载数', '完成数', '发布者', '标题id']

    tree = etree.HTML(html)
    tables = tree.xpath('//table[contains(@class,"torrents")]')
    if not tables:
        # No torrent table on this page: return an empty frame, not IndexError.
        return pd.DataFrame(columns=labels)

    # Compiled once, outside the row loop; raw string avoids invalid escapes.
    id_pattern = re.compile(r'php\?id=(\d+)')

    result = []
    # './tr' (no tbody) matches rows that are direct children of the table.
    for row in tables[0].xpath('./tr'):
        cells = [''.join(cell.xpath('.//text()')) for cell in row.xpath('./td')]
        hrefs = row.xpath(".//a[contains(@href,'php?id')]/@href")
        matches = (id_pattern.search(href) for href in hrefs)
        # First href carrying a numeric id wins; '' when the row has none.
        title_id = next((m.group(1) for m in matches if m), '')
        cells.append(title_id)
        result.append(cells)

    # NOTE(review): assumes every row yields nine <td> cells so that the
    # record width matches ``labels`` - confirm against the target site.
    df = pd.DataFrame.from_records(result, columns=labels)
    # The first record comes from the header row of the table; discard it.
    return df.iloc[1:]
#get_data(html)
存入 MongoDB 数据库
import pandas as pd
from pymongo import MongoClient
from sqlalchemy import create_engine
def data_to_dataframe(data, host='67.216.204.220', port=27017):
    """Insert every row of a DataFrame into a MongoDB collection.

    Rows are converted to dictionaries with ``data.to_dict('records')`` and
    written to the ``pt_btschool_net_torrents`` collection of the ``pt``
    database.

    Args:
        data: DataFrame whose rows are to be stored.
        host: MongoDB server address; defaults to the original fixed host.
        port: MongoDB server port.

    Returns:
        The status string ``'success dataframe_to_mongodb '``.
    """
    from pymongo import MongoClient

    records = data.to_dict('records')
    # insert_many rejects an empty document list, so an empty frame is a no-op.
    if records:
        # The context manager closes the connection even if the insert raises.
        with MongoClient(host, port) as client:
            collection = client.pt.pt_btschool_net_torrents
            collection.insert_many(records)
    return 'success dataframe_to_mongodb '
#data_to_dataframe(df)
内容为 table 的 HTML 的解析办法:
说明:浏览器开发者工具里看到的 tbody 往往是浏览器自动补全的,原始 HTML 中并不一定存在;编写 XPath 时不要包含 tbody(例如直接写 //table/tr)。
import pandas as pd
from lxml import html
from io import BytesIO

# Fetch the wiki page and pick the third table inside the content container.
url = "http://www.uesp.net/wiki/Skyrim:No_Stone_Unturned"
xpath = "//*[@id=\"mw-content-text\"]/table[3]"
tree = html.parse(url)
table = tree.xpath(xpath)[0]
# Serialise only that table so pandas parses nothing else from the page.
raw_html = html.tostring(table)
# pandas 2.1+ deprecates literal HTML input to read_html; hand it a
# file-like object instead.  header=0 uses the first row as column names.
dta = pd.read_html(BytesIO(raw_html), header=0)[0]
# Add a "completed" column initialised to 0 and drop the "Map" column.
dta["completed"] = 0
del dta["Map"]
参考地址:https://gist.github.com/jseabold/5892603
案例:
from lxml import etree
import pandas as pd
from io import BytesIO

# NOTE(review): `html` must be an already parsed lxml tree here (e.g. the
# result of etree.HTML(page_source)); it is not defined in this snippet.
table = html.xpath('//table[@id="torrenttable"]')[0]
# Serialise just the selected table and let pandas build the DataFrame.
raw_html = etree.tostring(table)
# pandas 2.1+ deprecates literal HTML input to read_html; hand it a
# file-like object instead.  header=0 uses the first row as column names.
data = pd.read_html(BytesIO(raw_html), header=0)[0]
读取 MongoDB 中的数据
import pandas as pd
from sqlalchemy import create_engine
def read_mongb(host='67.216.204.220', port=27017):
    """Load every document of the torrents collection into a DataFrame.

    Reads the ``pt_btschool_net_torrents`` collection of the ``pt``
    database with an unfiltered ``find()``.

    Args:
        host: MongoDB server address; defaults to the original fixed host.
        port: MongoDB server port.

    Returns:
        A ``pandas.DataFrame`` built from the list of returned documents.
    """
    from pymongo import MongoClient

    # The context manager closes the connection once the read is done.
    with MongoClient(host, port) as client:
        collection = client.pt.pt_btschool_net_torrents
        # Materialise the cursor while the client is still open.
        documents = list(collection.find())
    return pd.DataFrame(documents)
# Load the stored documents into a DataFrame.
data = read_mongb()
# Notebook-style inspection: these bare expressions only display a result
# when evaluated interactively; run as a plain script they print nothing.
data.head()
data.columns

浙公网安备 33010602011771号