解析内容为 table 的 HTML
#html
#内容解析
from lxml import etree
import pandas as pd
import re
def get_data(html):
    """Parse the torrent listing table of an HTML page into a DataFrame.

    The first ``<table>`` whose ``class`` attribute contains ``"torrents"``
    is used. For every direct ``<tr>`` child, the text of each ``<td>`` is
    flattened into one string, and the numeric id captured from the first
    link in the row matching ``php?id=<digits>`` is appended as the last
    field (empty string when the row has no such link).

    Args:
        html: HTML document source, passed straight to ``etree.HTML``.

    Returns:
        A ``pandas.DataFrame`` with the ten columns listed in ``labels``.
        The first table row (the header row) is dropped, so the index of the
        returned frame starts at 1. When the page contains no matching table
        an empty frame with the same columns is returned.
    """
    labels = ['类型', '标题', '评论数', '存活时间', '大小', '种子数', '下载数', '完成数', '发布者', '标题id']

    tree = etree.HTML(html)
    tables = tree.xpath('//table[contains(@class,"torrents")]')
    if not tables:
        # No torrent table on this page: return an empty, correctly
        # labelled frame instead of failing on tables[0].
        return pd.DataFrame(columns=labels)

    title_id_pattern = re.compile(r'php\?id=(\d+)')
    result = []
    # Rows are selected as direct children ('./tr'), without a tbody step.
    for row in tables[0].xpath('./tr'):
        cells = [''.join(cell.xpath('.//text()')) for cell in row.xpath('./td')]
        hrefs = row.xpath(".//a[contains(@href,'php?id')]/@href")
        # Search all hrefs of the row at once; the leftmost match wins.
        match = title_id_pattern.search(' '.join(hrefs))
        cells.append(match.group(1) if match else '')
        result.append(cells)

    df = pd.DataFrame.from_records(result, columns=labels)
    # Drop the first row, which holds the table header rather than data.
    return df.iloc[1:]
#get_data(html)
存入 MongoDB 数据库
import pandas as pd
from pymongo import MongoClient
from sqlalchemy import create_engine
def data_to_dataframe(data, host='67.216.204.220', port=27017):
    """Insert every row of a DataFrame into MongoDB.

    Despite its name, this function writes *to* the database: each row of
    ``data`` becomes one document in the ``pt_btschool_net_torrents``
    collection of the ``pt`` database.

    Args:
        data: ``pandas.DataFrame`` whose rows are inserted as documents.
        host: MongoDB server host name or address.
        port: MongoDB server port.

    Returns:
        The status string ``'success dataframe_to_mongodb '``. It is also
        returned when ``data`` has no rows, in which case nothing is
        inserted and no connection is opened.
    """
    from pymongo import MongoClient

    records = data.to_dict('records')
    # insert_many rejects an empty document list, so skip the call entirely
    # when there is nothing to write.
    if records:
        # The context manager closes the client once the insert is done.
        with MongoClient(host, port) as client:
            client.pt.pt_btschool_net_torrents.insert_many(records)
    return 'success dataframe_to_mongodb '
#data_to_dataframe(df)
HTML 内容为 table 时的解析办法:
说明:XPath 中不需要写 tbody——tbody 通常是浏览器自动补上的,原始 HTML 中往往并不存在,带上它可能匹配不到节点。
import pandas as pd
from lxml import html
# Example: load one HTML table from a web page into a DataFrame.
# `html` here is the lxml.html module imported above.
url = "http://www.uesp.net/wiki/Skyrim:No_Stone_Unturned"
# Third <table> directly under the element with id "mw-content-text".
xpath = "//*[@id=\"mw-content-text\"]/table[3]"
# Parse the document at `url`.
tree = html.parse(url)
# xpath() returns a list of matches; take the first (raises IndexError if none).
table = tree.xpath(xpath)[0]
# Serialize just that table element back to markup so pandas can parse it.
raw_html = html.tostring(table)
# read_html returns a list of DataFrames; header=0 uses the first row as column names.
dta = pd.read_html(raw_html, header=0)[0]
# Add a "completed" column initialised to 0 for every row.
dta["completed"] = 0
# Drop the "Map" column.
del dta["Map"]
参考地址:https://gist.github.com/jseabold/5892603
案例:
from lxml import etree
import pandas as pd
# Example: convert the table with id "torrenttable" into a DataFrame.
# NOTE(review): `html` is not assigned in this snippet; it presumably has to be
# an already-parsed lxml element (e.g. the result of etree.HTML(...)) — confirm.
table = html.xpath('//table[@id="torrenttable"]')[0]
# Serialize the matched table element back to markup.
raw_html = etree.tostring(table)
# read_html returns a list of DataFrames; header=0 uses the first row as column names.
data = pd.read_html(raw_html, header=0)[0]
读取 MongoDB 数据内容
import pandas as pd
from sqlalchemy import create_engine
def read_mongb(host='67.216.204.220', port=27017):
    """Load the whole ``pt.pt_btschool_net_torrents`` collection into a DataFrame.

    Args:
        host: MongoDB server host name or address.
        port: MongoDB server port.

    Returns:
        A ``pandas.DataFrame`` with one row per document returned by
        ``find()`` on the collection.
    """
    from pymongo import MongoClient

    # Materialise the cursor while the client is open; the context manager
    # closes the client afterwards.
    with MongoClient(host, port) as client:
        documents = list(client.pt.pt_btschool_net_torrents.find())
    return pd.DataFrame(documents)
# Load the collection into a DataFrame.
data = read_mongb()
# The two bare expressions below only show output in an interactive session
# or notebook; when run as a script their values are discarded.
data.head()
data.columns

 
                
            
         
         浙公网安备 33010602011771号
浙公网安备 33010602011771号