python爬虫糗事百科

import re
from lxml import etree
import requests
from bs4 import BeautifulSoup
import requests
# Listing page to download and parse.
url = 'https://www.qiushibaike.com/imgrank/'
headers = {
    # NOTE(review): 'RequestURL' is not a standard HTTP request header; it looks
    # like a value copied from browser dev tools -- confirm whether it is needed.
    'RequestURL': 'https://eclick.baidu.com/fp.htm?br=2&fp=8D1371255901FBD7974323B7D8E17C98&fp2=E8ECE829116D278272FB03F89C616E7E&ci=&bi=&im=0&wf=0&ct=2011&bp=&m=&t=0&ft=&_=1606374621886',
    # Browser-like User-Agent sent with the request.
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36'
}
# NOTE(review): only an 'http' proxy is configured while `url` is https, so this
# proxy is presumably not applied to the request below -- confirm, and add an
# 'https' entry if the request is meant to go through it.
proxies = {
    'http': '124.205.155.147:9090'
}
# requests applies no timeout unless one is given; without it a dead server or
# proxy would block the script forever.
rep = requests.get(url, headers=headers, proxies=proxies, timeout=10)
# Stop on 4xx/5xx instead of parsing an error page as if it were the listing.
rep.raise_for_status()
# Parsed document tree; the loops further down run their XPath queries on it.
html = etree.HTML(rep.content.decode('utf-8'))
# print(html.xpath("//div[contains(@class,'article block untagged mb15')][1]//div[@class='content']/span")[0].text)
# print(div_tags)
# Male section: print one record for every post whose author has the male
# gender badge (div class 'articleGender manIcon').
print("==================男生数据=====================")
div_tag = html.xpath("//div[contains(@class,'article block untagged mb15')]")
for i in div_tag:
    t = i.xpath(".//div[@class='articleGender manIcon']")
    if not t:
        # No male badge on this post: it does not belong under this heading.
        continue
    temp_xpath = i.xpath(".//div[@class='content']/span")
    name_xpath = i.xpath(".//h2")
    # Number shown in the vote counter.
    ping_xpath = i.xpath(".//div[@class='stats']//span[@class='stats-vote']//i[@class='number']")
    # Number shown in the comment counter.
    pingshus_xpath = i.xpath(".//div[@class='stats']//span[@class='stats-comments']//i[@class='number']")
    img_xpath = i.xpath(".//div[@class='thumb']//a//img")
    # `.text` is None for an element without leading text, hence the `or ''`
    # guards before concatenating.
    if name_xpath:
        print("\n"+"姓名:"+(name_xpath[0].text or '').replace('\n', ''))
    print("男年龄:"+(t[0].text or ''))
    if temp_xpath:
        # `.text` stops at the first child element (e.g. a <br/>); itertext()
        # collects the text of the whole span.
        print("内容:"+''.join(temp_xpath[0].itertext()).replace('\n', ''))
    if ping_xpath:
        print("好笑:"+(ping_xpath[0].text or ''))
    if pingshus_xpath:
        print("评论数:"+(pingshus_xpath[0].text or ''))
    if img_xpath:
        src = img_xpath[0].attrib.get('src')
        if src:
            # Only a scheme-less '//host/path' value needs the 'http:' prefix.
            print("图片:"+("http:"+src if src.startswith('//') else src))

print("\n")
# Female section: print one record for every post whose author has the female
# gender badge (div class 'articleGender womenIcon').
print("==================女生数据=====================")
div_tag = html.xpath("//div[contains(@class,'article block untagged mb15')]")
for i in div_tag:
    t2 = i.xpath(".//div[@class='articleGender womenIcon']")
    if not t2:
        # No female badge on this post: it does not belong under this heading.
        continue
    temp_xpath = i.xpath(".//div[@class='content']/span")
    name_xpath = i.xpath(".//h2")
    # Number shown in the vote counter.
    ping_xpath2 = i.xpath(".//div[@class='stats']//span[@class='stats-vote']//i[@class='number']")
    # Number shown in the comment counter.
    pingshus2_xpath = i.xpath(".//div[@class='stats']//span[@class='stats-comments']//i[@class='number']")
    img_xpath2 = i.xpath(".//div[@class='thumb']//a//img")
    # `.text` is None for an element without leading text, hence the `or ''`
    # guards before concatenating.
    if name_xpath:
        print('\n'+"姓名:"+(name_xpath[0].text or '').replace('\n', ''))
    print("女年龄:"+(t2[0].text or ''))
    if temp_xpath:
        # `.text` stops at the first child element (e.g. a <br/>); itertext()
        # collects the text of the whole span.
        print("内容:"+''.join(temp_xpath[0].itertext()).replace('\n', ''))
    if ping_xpath2:
        print("好笑:"+(ping_xpath2[0].text or ''))
    if pingshus2_xpath:
        print("评论数:"+(pingshus2_xpath[0].text or ''))
    if img_xpath2:
        src = img_xpath2[0].attrib.get('src')
        if src:
            # Only a scheme-less '//host/path' value needs the 'http:' prefix.
            print("图片:"+("http:"+src if src.startswith('//') else src))

(仅供学习参考)

posted @ 2021-03-15 15:05  outsider078  阅读(77)  评论(0)    收藏  举报