一、环境简介

使用anaconda编程环境

安装requests和lxml工具包:pip install requests 以及 pip install lxml

二、Xpath简介

Xpath即为XML路径语言,用来确定XML文档中某部分位置的语言。

Xpath基于XML的树状结构,提供在数据结构树中寻找节点的能力

   

三、爬取豆瓣电影《复仇者联盟4》详情信息

爬虫操作步骤:请求并下载电影页面信息、解析并定位基本信息、保存数据信息

# Import requests and lxml: requests downloads the page, lxml.etree parses it.
# (Reconstructed: the original snippet had lost its line breaks and would not run.)
import requests
from lxml import etree

# Movie detail page for "Avengers: Endgame" on Douban.
url = "https://movie.douban.com/subject/26100958/?from=showing"
data = requests.get(url).text  # raw HTML of the page
s = etree.HTML(data)           # parsed element tree for xpath queries

# Pattern: texts = s.xpath('<xpath of element>/text()')
film = s.xpath('//*[@id="content"]/h1/span[1]/text()')
# Directors: a[1]/a[2] selects a single director; dropping the index returns all.
director = s.xpath('//*[@id="info"]/span[1]/span[2]/a/text()')
# director2 = s.xpath('//*[@id="info"]/span[1]/span[2]/a[2]/text()')  # a single director
screenwriter = s.xpath('//*[@id="info"]/span[2]/span[2]/a/text()')
# Main actors: a[x] selects one actor; dropping [x] returns all of them.
actor = s.xpath('//*[@id="info"]/span[3]/span[2]/a/text()')
# Genres are spread over several sibling <span> elements.
style1 = s.xpath('//*[@id="info"]/span[5]/text()')
style2 = s.xpath('//*[@id="info"]/span[6]/text()')
style3 = s.xpath('//*[@id="info"]/span[7]/text()')
style4 = s.xpath('//*[@id="info"]/span[8]/text()')
# Running time. NOTE: this name would shadow the stdlib `time` module if it
# were imported in this script.
time = s.xpath('//*[@id="info"]/span[15]/text()')

print('电影名字:',film)
print('导演:',director)
print('编剧:',screenwriter)
print('主演:',actor)
print('类型:',style1,style2,style3,style4)
print('片长:',time)

 四、爬取豆瓣TOP250的图书

1. 浏览器复制xpath不是完全可靠,发现tbody标签需手动删除,table[1] 中 [x] 可去除,表示所有书籍。

# Fetch the Douban book Top-250 page and print every book title.
import requests
from lxml import etree
import time

page_html = requests.get("https://book.douban.com/top250").text
tree = etree.HTML(page_html)

# NOTE: the xpath copied from the browser contains a /tbody/ segment that the
# served document lacks; with it the query returns an empty list, so it is
# removed. Dropping the [1] index after `table` matches every book on the page.
titles = tree.xpath('//*[@id="content"]/div/div[1]/div/table/tr/td[2]/div[1]/a/@title')
for book_title in titles:
    print(book_title)

2. 爬取当页所有书籍书名、网址、评分、点评人数、一句评语

# Scrape title, URL, score, rating count and one-line quote for every book
# on the first page of Douban's book Top-250.
import requests
from lxml import etree
import time

url = "https://book.douban.com/top250"
data = requests.get(url).text
s = etree.HTML(data)

# Select one <table> per book; no index after `table` -> all books on the page.
file = s.xpath('//*[@id="content"]/div/div[1]/div/table')
for div in file:
    title = div.xpath('./tr/td[2]/div[1]/a/@title')[0]  # book title ([0] unwraps the single-item list)
    href = div.xpath('./tr/td[2]/div[1]/a/@href')[0]    # detail-page link
    score = div.xpath('./tr/td[2]/div[2]/span[2]/text()')[0]  # rating (BUG FIX: this trailing note lacked a leading '#', a SyntaxError)
    # strip the surrounding parentheses and whitespace around the vote count
    num = div.xpath('./tr/td[2]/div[2]/span[3]/text()')[0].strip('(').strip().strip(')')
    scrible = div.xpath('./tr/td[2]/p[2]/span/text()')
    time.sleep(1)  # throttle requests so the crawler does not get banned
    # Some books have no one-line quote; guard against indexing an empty
    # list (same handling as the multi-page crawler in this file).
    if scrible:
        print('{} {} {} {} {}'.format(title,href,score,num,scrible[0]))
    else:
        print('{} {} {} {}'.format(title,href,score,num))

 

3. 爬取10个页面共250个书籍数据

# Crawl all ten pages (250 books in total) of Douban's book Top-250.
import requests
from lxml import etree
import time

for page in range(10):
    # Each page shows 25 books; `start` is the offset of the first one.
    page_url = "https://book.douban.com/top250?start=" + str(page * 25)
    html = requests.get(page_url).text
    tree = etree.HTML(html)
    book_tables = tree.xpath('//*[@id="content"]/div/div[1]/div/table')

    for table in book_tables:
        name = table.xpath('./tr/td[2]/div[1]/a/@title')[0]
        link = table.xpath('./tr/td[2]/div[1]/a/@href')[0]
        rating = table.xpath('./tr/td[2]/div[2]/span[2]/text()')[0]
        # strip the parentheses and whitespace wrapping the vote count
        votes = table.xpath('./tr/td[2]/div[2]/span[3]/text()')[0].strip('(').strip().strip(')')
        quote = table.xpath('./tr/td[2]/p[2]/span/text()')
        time.sleep(1)  # slow down so the client does not get banned
        # Books without a one-line quote print only the first four fields.
        if quote:
            print('{} {} {} {} {}'.format(name, link, rating, votes, quote[0]))
        else:
            print('{} {} {} {}'.format(name, link, rating, votes))

五、爬取深圳小猪租房短租

1. 爬取标题信息

# Print the title of every listing on the Xiaozhu Shenzhen front page.
import requests
from lxml import etree
import time

homepage = requests.get('http://sz.xiaozhu.com/').text
tree = etree.HTML(homepage)

# Select one <li> per listing and read the title relative to it,
# instead of grabbing all title spans with a single absolute xpath.
listings = tree.xpath('//*[@id="page_list"]/ul/li')
time.sleep(2)
for listing in listings:
    print(listing.xpath('./div[2]/div/a/span/text()')[0])

2. 爬取一页信息(位置、价格、房屋状况、链接、图片)

# Scrape one page of listings: title, price, description and cover image URL.
import requests
from lxml import etree
import time

html = requests.get('http://sz.xiaozhu.com/').text
tree = etree.HTML(html)

listings = tree.xpath('//*[@id="page_list"]/ul/li')  # one <li> per listing
time.sleep(2)
for item in listings:
    name = item.xpath('./div[2]/div/a/span/text()')[0]
    cost = item.xpath('./div[2]/div[1]/span/i/text()')[0]
    desc = item.xpath('./div[2]/div[2]/em/text()')[0]
    # Images are lazy-loaded, so the URL lives in the lazy_src attribute.
    img = item.xpath('./a/img/@lazy_src')[0]
    print('{} {} {} {}\n '.format(name, cost, desc, img))

3. 爬取前三页信息

# Crawl the first three result pages of Shenzhen short-term rentals.
import requests
from lxml import etree
import time

for page_no in range(1, 4):
    page_url = 'http://sz.xiaozhu.com/search-duanzufang-p{}-0/'.format(page_no)
    html = requests.get(page_url).text

    tree = etree.HTML(html)
    listings = tree.xpath('//*[@id="page_list"]/ul/li')
    time.sleep(3)  # pause between pages to stay polite

    for item in listings:
        name = item.xpath('./div[2]/div/a/span/text()')[0]
        cost = item.xpath('./div[2]/div[1]/span/i/text()')[0]
        desc = item.xpath('./div[2]/div[2]/em/text()')[0].strip()
        img = item.xpath('./a/img/@lazy_src')[0]  # lazy-loaded image URL
        print('{} {} {}\n {} '.format(name, cost, desc, img))

 六、数据存储(基于 with open语句)

"""
name: 包含文件名的字符串
mode:文件打开模式(r/只读;w/只写;a/追加到末尾)
encoding:数据编码格式,一般为utf-8或者gbk
file : 在文件中对文件的命名
"""
with open (name,mode,encoding) as file:
    file.write()

 

# Save the first three pages of Shenzhen short-term rentals to a text file.
import requests
from lxml import etree
import time

# Write-only mode, utf-8 encoding; change the path for your machine.
with open('/Users/cc/Desktop/xiaozhu.txt','w',encoding='utf-8') as f:
    for page_no in range(1, 4):
        page_url = 'http://sz.xiaozhu.com/search-duanzufang-p{}-0/'.format(page_no)
        html = requests.get(page_url).text

        tree = etree.HTML(html)
        listings = tree.xpath('//*[@id="page_list"]/ul/li')
        time.sleep(3)  # throttle between pages

        for item in listings:
            name = item.xpath('./div[2]/div/a/span/text()')[0]
            cost = item.xpath('./div[2]/div[1]/span/i/text()')[0]
            desc = item.xpath('./div[2]/div[2]/em/text()')[0].strip()
            img = item.xpath('./a/img/@lazy_src')[0]  # lazy-loaded image URL
            f.write('{}, {}, {}, {}\n '.format(name, cost, desc, img))

# Same crawl as above, but stored as a .csv file.
import requests
from lxml import etree
import time

with open('/Users/cc/Desktop/xiaozhu.csv','w',encoding='utf-8') as f:
    for page_no in range(1, 4):
        page_url = 'http://sz.xiaozhu.com/search-duanzufang-p{}-0/'.format(page_no)
        html = requests.get(page_url).text

        tree = etree.HTML(html)
        listings = tree.xpath('//*[@id="page_list"]/ul/li')
        time.sleep(3)  # throttle between pages

        for item in listings:
            name = item.xpath('./div[2]/div/a/span/text()')[0]
            cost = item.xpath('./div[2]/div[1]/span/i/text()')[0]
            desc = item.xpath('./div[2]/div[2]/em/text()')[0].strip()
            img = item.xpath('./a/img/@lazy_src')[0]  # lazy-loaded image URL
            f.write('{}, {}, {}, {}\n '.format(name, cost, desc, img))

乱码解决方式:将数据通过记事本另存为ANSI格式,然后通过excel数据→新建查询→文本导入数据

 豆瓣top250书籍数据存储

# Persist all 250 Top-250 books (title, link, score, votes, quote) to a CSV.
import requests
from lxml import etree
import time

with open('/Users/cc/Desktop/top250.csv','w',encoding='utf-8') as f:
    for page in range(10):
        page_url = "https://book.douban.com/top250?start={}".format(page * 25)
        html = requests.get(page_url).text
        tree = etree.HTML(html)
        book_tables = tree.xpath('//*[@id="content"]/div/div[1]/div/table')
        for table in book_tables:
            name = table.xpath('./tr/td[2]/div[1]/a/@title')[0]
            link = table.xpath('./tr/td[2]/div[1]/a/@href')[0]
            rating = table.xpath('./tr/td[2]/div[2]/span[2]/text()')[0]
            # strip parentheses and whitespace wrapping the vote count
            votes = table.xpath('./tr/td[2]/div[2]/span[3]/text()')[0].strip('(').strip().strip(')')
            quote = table.xpath('./tr/td[2]/p[2]/span/text()')
            time.sleep(2)  # throttle requests between books
            # Write the one-line quote column only when the book has one.
            if quote:
                f.write('{}, {}, {}, {}, {}\n'.format(name, link, rating, votes, quote[0]))
            else:
                f.write('{}, {}, {}, {}\n'.format(name, link, rating, votes))