学习进度02
import requests
from lxml import html
# Scrape the titles and poster image URLs listed on Douban's "now playing"
# cinema page and print them.

# Alias the etree API that is reachable through the imported lxml html module.
etree = html.etree

# Target page URL and the request headers sent with it.
url = 'https://movie.douban.com/cinema/nowplaying/langfang/'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3741.400 QQBrowser/10.5.3863.400',
    'Referer': 'https://movie.douban.com/'
}

# Without a timeout requests can block forever on a stalled connection.
resp = requests.get(url, headers=headers, timeout=10)
# Fail loudly on 4xx/5xx instead of trying to parse an error page.
resp.raise_for_status()

# resp.text is the decoded str; resp.content is the raw bytes.
# Parse the markup into an element tree (<class 'lxml.etree._Element'>).
# The result is named `page` so the imported `html` module is not shadowed.
page = etree.HTML(resp.text)
print(type(page))

# Only the first <ul class="lists"> is used.
# NOTE(review): presumably this is the "now playing" list — confirm against the live page.
movie_lists = page.xpath("//ul[@class='lists']")
if not movie_lists:
    # Give a readable failure instead of a bare IndexError when the layout
    # changes or the site serves a different page.
    raise SystemExit("No <ul class='lists'> element found in the response from " + url)
ul = movie_lists[0]

# Each direct <li> child is one entry; xpath() returns lists, printed as-is.
lis = ul.xpath("./li")
for li in lis:
    name = li.xpath("@data-title")
    print(name)
    img = li.xpath(".//img/@src")
    print(img)
爬取豆瓣电影中正在上映的电影名称及其海报 URL。
了解了正则表达式、lxml 和 bs4 三者之间的区别以及各自的优缺点。
浙公网安备 33010602011771号