Python3 Web Scraping (1)

A simple crawler: fetch the full HTML of the Baidu homepage

from urllib import request

url = "http://www.baidu.com"
# Spoof a desktop browser so the server returns the normal page
headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'}
page = request.Request(url, headers=headers)
# Open the Request object (not the bare url), otherwise the custom headers are ignored
page_info = request.urlopen(page).read().decode("utf-8")
print(page_info)
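In practice the request can fail (no network, non-200 response), and urlopen raises urllib.error.URLError in that case. A minimal error-handling variant, as a sketch rather than part of the original program:

from urllib import request, error

url = "http://www.baidu.com"
headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'}
req = request.Request(url, headers=headers)
try:
    # The with statement closes the connection; timeout avoids hanging forever
    with request.urlopen(req, timeout=10) as resp:
        page_info = resp.read().decode("utf-8")
    print(page_info)
except error.URLError as e:
    # HTTPError is a subclass of URLError, so this catches both
    print("request failed:", e)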

Bring in the BeautifulSoup module and extract the cnblogs post titles (the title links are matched by their CSS class)

from urllib import request
from bs4 import BeautifulSoup

url = r'https://www.cnblogs.com/'
headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'}
page = request.Request(url, headers=headers)
page_info = request.urlopen(page).read().decode("utf-8")

# Parse the page with Python's built-in HTML parser
soup = BeautifulSoup(page_info, "html.parser")
# Each front-page post title is an <a> tag with class "titlelnk"
titles = soup.find_all("a", "titlelnk")
for title in titles:
    print(title.string)
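find_all("a", "titlelnk") filters by class; the same thing can be written as a CSS selector with soup.select. A short sketch (the "titlelnk" class name depends on the cnblogs markup at the time and may have changed since):

# Equivalent CSS-selector form
for a in soup.select("a.titlelnk"):
    # .string is None when a tag contains nested markup; get_text() is safer
    print(a.get_text(strip=True))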

Grab the image URLs from a Zhihu question and download them

import time
from urllib import request
from bs4 import BeautifulSoup
import re

url = r'https://www.zhihu.com/question/22918070'
headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'}
page = request.Request(url, headers=headers)
page_info = request.urlopen(page).read().decode("utf-8")
soup = BeautifulSoup(page_info, "html.parser")

# Match thumbnail <img> tags whose src ends in ".jpg" (note the escaped dot:
# an unescaped "." would match any character)
links = soup.find_all("img", "origin_image zh-lightbox-thumb", src=re.compile(r'\.jpg$'))
local_path = r'E:\pic'

for link in links:
    print(link.attrs["src"])
    # Name each file with the current timestamp to keep filenames unique
    request.urlretrieve(link.attrs["src"], local_path + r'\%s.jpg' % time.time())
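Two caveats worth handling (not covered in the original code): urlretrieve fails if the target directory does not exist, and Zhihu sometimes serves protocol-relative src values ("//pic..."). A minimal hardening sketch, assuming the same links and local_path as above; note that urlretrieve sends urllib's default User-Agent, so some sites may still reject it:

import os

os.makedirs(local_path, exist_ok=True)  # create E:\pic if it is missing
for i, link in enumerate(links):
    src = link.attrs["src"]
    if src.startswith("//"):            # protocol-relative URL: prepend a scheme
        src = "https:" + src
    # Sequential numbering instead of timestamps keeps filenames tidy
    request.urlretrieve(src, os.path.join(local_path, "%d.jpg" % i))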
