Steps of a web crawler:
1) Use Python to fetch the URL's source (send a request to the server)
2) Get the response object and read the page source from it
3) Parse the source (with regular expressions) to extract the target data
4) Store the scraped results (in a file, or in a database; see the sqlite3 sketch after the script)
from urllib.request import urlopen
import re
import csv

url = "http://www.lagou.com"
response = urlopen(url)
# print(response)
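
# Caveat (an assumption, not from the original notes): many sites,
# lagou.com included, may reject requests that carry urllib's default
# User-Agent. If urlopen() raises an HTTPError or returns an error page,
# try sending a browser-like header via urllib.request.Request, e.g.:
#   from urllib.request import Request
#   req = Request(url, headers={"User-Agent": "Mozilla/5.0"})
#   response = urlopen(req)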

# read() returns the response body as bytes
# (note: the body can only be read once per response)
# print(response.read())

# decode() converts those bytes into a string
# print(response.read().decode())

# Parse the source: scrape all the links on the lagou.com page
html = response.read().decode()

# The r prefix marks a raw string: backslashes are kept as-is
# instead of being treated as escape characters
# res_url = r"<a.*?href=\".*?\""
res_url = r"<a.*?href=\"(http.*?)\""

# re.findall(pattern, string_to_search) returns a list of all matches
urls = re.findall(res_url, html)

# for i in urls:
#     print(i)

# Store the results in a txt file
# with open("lagou_urls.txt", "wt") as f:
#     for i in urls:
#         f.write(i + "\n")

# Store the results in a csv file
with open("lagou_urls.csv", "wt", newline="") as f:
    writer = csv.writer(f)
    for i in urls:
        writer.writerow([i])
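
Step 4 also mentions writing the results to a database. Here is a minimal sketch using Python's built-in sqlite3 module; the database file crawler.db and the table name urls are made up for illustration, and urls is the list produced by the script above.

import sqlite3

# Connect to the database (the file is created if it does not exist)
conn = sqlite3.connect("crawler.db")
cur = conn.cursor()
cur.execute("CREATE TABLE IF NOT EXISTS urls (url TEXT)")
# executemany() inserts one row per scraped link
cur.executemany("INSERT INTO urls (url) VALUES (?)", [(u,) for u in urls])
conn.commit()
conn.close()

sqlite3 needs no separate server, which makes it a convenient default for small crawls; for larger jobs the same pattern carries over to other databases.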