Web Crawler Final Exam: Long-Answer Questions

Functions and parameters of the requests library
The 5 basic element types of BeautifulSoup
The bs4 library's lxml parser
The tag tree diagram: downward, upward, and sibling traversal
How the three forms of information markup mark data
The regular-expression syntax table
The 6 main functions of the re library
How re library functions are equivalent to the compiled-pattern (re.compile) form
The Scrapy framework; the command to create a spider (genspider) and to run one (crawl)
A regular expression for mainland-China phone numbers (a sketch of the last two regex items follows this list)
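
A minimal sketch of those two regex items. The exact patterns are my assumptions for illustration, not taken from the notes:

import re

text = "Office: 010-62791234, mobile: 13812345678"

# The functional form and its compiled-pattern equivalent give the same result
print(re.findall(r"1[3-9]\d{9}", text))   # functional usage
pat = re.compile(r"1[3-9]\d{9}")          # equivalent compiled pattern
print(pat.findall(text))

# One simplified pattern for mainland-China numbers (assumed):
# landline 0 + 2-3 digit area code + 7-8 digits, or an 11-digit mobile starting 13-19
phone = re.compile(r"(?:0\d{2,3}-\d{7,8}|1[3-9]\d{9})")
print(phone.findall(text))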

Reading and writing CSV files

# Method 1: re.findall
import re
import requests
import csv

r = requests.get("http://python123.io/ws/demo.html")
a = r.text
b = re.compile(r'[1-5]\d{5}')             # six-digit numbers starting with 1-5
c = b.findall(a)                          # all matches as a list of strings
l1 = [["No.", "Match"]]                   # header row
with open("e://guoli/1.csv", "w", newline="") as f1:
    x1 = csv.writer(f1)
    x1.writerows(l1)
with open("e://guoli/1.csv", "a", newline="") as f2:
    x2 = csv.writer(f2)
    for i in range(len(c)):
        x2.writerows([[i + 1, c[i]]])     # one numbered row per match

# Method 2: re.finditer
import re
import requests
import csv

r = requests.get("http://python123.io/ws/demo.html")
a = r.text
b = re.compile(r'[1-5]\d{5}')
c = b.finditer(a)                         # iterator of match objects
l1 = [["No.", "Match"]]
with open("e://guoli/2.csv", "w", newline="") as f1:
    x1 = csv.writer(f1)
    x1.writerows(l1)
t = 1
with open("e://guoli/2.csv", "a", newline="") as f2:
    x2 = csv.writer(f2)
    for i in c:
        x2.writerows([[t, i.group()]])    # .group() returns the matched text
        t = t + 1
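
The heading promises reading as well as writing; a minimal sketch of reading one of the files back with csv.reader (path assumed from Method 1):

import csv

with open("e://guoli/1.csv", "r", newline="") as f:
    for row in csv.reader(f):
        print(row)                        # each row comes back as a list of strings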

Listing the tags in a site's page source

# Method 1: find_all(True)
import requests
from bs4 import BeautifulSoup

r = requests.get('https://s.taobao.com/search?q=书包')
a = r.text
b = BeautifulSoup(a, 'html.parser')
e = "{:4}\t{:4}"
print(e.format("No.", "Tag"))
x = 1
for j in b.find_all(True):                # True matches every tag in the tree
    if j.name is not None:
        print(e.format(x, j.name))
        x += 1

# Method 2: descendants
import requests
from bs4 import BeautifulSoup

r = requests.get('https://s.taobao.com/search?q=书包')
a = r.text
b = BeautifulSoup(a, 'html.parser')
e = "{:4}\t{:4}"
print(e.format("No.", "Tag"))
x = 1
for j in b.descendants:                   # this line differs: walks every node
    if j.name is not None:                # skips text nodes, whose name is None
        print(e.format(x, j.name))
        x += 1
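
The difference between the two methods: find_all(True) returns only Tag objects, while .descendants also yields NavigableString text nodes, whose .name is None. That is why the name check actually filters something in Method 2 but is a no-op in Method 1.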

Fetching and saving page source

import requests
from bs4 import BeautifulSoup

a = 'http://www.python123.io/ws/demo.html'
try:
    b = requests.get(a, timeout=30)
    b.raise_for_status()                  # raise on 4xx/5xx responses
    b.encoding = b.apparent_encoding      # guess the encoding from the body
except:
    print("Failed to fetch data!")
    raise                                 # stop here; b would be undefined below
with open('e://guoli/3.txt', 'w', encoding='utf-8') as f:
    f.write("Fetched data:")
    f.write(b.text)                       # the with block closes the file itself
d = BeautifulSoup(b.text, 'html.parser')
with open('e://guoli/3.txt', 'a', encoding='utf-8') as f:
    f.write("\nPrettified data:\n")
    f.write(d.prettify())
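
A note on the second write: prettify() returns the parse tree re-serialized with one tag per line and indentation, which is what "prettified" means here.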

Attributes of the two main objects of the Requests library
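
This heading has no body in the notes; a minimal sketch of the Response attributes usually meant here (the Request object is the other main object):

import requests

r = requests.get("http://python123.io/ws/demo.html")
print(r.status_code)                      # HTTP status code, 200 on success
print(r.encoding)                         # encoding guessed from the headers
print(r.apparent_encoding)                # encoding guessed from the content
print(r.text[:60])                        # body decoded as text
print(r.content[:16])                     # body as raw bytes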

The 7 main methods of the Requests library
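
Again no body in the notes; the seven methods, with requests.request as the base method the other six wrap (httpbin.org is an assumed echo service for illustration):

import requests

url = "http://httpbin.org/anything"
print(requests.request("get", url).status_code)   # base method
print(requests.get(url).status_code)              # GET: fetch a resource
print(requests.head(url).status_code)             # HEAD: headers only
print(requests.post(url).status_code)             # POST: submit data
print(requests.put(url).status_code)              # PUT: replace a resource
print(requests.patch(url).status_code)            # PATCH: modify part of one
print(requests.delete(url).status_code)           # DELETE: remove a resource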

The universal code framework for crawling a page
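
One common version of the framework (the function name is my choice):

import requests

def get_html_text(url):
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()              # raise HTTPError on a bad status
        r.encoding = r.apparent_encoding
        return r.text
    except:
        return "Request failed"

print(get_html_text("http://www.python123.io/ws/demo.html")[:100])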

Crawling an image

import requests
import os

url = 'https://b0.bdstatic.com/f656c7b03072e3675a6b51ae07cb5626.jpg@h_1280'
p2 = 'e://guoli/'
p3 = p2 + url.split('/')[-1] + '.jpg'     # file name from the URL's last segment
try:
    if not os.path.exists(p2):
        os.mkdir(p2)                      # create the target directory
    if not os.path.exists(p3):
        r = requests.get(url)
        a = r.content                     # raw bytes; no text decoding for images
        with open(p3, 'wb') as f:         # 'wb': write binary
            f.write(a)
        print("File saved")
    else:
        print('File already exists!')
except:
    print("Crawl failed")

Crawling JD with try and finally

import requests

a = 'https://www.jd.com'
try:
    r = requests.get(a)
    # r = requests.request('get', a)      # equivalent call via the base method
    print(r.status_code)
    r.encoding = r.apparent_encoding
    print(r.text[:500])                   # first 500 characters of the page
except:
    print('Crawl failed!')
finally:
    print('Exam passed')                  # finally runs whether or not an exception occurred
