Data Collection: Assignment 3
Problem 1
Code
import os
import random
import threading
import time

import requests
from bs4 import BeautifulSoup


class BaseCrawler:
    def __init__(self, save_dir='./images'):
        self.save_dir = save_dir
        if not os.path.exists(self.save_dir):
            os.makedirs(self.save_dir)

    def get(self, url):
        # Fetch the page HTML; return None on failure
        try:
            response = requests.get(url, timeout=10)
            response.raise_for_status()
            return response.text
        except Exception as e:
            print(e)
            return None

    def to_links(self, html):
        # Extract absolute image URLs from the page
        links = []
        if not html:
            return links
        soup = BeautifulSoup(html, 'lxml')
        for img in soup.find_all('img'):
            src = img.get('src')
            if not src:
                continue
            if src.startswith('//'):
                src = 'https:' + src
            if not src.startswith('http'):
                continue
            links.append(src)
        return links

    def crawl(self, url):
        for link in self.to_links(self.get(url)):
            self.download(link)

    def download(self, url):
        # Save one image into save_dir, named after the last path segment
        try:
            img_path = url.split('/')[-1]
            with open(os.path.join(self.save_dir, img_path), 'wb') as f:
                f.write(requests.get(url).content)
            print('downloaded', url)
            return True
        except Exception as e:
            print('download failed', e)
            return False
class MultiCrawler(BaseCrawler):
    def __init__(self, urls=None, save_dir='./images', max_page=51, max_num=151):
        super().__init__(save_dir)
        self.urls = urls if urls is not None else []
        self.max_page = max_page
        self.max_num = max_num
        self.downloaded_count = 0
        self.downloaded_count_lock = threading.Lock()

    def worker(self, page_num):
        # Each thread handles one search-result page
        time.sleep(random.uniform(1, 3))
        url = f"https://search.dangdang.com/?key=%CA%E9%B0%FC&category_id=10009684&page_index={page_num}#J_tab"
        links = self.to_links(self.get(url))
        with self.downloaded_count_lock:
            if self.downloaded_count >= self.max_num:
                return
        for link in links:
            with self.downloaded_count_lock:
                if self.downloaded_count >= self.max_num:
                    break
                # Skip images that are already on disk
                img_path = link.split('/')[-1]
                if os.path.exists(os.path.join(self.save_dir, img_path)):
                    continue
            if self.download(link):
                with self.downloaded_count_lock:
                    self.downloaded_count += 1

    def crawl(self):
        threads = []
        for page_num in range(1, self.max_page + 1):
            t = threading.Thread(target=self.worker, args=(page_num,))
            threads.append(t)
            t.start()
        for t in threads:
            t.join()


# print('single-threaded')
# BaseCrawler().crawl("https://search.dangdang.com/?key=%CA%E9%B0%FC&category_id=10009684&page_index=1#J_tab")
print('multi-threaded')
MultiCrawler().crawl()
Results:

Dangdang was chosen as the target site, and both single-threaded and multi-threaded downloading are implemented. To keep downloads fast, the download call itself is not held under the lock, so when many threads run at once the final count may slightly exceed max_num.
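If an exact cap were needed, one option is to reserve a slot in the counter under the lock before calling download() and release it on failure. This is only a sketch of a variant of MultiCrawler.worker, not part of the submitted code:

def worker(self, page_num):
    # Hypothetical variant: reserve a slot before downloading so the total
    # can never exceed max_num, at the cost of some throughput.
    url = f"https://search.dangdang.com/?key=%CA%E9%B0%FC&category_id=10009684&page_index={page_num}#J_tab"
    for link in self.to_links(self.get(url)):
        with self.downloaded_count_lock:
            if self.downloaded_count >= self.max_num:
                return
            self.downloaded_count += 1      # reserve a slot first
        if not self.download(link):
            with self.downloaded_count_lock:
                self.downloaded_count -= 1  # give the slot back on failure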
Reflections
Learned how to crawl with multiple threads.
Link: https://gitee.com/wsxxs233/data-collection/tree/master/task3/blog/q1
Problem 2
Code
spider
import scrapy
from scrapy_selenium import SeleniumRequest

from stocks.items import StocksItem


class StocksSpider(scrapy.Spider):
    name = "stocks"

    def start_requests(self):
        url = 'https://quote.eastmoney.com/center/hszs.html'
        # Render the page with selenium so the JS-generated table is present
        yield SeleniumRequest(url=url, callback=self.parse, wait_time=30)

    def parse(self, response):
        table = response.xpath('//table[@class="quotetable_m"]/tbody')
        rows = table[0].xpath('.//tr')
        fields = ['id', 'code', 'stock_name', 'last_price', 'change_percent',
                  'change', 'volume', 'amplitude', 'high', 'low', 'open', 'close']
        for row in rows:
            item = StocksItem()
            cells = row.xpath('.//text()').extract()
            for i, name in enumerate(fields):
                item[name] = cells[i]
            yield item
This spider loads the page and parses its contents with XPath.
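The spider assumes a StocksItem with one field per parsed column. A minimal sketch of what stocks/items.py might contain (the field names come from the spider above; the rest is an assumption, the actual file is in the repository):

import scrapy


class StocksItem(scrapy.Item):
    # One field per column used in StocksSpider.parse (sketch only)
    id = scrapy.Field()
    code = scrapy.Field()
    stock_name = scrapy.Field()
    last_price = scrapy.Field()
    change_percent = scrapy.Field()
    change = scrapy.Field()
    volume = scrapy.Field()
    amplitude = scrapy.Field()
    high = scrapy.Field()
    low = scrapy.Field()
    open = scrapy.Field()
    close = scrapy.Field()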
middleware
from importlib import import_module

from scrapy import signals
from scrapy.exceptions import NotConfigured
from scrapy.http import HtmlResponse
from scrapy_selenium import SeleniumRequest
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait


class SeleniumMiddleware:
    """Scrapy middleware handling the requests using selenium"""

    def __init__(self, driver_name, driver_executable_path, driver_arguments,
                 browser_executable_path):
        """Initialize the selenium webdriver

        Parameters
        ----------
        driver_name: str
            The selenium ``WebDriver`` to use
        driver_executable_path: str
            The path of the executable binary of the driver
        driver_arguments: list
            A list of arguments to initialize the driver
        browser_executable_path: str
            The path of the executable binary of the browser
        """
        webdriver_base_path = f'selenium.webdriver.{driver_name}'

        driver_klass_module = import_module(f'{webdriver_base_path}.webdriver')
        driver_klass = getattr(driver_klass_module, 'WebDriver')

        driver_options_module = import_module(f'{webdriver_base_path}.options')
        driver_options_klass = getattr(driver_options_module, 'Options')

        driver_service_module = import_module(f'{webdriver_base_path}.service')
        driver_service_klass = getattr(driver_service_module, 'Service')

        driver_options = driver_options_klass()
        if browser_executable_path:
            driver_options.binary_location = browser_executable_path
        for argument in driver_arguments:
            driver_options.add_argument(argument)

        driver_service = driver_service_klass()
        if driver_executable_path:
            driver_service.executable_path = driver_executable_path

        # Newer selenium versions take Options/Service objects instead of the
        # removed executable_path keyword
        driver_kwargs = {
            'options': driver_options,
            'service': driver_service,
        }
        self.driver = driver_klass(**driver_kwargs)

    @classmethod
    def from_crawler(cls, crawler):
        """Initialize the middleware with the crawler settings"""
        driver_name = crawler.settings.get('SELENIUM_DRIVER_NAME')
        driver_executable_path = crawler.settings.get('SELENIUM_DRIVER_EXECUTABLE_PATH')
        browser_executable_path = crawler.settings.get('SELENIUM_BROWSER_EXECUTABLE_PATH')
        driver_arguments = crawler.settings.get('SELENIUM_DRIVER_ARGUMENTS')

        if not driver_name or not driver_executable_path:
            raise NotConfigured(
                'SELENIUM_DRIVER_NAME and SELENIUM_DRIVER_EXECUTABLE_PATH must be set'
            )

        middleware = cls(
            driver_name=driver_name,
            driver_executable_path=driver_executable_path,
            driver_arguments=driver_arguments,
            browser_executable_path=browser_executable_path
        )

        crawler.signals.connect(middleware.spider_closed, signals.spider_closed)
        return middleware

    def process_request(self, request, spider):
        """Process a request using the selenium driver if applicable"""
        if not isinstance(request, SeleniumRequest):
            return None

        self.driver.get(request.url)

        try:
            print('waiting for the page to load')
            # Wait until the <body> element is present
            WebDriverWait(self.driver, 15).until(
                EC.presence_of_element_located((By.TAG_NAME, "body"))
            )
            # Wait until the document has finished loading (AJAX done)
            WebDriverWait(self.driver, 15).until(
                lambda driver: driver.execute_script("return document.readyState") == "complete"
            )
            # Wait for the main content container to appear
            WebDriverWait(self.driver, 20).until(
                EC.presence_of_element_located((By.CLASS_NAME, "main-content"))
            )
            print('page loaded')
        except Exception as e:
            spider.logger.warning(f"wait timed out: {request.url}, error: {str(e)}")

        for cookie_name, cookie_value in request.cookies.items():
            self.driver.add_cookie(
                {
                    'name': cookie_name,
                    'value': cookie_value
                }
            )

        if request.wait_until:
            WebDriverWait(self.driver, request.wait_time).until(
                request.wait_until
            )

        if request.screenshot:
            request.meta['screenshot'] = self.driver.get_screenshot_as_png()

        if request.script:
            self.driver.execute_script(request.script)

        body = str.encode(self.driver.page_source)

        # Expose the driver via the "meta" attribute
        request.meta.update({'driver': self.driver})

        return HtmlResponse(
            self.driver.current_url,
            body=body,
            encoding='utf-8',
            request=request
        )

    def spider_closed(self):
        """Shutdown the driver when spider is closed"""
        self.driver.quit()
This middleware is adapted from scrapy_selenium (scrapy_selenium.middlewares.SeleniumMiddleware). Because of version differences, the scrapy_selenium library does not work well with newer versions of selenium, mainly in how the webdriver is constructed: the new webdriver removed the executable_path parameter in favor of Options/Service objects, so the code above was adjusted accordingly. Alternatively, the middleware from scrapy_selenium_addon can be used directly.
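For reference, the middleware reads its configuration from the project settings in from_crawler. A minimal settings.py fragment might look like the following sketch; the driver paths and the middleware module path are placeholders, not the repository's actual values:

# settings.py (sketch): values consumed by SeleniumMiddleware.from_crawler
SELENIUM_DRIVER_NAME = 'chrome'
SELENIUM_DRIVER_EXECUTABLE_PATH = '/path/to/chromedriver'   # placeholder path
SELENIUM_BROWSER_EXECUTABLE_PATH = None                     # use the default browser install
SELENIUM_DRIVER_ARGUMENTS = ['--headless']                  # run without a visible window

DOWNLOADER_MIDDLEWARES = {
    'stocks.middlewares.SeleniumMiddleware': 800,            # assumed module path
}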
See the Gitee repository for the complete code.
Launcher script: run.py
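run.py itself is not shown in this post; a minimal sketch of such a launcher, assuming the standard Scrapy API and running the spider by its name, could be:

# run.py (sketch): start the "stocks" spider from a script instead of the
# `scrapy crawl stocks` command line.
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

if __name__ == '__main__':
    process = CrawlerProcess(get_project_settings())
    process.crawl('stocks')   # spider name defined in StocksSpider.name
    process.start()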
Results:

Reflections
Learned how to use Scrapy middlewares and the selenium library, and how to configure the related drivers.
Link: https://gitee.com/wsxxs233/data-collection/tree/master/task3/blog/q2
Problem 3
Code
import scrapy

from exchange.items import ExchangeItem


class ExchangeSpider(scrapy.Spider):
    name = "exchange"

    def start_requests(self):
        url = 'https://www.boc.cn/sourcedb/whpj/'
        yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        # The exchange-rate table; skip the header row
        table = response.xpath('//table[@cellpadding="0" and @align="left"]')[0]
        rows = table.xpath('.//tr[position()>1]')
        for row in rows:
            tds = row.xpath('.//td')
            if len(tds) == 8:
                item = ExchangeItem()
                item['Currency'] = tds[0].xpath('./text()').extract_first()
                item['TBP'] = tds[1].xpath('./text()').extract_first()
                item['CBP'] = tds[2].xpath('./text()').extract_first()
                item['TSP'] = tds[3].xpath('./text()').extract_first()
                item['CSP'] = tds[4].xpath('./text()').extract_first()
                item['Time'] = tds[7].xpath('./text()').extract_first()
                yield item
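The spider imports ExchangeItem; a minimal sketch of what exchange/items.py might contain, based on the fields used above (the actual file is in the repository):

import scrapy


class ExchangeItem(scrapy.Item):
    # Fields used by ExchangeSpider.parse (sketch, not the repository file)
    Currency = scrapy.Field()   # currency name
    TBP = scrapy.Field()        # telegraphic transfer buying price
    CBP = scrapy.Field()        # cash buying price
    TSP = scrapy.Field()        # telegraphic transfer selling price
    CSP = scrapy.Field()        # cash selling price
    Time = scrapy.Field()       # publication time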
Results

Reflections
Learned how to extract content with XPath and store it in MySQL.
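The MySQL storage mentioned here is handled by an item pipeline in the repository; a minimal sketch of such a pipeline, assuming pymysql and a hypothetical exchange table whose columns match the item fields (connection parameters are placeholders):

import pymysql


class MysqlPipeline:
    # Sketch of a MySQL item pipeline; credentials and table layout are
    # placeholders, not the repository's actual configuration.
    def open_spider(self, spider):
        self.conn = pymysql.connect(host='localhost', user='root',
                                    password='password', database='spider',
                                    charset='utf8mb4')
        self.cursor = self.conn.cursor()

    def process_item(self, item, spider):
        self.cursor.execute(
            'INSERT INTO exchange (Currency, TBP, CBP, TSP, CSP, Time) '
            'VALUES (%s, %s, %s, %s, %s, %s)',
            (item['Currency'], item['TBP'], item['CBP'],
             item['TSP'], item['CSP'], item['Time'])
        )
        self.conn.commit()
        return item

    def close_spider(self, spider):
        self.cursor.close()
        self.conn.close()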
Link: https://gitee.com/wsxxs233/data-collection/tree/master/task3/blog/q3