Free scraper code, ready to use. To crawl a different site, swap in your target URLs and tweak the page-parsing section.
Preface: I've scraped a good number of e-commerce sites, large and small. The takeaway is that they fall into two types:
Type 1: requests can fetch the page directly.
Type 2: the page is dynamically loaded, and requests alone fails to get the content.
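A quick way to tell which type you're facing (my aside, not from the original post): fetch the page with requests and check whether the product nodes already exist in the raw HTML. If they don't, the products are rendered by JavaScript and you need the browser route below. The probe_selector argument here is hypothetical: pass whatever CSS selector matches product items on your target listing.

import requests
from bs4 import BeautifulSoup

def is_static_page(url, probe_selector):
    # Returns True when the raw HTML already contains the product nodes,
    # i.e. requests alone is enough and no browser rendering is needed.
    headers = {'User-Agent': 'Mozilla/5.0'}
    html = requests.get(url, headers=headers, timeout=30).text
    return len(BeautifulSoup(html, 'html.parser').select(probe_selector)) > 0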
Straight to the code.
1. First, import the needed libraries and set the chromedriver path (the driver is only needed for dynamically loaded pages; skip it for sites requests can fetch directly):
import time
import re
import os
import requests
import pandas as pd
from selenium import webdriver
from bs4 import BeautifulSoup

CHROME_DRIVER_PATH = '/Users/xxxx/PycharmProjects/爬虫/chromedriver'
2. Here is the main flow first; the helper methods it calls are pasted further down.
I'm scraping e-commerce sites, so the target is the product info on each listing page: description, link, sale price, and original price.
The core function for static pages:
# Handle a static listing page
def dealSoup(now_soup, cate_name, cate_url, now_page_num):
    # Collect the breadcrumb category hierarchy
    cate_span_tag_list = now_soup.select('.category-breadcrumb li')
    cate_all_text = ''
    for span_tag in cate_span_tag_list:
        cate_all_text += f"{span_tag.text.strip()}/"
    # Work out the total page count from the pager
    total_page_num = 1
    total_num_tag_list = now_soup.select('.site-pager li')
    if len(total_num_tag_list) == 0:
        pass  # no pager: single-page listing
    elif len(total_num_tag_list) == 1:
        # the only pager item holds the page count
        total_page_num = extractNum(total_num_tag_list[0].text)
        print(total_page_num)
    else:
        # the last pager item is the "next" arrow, so the count sits second to last
        total_page_num = extractNum(total_num_tag_list[-2].text)
        print(total_page_num)
    # Walk every product on the page
    tag_list = now_soup.select('.category-list div.item')
    if len(tag_list) > 0:
        print(len(tag_list))
        item_list = []
        for tag in tag_list:
            item = {
                'cate_name_all': cate_all_text[:-1],  # drop the trailing '/'
                'cate_name': cate_name,
                'cate_url': cate_url,
                'product_now_price': 'null',
                'product_old_price': 'null'
            }
            desc_tag = tag.select('.name > a')[0]
            price_tag_list = tag.select('.my-shop-price')
            item['product_desc'] = desc_tag.text.strip()
            item['product_link'] = desc_tag.attrs['href']
            if len(price_tag_list) > 0:
                item['product_now_price'] = price_tag_list[0].attrs['data-oprice']
                item['product_old_price'] = price_tag_list[0].attrs['data-oprice']
            if len(price_tag_list) > 1:
                item['product_old_price'] = price_tag_list[1].attrs['data-oprice']
            print(item)
            item_list.append(item)
        objListToExcel(item_list, heads_0, f"{save_dir}/{cate_name}_{now_page_num}.xlsx")
        return True, total_page_num
    else:
        return False, total_page_num
if __name__ == "__main__":
    # Listing pages to crawl
    # cate_url: the listing page URL
    # cate_name: your own label for this listing page
    ALL_CATE_LIST = [
        {'cate_url': 'https://www.adorawe.net/category/denim-pants-c_808.html',
         'cate_name': 'Pants1'},
        {'cate_url': 'https://www.adorawe.net/category/casual-pants-c_809.html',
         'cate_name': 'Pants'},
    ]
    # Directory for the scraped data
    save_dir = '/Users/xxxx/Desktop/adorawe'
    if not os.path.exists(save_dir):
        os.mkdir(save_dir)
    # Crawl each listing page
    for cate_obj in ALL_CATE_LIST:
        # Fetch the page as a BeautifulSoup document
        soup = get_static_html(cate_obj['cate_url'])
        # Parse the page, save this page's products, and get the listing's total page count
        go_status, page_num = dealSoup(soup, cate_obj['cate_name'], cate_obj['cate_url'], 1)
        # Page through the rest of the listing
        for i in range(1, page_num):
            body_url = cate_obj['cate_url'].replace('.html', '')
            tmp_url = f"{body_url}-page-{i+1}.html"
            tmp_soup = get_static_html(tmp_url)
            go_status, page_num = dealSoup(tmp_soup, cate_obj['cate_name'], tmp_url, i + 1)
    # Each page was saved to its own file, so merge them into one
    connectToOne(save_dir, '/Users/xxx/Desktop', 'adorawe.xlsx')
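Note that the get_static_html helper is not defined anywhere in the post. Here is a minimal sketch of what it presumably does, assuming a plain GET with a browser User-Agent is enough for the target site:

def get_static_html(site_url):
    # Sketch of the missing helper: fetch the URL and return parsed soup.
    # The UA string and timeout are my assumptions, not from the original post.
    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) '
                             'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0 Safari/537.36'}
    resp = requests.get(site_url, headers=headers, timeout=30)
    resp.raise_for_status()  # fail fast on 4xx/5xx instead of parsing an error page
    return BeautifulSoup(resp.text, 'html.parser')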
Below is the dynamic page version:
# Handle a dynamically loaded page
def dealSoup(driver, cate_name, cate_url, page_num):
    now_data = driver.page_source
    now_soup = BeautifulSoup(now_data, 'html.parser')
    # Collect the breadcrumb category hierarchy
    cate_span_tag_list = now_soup.select('ul.breadcrumb > li')
    cate_all_text = ''
    for cate_span in cate_span_tag_list:
        cate_all_text += f"{cate_span.text.strip()}/"
    # Walk every product on the page
    tag_list = now_soup.select('div.product-list-container > .product-item')
    if len(tag_list) > 0:
        print(len(tag_list))
        item_list = []
        for tag in tag_list:
            item = {
                'cate_name_all': cate_all_text[:-1],  # drop the trailing '/'
                'cate_name': cate_name,
                'cate_url': cate_url,
                'product_now_price': 'null',
                'product_old_price': 'null'
            }
            desc_tag = tag.select('.product-item-name')[0]
            link_tag = desc_tag.select('a')[-1]
            final_price_tag_list = tag.select('.product-item-final-price-js')
            del_price_tag_list = tag.select('.product-item-del-price-js')
            item['product_desc'] = desc_tag.text.strip()
            item['product_link'] = link_tag.attrs['href']
            if len(final_price_tag_list) > 0:
                item['product_now_price'] = final_price_tag_list[0].text.strip()
                item['product_old_price'] = final_price_tag_list[0].text.strip()
            if len(del_price_tag_list) > 0:
                item['product_old_price'] = del_price_tag_list[0].text.strip()
            print(item)
            item_list.append(item)
        objListToExcel(item_list, heads_0, f"{save_dir}/{cate_name}_{page_num}.xlsx")
        return True
    else:
        return False
if __name__ == "__main__":
    # Listing pages to crawl
    # cate_url: the listing page URL
    # cate_name: your own label for this listing page
    # total_page: total page count for this listing
    ALL_CATE_LIST = [
        {'cate_url': 'https://sea.newchic.com/pajamas-and-robes-c-4185/?country=188&SEA=0',
         'cate_name': 'Loungewear',
         'total_page': 9},
        {'cate_url': 'https://sea.newchic.com/womens-shoes-c-3592/?country=188&SEA=0',
         'cate_name': 'Shoes',
         'total_page': 62},
    ]
    # Directory for the scraped data
    save_dir = '/Users/xxx/Desktop/newchic'
    if not os.path.exists(save_dir):
        os.mkdir(save_dir)
    # Open the page in a driven browser
    site_url_0 = ALL_CATE_LIST[0]['cate_url']
    print('Loading dynamic page:', site_url_0)
    chrome_options = webdriver.ChromeOptions()
    chrome_options.add_argument('--no-sandbox')
    chrome_options.add_argument('--disable-dev-shm-usage')
    chrome_options.add_argument('--disable-gpu')
    chrome_options.add_argument('--ignore-ssl-errors')
    # Selenium 3-style construction; Selenium 4 passes a Service object instead of executable_path
    driver = webdriver.Chrome(executable_path=CHROME_DRIVER_PATH, options=chrome_options)
    driver.set_page_load_timeout(100)
    driver.set_window_size(1420, 780)
    driver.get(site_url_0)
    # Products are lazy-loaded, so scroll through the page to force them all to render.
    # The second argument is the scroll distance; tune it per site until every product loads.
    fullpage_screenshot(driver, 10000)
    time.sleep(5)
    # Parse the page and save it locally
    dealSoup(driver, ALL_CATE_LIST[0]['cate_name'], ALL_CATE_LIST[0]['cate_url'], 0)
    # Crawl each listing page
    for cate_obj in ALL_CATE_LIST:
        driver.get(cate_obj['cate_url'])
        fullpage_screenshot(driver, 10000)
        dealSoup(driver, cate_obj['cate_name'], cate_obj['cate_url'], 0)
        # Keep paging while a "next" button exists and the page still yields products
        go_status = True
        for i in range(1, cate_obj['total_page']):
            if go_status:
                next_page_tag_list = driver.find_elements_by_css_selector('.page-item-next')
                if len(next_page_tag_list) > 0:
                    next_page_tag_list[0].click()
                    time.sleep(3)
                    fullpage_screenshot(driver, 6000)
                    go_status = dealSoup(driver, cate_obj['cate_name'], cate_obj['cate_url'], i)
                else:
                    go_status = False
        time.sleep(10)
    driver.quit()
    # Each page was saved to its own file, so merge them into one
    connectToOne(save_dir, '/Users/xxx/Desktop', 'newchic.xlsx')
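One caveat, not from the original post: the fixed time.sleep(3) after clicking next will break on a slow connection. Here is a sketch of the same step using Selenium's explicit waits instead; the .page-item-next and .product-item selectors come from the code above, and a staleness check on the old product grid would make it more robust still.

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

def click_next_and_wait(driver, timeout=15):
    # Click the pager's next button, then wait until product items are present
    wait = WebDriverWait(driver, timeout)
    next_btn = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, '.page-item-next')))
    next_btn.click()
    wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, '.product-item')))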
Finally:
The remaining helper methods, all pasted in one go:
# Simulate scrolling through the page viewport by viewport. The screenshot
# calls are commented out; this is only used here to trigger lazy loading.
def fullpage_screenshot(driver, total_height):
    total_width = driver.execute_script("return document.body.offsetWidth")
    # total_height = driver.execute_script("return document.body.parentNode.scrollHeight")
    viewport_width = driver.execute_script("return document.body.clientWidth")
    viewport_height = driver.execute_script("return window.innerHeight")
    # Tile the page into viewport-sized rectangles
    rectangles = []
    i = 0
    while i < total_height:
        ii = 0
        top_height = i + viewport_height
        if top_height > total_height:
            top_height = total_height
        while ii < total_width:
            top_width = ii + viewport_width
            if top_width > total_width:
                top_width = total_width
            rectangles.append((ii, i, top_width, top_height))
            ii = ii + viewport_width
        i = i + viewport_height
    # Scroll to each rectangle in turn so lazy-loaded content renders
    previous = None
    part = 0
    for rectangle in rectangles:
        if previous is not None:
            driver.execute_script("window.scrollTo({0}, {1})".format(rectangle[0], rectangle[1]))
            time.sleep(0.2)
        file_name = "part_{0}.png".format(part)
        # driver.get_screenshot_as_file(file_name)
        if rectangle[1] + viewport_height > total_height:
            offset = (rectangle[0], total_height - viewport_height)
        else:
            offset = (rectangle[0], rectangle[1])
        part = part + 1
        previous = rectangle
    return True
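If hand-tuning fullpage_screenshot's total_height per site gets tedious, a simpler alternative (my sketch, not in the original) is to keep scrolling to the bottom until the document height stops growing:

def scroll_until_stable(driver, pause=1.0, max_rounds=50):
    # Scroll to the bottom repeatedly until the page height stops changing,
    # i.e. the lazy loader has nothing more to fetch.
    last_height = driver.execute_script("return document.body.scrollHeight")
    for _ in range(max_rounds):
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight)")
        time.sleep(pause)  # give newly loaded products time to render
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height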
# Excel column order
heads_0 = ['cate_name_all', 'cate_name', 'cate_url', 'product_link', 'product_desc', 'product_now_price', 'product_old_price']

def objListToExcel(objlist, column_arr, out_path):
    # Build one list per column, then dump everything to an Excel file
    df_data_source = {}
    for field in column_arr:
        df_data_source[field] = []
    if len(objlist) == 0:
        return 0
    for obj in objlist:
        for key_0 in column_arr:
            df_data_source[key_0].append(obj[key_0])
    df_data = pd.DataFrame(df_data_source, columns=column_arr)
    df_data.to_excel(out_path, index=False)
def extractPriceNum(price_str):
    # Price regex: digits, a literal dot, then two decimal digits
    price_pattern = re.compile(r'[0-9]+\.[0-9]{2}')
    price_num_arr = re.findall(price_pattern, price_str)
    if len(price_num_arr) > 0:
        return price_num_arr[0]
    else:
        return 'null'

def extractNum(test_str):
    # Plain integer regex (used above for page counts)
    price_pattern = re.compile(r'[0-9]+')
    num_arr = re.findall(price_pattern, test_str)
    if len(num_arr) > 0:
        return int(num_arr[0])
    else:
        return 1
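Quick sanity checks for the two extractors (the sample strings are mine):

print(extractPriceNum('US$ 23.99'))  # -> '23.99'
print(extractPriceNum('sold out'))   # -> 'null'
print(extractNum('Page 3 of 62'))    # -> 3
print(extractNum('next'))            # -> 1 (fallback)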
def connectToOne(dir, to_dir, out_file_name):
    # Read every per-page .xlsx in `dir`, skipping editor temp files
    excel_list = []
    for file in os.listdir(dir):
        if file.endswith('.xlsx') and '.~' not in file:
            print("file:", file)
            excel_list.append(
                pd.read_excel(os.path.join(dir, file), dtype={'cate_url': str, 'product_link': str}))
    print('Merging...')
    total_excel = pd.concat(excel_list)
    print('Writing output file')
    # strings_to_urls=False stops xlsxwriter from converting long links into URL cells;
    # on newer pandas, pass engine_kwargs={'options': {'strings_to_urls': False}} instead
    writer = pd.ExcelWriter(os.path.join(to_dir, out_file_name), engine='xlsxwriter',
                            options={'strings_to_urls': False})
    print(os.path.join(to_dir, out_file_name), writer)
    total_excel.to_excel(writer, index=False)
    writer.close()
————————————————
Copyright notice: this is an original article by CSDN blogger "blues_phone", released under the CC 4.0 BY-SA license. Please include the original link and this notice when reposting.
Original link: https://blog.csdn.net/huangmengfeng/article/details/116146346
