How do I use BeautifulSoup to scrape the information I need from a web page?
A complete working example: Selenium drives a headless Firefox to collect the poem links (`createLinkList`), then BeautifulSoup parses each poem page and the results are appended to JSON files (`run`).

```python
#!/usr/bin/python
# -*- coding: utf-8 -*-
# Declare the source encoding; Python 2 otherwise assumes ASCII.
import json
import os
import time

from bs4 import BeautifulSoup
from urllib.request import urlopen
from selenium import webdriver
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities


def first():
    """Build a headless Firefox driver (Selenium 3 style API) with
    stylesheets and images blocked to speed up page loads."""
    binary = r'C:\Program Files\Mozilla Firefox32\firefox.exe'
    options = Options()
    options.headless = True
    options.binary = binary
    cap = DesiredCapabilities().FIREFOX
    cap["marionette"] = True  # optional
    fp = webdriver.FirefoxProfile()
    fp.set_preference("permissions.default.stylesheet", 2)  # 2 = block
    fp.set_preference("permissions.default.image", 2)
    # The profile must actually be passed to the driver, otherwise the
    # two preferences above never take effect.
    driver = webdriver.Firefox(
        firefox_profile=fp,
        options=options,
        capabilities=cap,
        executable_path="C:\\Program Files\\geckodriver-v0.26.0-win64\\geckodriver.exe")
    return driver


def readlinkfile(sourcename='tangshisanbaishou.txt'):
    """Read a link file, one URL per line."""
    with open(sourcename, 'rt', encoding='utf-8') as f:
        data = f.read()
    return data.split('\n')


def writeTxtFile(data, outfilename):
    with open(outfilename + '.txt', 'wt', encoding='utf-8') as f:
        for m in data:
            f.write(m + '\n')


def writeJsonFile(data, outfilename):
    # Caution: dumping objects back to back does not produce a file that
    # json.load can parse in one go (see the JSON Lines sketch below).
    with open(outfilename + '.json', 'wt', encoding='utf-8') as f:
        for m in data:
            json.dump(m, f, ensure_ascii=False, indent=4)


def writeJsonFileAddEndFile(data, outfilename):
    # Same as writeJsonFile, but appends instead of overwriting.
    with open(outfilename + '.json', 'a', encoding='utf-8') as f:
        for m in data:
            json.dump(m, f, ensure_ascii=False, indent=4)


def action(driver, link):
    """Open one index page and save every poem link it contains."""
    url = "https://****.org" + link
    driver.get(url)
    booklinks = []
    elements = driver.find_elements_by_css_selector(".bookcont a")
    title = driver.find_element_by_css_selector("h1").text
    for e in elements:
        booklinks.append(e.get_property('href'))
    writeTxtFile(booklinks, './onebooklink/' + title)


def createLinkList():
    driver = first()
    links = readlinkfile('gujilinks.txt')
    for link in links:
        action(driver, link)
    driver.quit()

# createLinkList()


def soup(gushiurl):
    """Fetch one poem page and pull out the title and body text."""
    if not gushiurl:  # skip blank lines left over in the link files
        return
    print(gushiurl)
    html = urlopen(gushiurl).read().decode('utf-8')
    page = BeautifulSoup(html, features='lxml')
    contsons = page.find_all('div', {"class": "contson"})
    h1 = page.find_all('h1')[0].get_text().replace('\n译注\n\n', '')
    text = [item.get_text() for item in contsons]
    return [{'title': h1, 'text': text}]


path = "G:\\workspace\\python\\selenium\\guji\\restlinks"  # folder with the remaining link files
Files_Global = []


def file_name_walk(file_dir):
    for files in os.listdir(file_dir):
        Files_Global.append(files)  # every non-directory entry under the path


def getOne(name):
    links = readlinkfile('./onebooklink/' + name + '.txt')
    for link in links:
        contents = soup(link)
        if contents:  # soup() returns None for blank lines
            writeJsonFileAddEndFile(contents, './gujisourse/' + name)


def getOne2(name):
    print(name)
    links = readlinkfile('./restlinks/' + name)
    name = name.replace('.txt', '')  # strip the extension once, up front
    index = 0
    for link in links:
        time.sleep(0.2)  # throttle the requests a little
        print(index)
        contents = soup(link)
        if contents:
            writeJsonFileAddEndFile(contents, './gujisourse/' + name)
        index += 1


def run():
    file_name_walk(path)
    for name in Files_Global:
        print(name)
        try:
            getOne2(name)
        except Exception as e:
            raise e

# run()
```
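One caveat worth calling out: both JSON writers above dump objects back to back, so the resulting `.json` file is a concatenation of objects that `json.load` cannot parse as a whole. A minimal alternative sketch, assuming one record per line (JSON Lines) is acceptable downstream; `write_jsonl` and `read_jsonl` are hypothetical helper names, not part of the original script:

```python
import json

def write_jsonl(records, outfilename):
    # Append one compact JSON object per line (JSON Lines format).
    with open(outfilename + '.jsonl', 'a', encoding='utf-8') as f:
        for m in records:
            f.write(json.dumps(m, ensure_ascii=False) + '\n')

def read_jsonl(filename):
    # Recover the records one line at a time.
    with open(filename, 'rt', encoding='utf-8') as f:
        return [json.loads(line) for line in f if line.strip()]
```

Each poem then sits on its own line, so even a half-finished crawl leaves a file that can still be read back.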
Required skills:
- Basic Python syntax
- The BeautifulSoup documentation
- CSS selector syntax (see the sketch after this list)
- Simple file reading and writing
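For the BeautifulSoup and CSS-selector part in isolation, here is a minimal self-contained sketch; the HTML snippet is made up for illustration, but the `div.contson` class mirrors the structure the script above targets:

```python
from bs4 import BeautifulSoup

html = """
<html><body>
  <h1>静夜思</h1>
  <div class="contson">床前明月光，疑是地上霜。</div>
  <div class="contson">举头望明月，低头思故乡。</div>
</body></html>
"""

page = BeautifulSoup(html, features='lxml')

# select() takes a CSS selector and returns all matching tags;
# select_one() returns the first match, or None if nothing matches.
title = page.select_one('h1').get_text()
lines = [div.get_text() for div in page.select('div.contson')]

print(title)  # 静夜思
print(lines)  # ['床前明月光，疑是地上霜。', '举头望明月，低头思故乡。']
```

`select()` accepts the same selector strings as the browser (`.class`, `#id`, `tag a`, etc.), which is why the Selenium part of the script and the BeautifulSoup part can share the same mental model.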