How do you use BeautifulSoup to extract the information you need from a web page?

#!/usr/bin/python
# -*- coding: utf-8 -*-    # declare the source-file encoding (optional in Python 3, which already defaults to UTF-8)

import json,os,sys
from bs4 import BeautifulSoup
from urllib.request import urlopen
import re
import time

from selenium import webdriver
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities


def first():
	# point Selenium at a specific Firefox binary and run it headless
	binary = r'C:\Program Files\Mozilla Firefox32\firefox.exe'
	options = Options()
	options.headless = True
	options.binary = binary
	cap = DesiredCapabilities().FIREFOX
	cap["marionette"] = True  # optional

	# disable stylesheets and images to speed up page loads
	fp = webdriver.FirefoxProfile()
	fp.set_preference("permissions.default.stylesheet", 2)
	fp.set_preference("permissions.default.image", 2)

	driver = webdriver.Firefox(firefox_profile=fp, firefox_options=options, capabilities=cap,
	                           executable_path="C:\\Program Files\\geckodriver-v0.26.0-win64\\geckodriver.exe")
	return driver

def readlinkfile(sourcename='tangshisanbaishou.txt'):
	# read a link-list file, one URL (or path) per line
	with open(sourcename, 'rt', encoding='utf-8') as f:
		data = f.read()
	all_links = data.split('\n')
	return all_links

def writeTxtFile(data, outfilename):
	with open(outfilename + '.txt', 'wt', encoding='utf-8') as f:
		for m in data:
			f.write(m + '\n')

def writeJsonFile(data, outfilename):
	# dump each record as a pretty-printed JSON object, one after another
	with open(outfilename + '.json', 'wt', encoding='utf-8') as f:
		for m in data:
			json.dump(m, f, ensure_ascii=False, indent=4)

def writeJsonFileAddEndFile(data, outfilename):
	# same as writeJsonFile, but append to the end of an existing file
	with open(outfilename + '.json', 'a', encoding='utf-8') as f:
		for m in data:
			json.dump(m, f, ensure_ascii=False, indent=4)



def action(driver, link):
	# open one catalogue page and collect the link to every chapter of the book
	url = "https://****.org" + link
	driver.get(url)

	booklinks = []
	elements = driver.find_elements_by_css_selector(".bookcont a")

	title = driver.find_element_by_css_selector("h1").text

	for e in elements:
		booklinks.append(e.get_property('href'))

	# save the chapter links under the book's title
	writeTxtFile(booklinks, './onebooklink/' + title)


def createLinkList():
	driver = first()
	links = readlinkfile('gujilinks.txt')

	for link in links:
		action(driver,link)
		# break
		
	driver.quit()

# createLinkList()


def soup(gushiurl):
	# fetch one poem page and extract its title and body text with BeautifulSoup
	if not gushiurl:
		return
	# gushiurl = str("https://****.org" + gushiurl)
	print(gushiurl)

	html = urlopen(gushiurl).read().decode('utf-8')

	soup = BeautifulSoup(html, features='lxml')

	contsons = soup.find_all('div', {"class": "contson"})
	title = soup.find_all('h1')

	h1 = title[0].get_text().replace('\n译注\n\n', '')
	text = []

	for item in contsons:
		text.append(item.get_text())

	temp = {
		'title': h1,
		'text': text
	}

	contents = [temp]
	return contents


path = "G:\\workspace\\python\\selenium\\guji\\restlinks"  # folder holding the remaining link files
Files_Global = []

def file_name_walk(file_dir):
    for files in os.listdir(file_dir):
        Files_Global.append(files)  # collect every file name in the directory


def getOne(name):
	# scrape every link listed for one book and append the results to its JSON file
	links = readlinkfile('./onebooklink/' + name + '.txt')
	for link in links:
		contents = soup(link)
		if contents:
			writeJsonFileAddEndFile(contents, './gujisourse/' + name)

def getOne2(name):
	# same as getOne, but reads from ./restlinks and throttles the requests
	print(name)
	links = readlinkfile('./restlinks/' + name)
	name = name.replace('.txt', '')
	index = 0
	for link in links:
		time.sleep(0.2)
		print(index)
		contents = soup(link)
		if contents:
			writeJsonFileAddEndFile(contents, './gujisourse/' + name)
			index += 1

def run():
	file_name_walk(path)

	for name in Files_Global:
		print(name)
		getOne2(name)
		# break

# run()
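
Note that writeJsonFileAddEndFile writes the pretty-printed JSON objects back-to-back, so the resulting .json file is not one single valid JSON document. Below is a minimal sketch of one way to read such a file back (this helper is not part of the original script; it only assumes the file was produced by the functions above), using json.JSONDecoder.raw_decode to walk through the concatenated objects:

# Sketch: parse a file that contains several JSON objects written back-to-back
# by writeJsonFileAddEndFile above. raw_decode() returns one object plus the
# position where decoding stopped, so we can step through the whole file.
import json

def read_concatenated_json(filename):
    decoder = json.JSONDecoder()
    records = []
    with open(filename, 'rt', encoding='utf-8') as f:
        text = f.read()
    pos = 0
    while pos < len(text):
        # skip any whitespace between objects
        while pos < len(text) and text[pos].isspace():
            pos += 1
        if pos >= len(text):
            break
        obj, pos = decoder.raw_decode(text, pos)
        records.append(obj)
    return records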

  Prerequisites:

  • basic Python syntax
  • the BeautifulSoup documentation
  • CSS selector syntax (see the short sketch below)
  • simple file reading and writing
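
As a quick illustration of the CSS selector syntax mentioned above, here is a minimal, self-contained sketch (the HTML snippet is made up for demonstration) that extracts the same kind of title and body text as the soup() function, but with BeautifulSoup's select()/select_one() instead of find_all():

from bs4 import BeautifulSoup

# made-up HTML snippet mimicking the structure the script scrapes
html = """
<h1>静夜思</h1>
<div class="contson">床前明月光，疑是地上霜。</div>
<div class="contson">举头望明月，低头思故乡。</div>
"""

soup = BeautifulSoup(html, features='lxml')

# CSS selectors: 'h1' picks the title tag, 'div.contson' picks every body block
title = soup.select_one('h1').get_text()
lines = [div.get_text() for div in soup.select('div.contson')]

print(title)   # 静夜思
print(lines)   # ['床前明月光，疑是地上霜。', '举头望明月，低头思故乡。']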