Small Crawler Projects

# Crawler for zhaopin.com job listings: fetch the search-result pages, extract
# job name / link / company / salary, and append them to a CSV file.
import csv
import re
from urllib.parse import urlencode

import requests
from requests.exceptions import RequestException
from tqdm import tqdm


def get_one_page(city, keyword, region, page):
    '''Fetch one search-result page and return its HTML.'''
    paras = {
        'ct': city,        # city to search in
        'kw': keyword,     # search keyword
        'isafv': 0,        # whether to enable the more detailed search
        'isfilter': 1,     # whether to filter the results
        'p': page,         # page number
        're': region,      # district code, e.g. 2005 is Haidian
    }
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
                      '(KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
        'Host': 'sou.zhaopin.com',
        'Referer': 'https://www.zhaopin.com/',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.9',
    }
    url = 'https://sou.zhaopin.com/jobs/searchresult.ashx?' + urlencode(paras)
    try:
        # Fetch the page and verify the status code before returning the body.
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        return None


def parse_one_page(html):
    '''Parse the HTML, extract the useful fields and yield them one by one.'''
    pattern = '自己写正则'  # placeholder: "write your own regex" -- left to the reader in the original
    # Match every listing that fits the pattern.
    items = re.findall(pattern, html)
    for item in items:
        job_name = item[0].replace('<b>', '').replace('</b>', '')
        yield {
            'job': job_name,
            'website': item[1],
            'company': item[2],
            'salary': item[3],
        }


def write_csv_file(path, headers, rows):
    '''Write the header and the rows in one go (for single-shot exports).'''
    # The encoding avoids errors with Chinese text; newline='' avoids a blank
    # line after every written row.
    with open(path, 'a', encoding='gb18030', newline='') as f:
        f_csv = csv.DictWriter(f, headers)
        f_csv.writeheader()
        f_csv.writerows(rows)


def write_csv_headers(path, headers):
    '''Write the header row only.'''
    with open(path, 'a', encoding='gb18030', newline='') as f:
        f_csv = csv.DictWriter(f, headers)
        f_csv.writeheader()


def write_csv_rows(path, headers, rows):
    '''Append data rows only (the header is written once, separately).'''
    with open(path, 'a', encoding='gb18030', newline='') as f:
        f_csv = csv.DictWriter(f, headers)
        f_csv.writerows(rows)


def main(city, keyword, region, pages):
    '''Crawl the given number of pages and save the results to a CSV file.'''
    filename = 'd:/' + 'zl_' + city + '_' + keyword + '.csv'
    headers = ['job', 'website', 'company', 'salary']
    write_csv_headers(filename, headers)
    for i in tqdm(range(pages)):
        # Collect every job on this page and append it to the CSV file.
        jobs = []
        html = get_one_page(city, keyword, region, i)
        for item in parse_one_page(html):
            jobs.append(item)
        write_csv_rows(filename, headers, jobs)


if __name__ == '__main__':
    main('北京', '爬虫工程师', 2005, 10)
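The header/rows split in the CSV helpers above matters because the file is opened in append mode: the header is written once and every later page only appends rows, so repeated writes do not duplicate the header line. A minimal, self-contained sketch of that pattern, using a hypothetical output path and made-up rows rather than real crawl results:

import csv

# Hypothetical sample data standing in for parsed job listings.
headers = ['job', 'website', 'company', 'salary']
rows_page_1 = [{'job': 'crawler engineer', 'website': 'http://example.com/1',
                'company': 'ExampleCo', 'salary': '10k-20k'}]
rows_page_2 = [{'job': 'data engineer', 'website': 'http://example.com/2',
                'company': 'ExampleCo', 'salary': '15k-25k'}]

path = 'demo_jobs.csv'  # hypothetical path; the script above writes to d:/zl_<city>_<keyword>.csv

# Write the header exactly once...
with open(path, 'w', encoding='gb18030', newline='') as f:
    csv.DictWriter(f, headers).writeheader()

# ...then append rows page by page, without repeating the header.
for rows in (rows_page_1, rows_page_2):
    with open(path, 'a', encoding='gb18030', newline='') as f:
        csv.DictWriter(f, headers).writerows(rows)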

# Crawler for mmjpg.com photo sets: one folder per set, sets fetched in parallel
# with a multiprocessing pool, downloads skipped when the folder already exists.
import os
import threading
import time
from multiprocessing import Pool, cpu_count

import requests
from bs4 import BeautifulSoup

headers = {
    'X-Requested-With': 'XMLHttpRequest',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/56.0.2924.87 Safari/537.36',
    'Referer': "http://www.mmjpg.com",
}
dir_path = r'E:\lianggege'  # raw string so the backslash is not treated as an escape


def save_pic(pic_src, pic_cnt):
    '''Download one picture into the current folder.'''
    try:
        img = requests.get(pic_src, headers=headers, timeout=10)
        imgname = 'pic_cnt_{}.jpg'.format(pic_cnt + 1)
        with open(imgname, 'wb') as f:
            f.write(img.content)
        print(imgname)
    except Exception as e:
        print(e)


def make_dir(folder_name):
    '''Create the folder for a photo set and chdir into it.
    If the folder already exists the set has been crawled before, so skip it
    to avoid duplicate work: return False in that case, True otherwise.'''
    path = os.path.join(dir_path, folder_name)
    if not os.path.exists(path):
        os.makedirs(path)
        print(path)
        os.chdir(path)
        return True
    print('Folder has existed!')
    return False


def delete_empty_dir(dir):
    '''If the program was interrupted halfway, a folder may already exist while
    its pictures were never downloaded, and the set would then be skipped
    forever. Remove such empty folders before crawling.'''
    if os.path.exists(dir):
        if os.path.isdir(dir):
            for d in os.listdir(dir):
                path = os.path.join(dir, d)  # build the path one level down
                if os.path.isdir(path):
                    delete_empty_dir(path)  # recurse, then remove if empty
            if not os.listdir(dir):
                os.rmdir(dir)
                print('remove the empty dir: {}'.format(dir))
    else:
        print('please start your performance!')


lock = threading.Lock()  # guards make_dir/chdir within a single process


def urls_crawler(url):
    '''Worker entry point: crawl one photo-set page.'''
    try:
        r = requests.get(url, headers=headers, timeout=10).text
        # The set name doubles as the folder name; re-encode to fix the site's charset.
        folder_name = BeautifulSoup(r, 'lxml').find('h2').text.encode('ISO-8859-1').decode('utf-8')
        with lock:
            if make_dir(folder_name):
                # Number of pictures in the set (second-to-last pager link).
                max_count = BeautifulSoup(r, 'lxml').find('div', class_='page').find_all('a')[-2].get_text()
                # One page per picture.
                page_urls = [url + '/' + str(i) for i in range(1, int(max_count) + 1)]
                img_urls = []
                for index, page_url in enumerate(page_urls):
                    result = requests.get(page_url, headers=headers, timeout=10).text
                    # The last picture has no <a> wrapper, only an <img>, so parse it separately.
                    if index + 1 < len(page_urls):
                        img_url = BeautifulSoup(result, 'lxml').find('div', class_='content').find('a').img['src']
                    else:
                        img_url = BeautifulSoup(result, 'lxml').find('div', class_='content').find('img')['src']
                    img_urls.append(img_url)
                for cnt, img_url in enumerate(img_urls):
                    save_pic(img_url, cnt)
    except Exception as e:
        print(e)


if __name__ == '__main__':
    urls = ['http://mmjpg.com/mm/{cnt}'.format(cnt=cnt) for cnt in range(1, 953)]
    pool = Pool(processes=cpu_count())
    try:
        delete_empty_dir(dir_path)
        pool.map(urls_crawler, urls)
    except Exception:
        # If something goes wrong, wait a while, clean up empty folders and retry once.
        time.sleep(30)
        delete_empty_dir(dir_path)
        pool.map(urls_crawler, urls)
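The __main__ block above fans the 952 set URLs out over a multiprocessing pool sized to the CPU count. A minimal offline sketch of that fan-out pattern, with a stand-in worker and made-up URLs (no network access), just to show how pool.map distributes the work:

from multiprocessing import Pool, cpu_count


def fake_crawler(url):
    # Stand-in for urls_crawler(): pretend to process one URL and return a result.
    return '{} -> done'.format(url)


if __name__ == '__main__':
    urls = ['http://example.com/mm/{}'.format(i) for i in range(1, 6)]  # hypothetical URLs
    with Pool(processes=cpu_count()) as pool:
        # map() blocks until every worker has finished and keeps the input order.
        for line in pool.map(fake_crawler, urls):
            print(line)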

# Downloader for a novel on quanshuwang.com: grab the chapter list from the
# book's index page, then fetch each chapter and save it as a .txt file.
import re
import urllib.request

'''
1. Fetch the index page source.
2. Extract the chapter links.
3. Fetch each chapter's page source.
4. Extract the chapter text.
5. Save it to a file.
'''


def getNovelContent():
    '''Download every chapter of the novel.'''
    html = urllib.request.urlopen('http://www.quanshuwang.com/book/0/269').read()
    html = html.decode('gbk')
    reg = r'<li><a href="(.*?)" title=".*?">(.*?)</a></li>'
    reg = re.compile(reg)
    urls = reg.findall(html)
    for url in urls:
        # Chapter URL and chapter title.
        novel_url = url[0]
        novel_title = url[1]
        chapt = urllib.request.urlopen(novel_url).read()
        chapt_html = chapt.decode('gbk')
        reg = r'</script> ' \
              '(.*?)<script type="text/javascript">'
        # re.S lets '.' also match newlines, so the capture can span several lines.
        reg = re.compile(reg, re.S)
        chapt_content = reg.findall(chapt_html)
        chapt_content = chapt_content[0].replace(' ', '')
        chapt_content = chapt_content.replace('<br />', '')
        print('Saving %s' % novel_title)
        with open('{}.txt'.format(novel_title), 'w', encoding='utf-8') as f:
            f.write(chapt_content)


getNovelContent()
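The re.S flag used above is what allows (.*?) to capture chapter text that spans many lines, because by default '.' does not match a newline. A small self-contained illustration with made-up chapter HTML:

import re

# Made-up chapter HTML spanning several lines.
sample = '</script> first line\nsecond line<script type="text/javascript">'
pattern = r'</script> (.*?)<script type="text/javascript">'

print(re.findall(pattern, sample))         # [] -- without re.S, '.' stops at the newline
print(re.findall(pattern, sample, re.S))   # ['first line\nsecond line']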

A Python list is already enough to act as a queue: append() adds an element at the tail, the head element can be read by index, and pop(0) removes it. But a list makes an inefficient queue, because pop(0) and insert() at the head are slow (every remaining element has to shift). The official recommendation is collections.deque, which handles both ends efficiently.

Using the collections module:

from collections import deque
queue = deque(['s', 'd', 'f'])   # deque() accepts any iterable, such as a list
queue.append('c')     # add at the right end
queue.pop()           # remove from the right end
queue.popleft()       # remove from the left end

A set can be created with the set() function or with curly braces {}. An empty set, however, can only be created with set(), because empty braces create a dict.

print(set('asd'))   # {'a', 's', 'd'}
a - b    elements in a but not in b
a | b    elements in a or b (or both)
a & b    elements in both a and b
a ^ b    elements in exactly one of a and b

The crawler below uses exactly this pair: a deque as the URL queue and a set to record visited pages. (A small runnable sketch of the deque-plus-set bookkeeping follows the crawler.)

import re
import urllib.request
from collections import deque

queue = deque()   # URLs still to crawl
visited = set()   # URLs already crawled

url = 'http://news.dbanotes.net'
queue.append(url)
cnt = 0

while queue:
    url = queue.popleft()   # take the next URL from the head of the queue
    visited.add(url)        # mark it as visited so it is not queued again
    print('crawled ' + str(cnt) + ' pages, now crawling <---- ' + url)
    cnt += 1
    urlop = urllib.request.urlopen(url)
    content_type = urlop.getheader('Content-Type') or ''
    if 'html' not in content_type:
        continue
    try:
        data = urlop.read().decode('utf-8')
    except:
        continue
    # Extract all links on the page and queue the ones not seen yet.
    linkre = re.compile('href=\"(.+?)\"')
    for x in linkre.findall(data):
        if 'http' in x and x not in visited:
            queue.append(x)
            print('queued ---> ' + x)
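As promised above, here is a minimal, self-contained sketch of the deque-plus-set bookkeeping the crawler relies on, run over a made-up in-memory link graph instead of the real site, so it executes without any network access:

from collections import deque

# Hypothetical link graph standing in for real pages and their outgoing links.
links = {
    'http://example.com/': ['http://example.com/a', 'http://example.com/b'],
    'http://example.com/a': ['http://example.com/b'],
    'http://example.com/b': ['http://example.com/'],
}

queue = deque(['http://example.com/'])
visited = set()

while queue:
    url = queue.popleft()           # FIFO: oldest queued URL first
    if url in visited:
        continue
    visited.add(url)
    print('crawling', url)
    for nxt in links.get(url, []):  # queue outgoing links not seen yet
        if nxt not in visited:
            queue.append(nxt)

print('visited', len(visited), 'pages')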

# coding: utf-8
# author YongGuang Li by 2018/05/24
# Downloads the MV from a music page: Selenium/PhantomJS renders the page, then
# a regular expression pulls the video download link out of the generated HTML.
import re
import urllib.request

from selenium import webdriver

# Version 1.0 only handles MV (video) links; downloading the audio works much
# the same way -- just change the regular expression.


def getHtml(url):
    '''Render the page with PhantomJS and return the generated HTML.'''
    driver = webdriver.PhantomJS()
    driver.get(url)
    page = driver.page_source
    return page


def getVideo(html, path):
    '''Find the video download links in the HTML and download them to path.'''
    reg = 'src="(.+?)"></video>'  # match the download link inside the <video> tag
    videore = re.compile(reg)
    videos = re.findall(videore, html)
    x = 1
    for video in videos:
        # In the scraped link every '&' has to become ';' for the download to
        # work -- the reason is unclear, but the conversion fixes it.
        video = video.replace('&', ';')
        print('Downloading: %s' % video)
        urllib.request.urlretrieve(video, path + '/%d.mp4' % x)
        print('Finished: %s' % video)
        x = x + 1
        yield video


def write(videos):
    '''For testing only: write the downloaded links to a file.'''
    with open('d:/a.txt', 'w') as f:
        for v in videos:
            f.write(v)


if __name__ == '__main__':
    # In the PyCharm console, type a space before pressing Enter after pasting
    # the link, otherwise the IDE treats it as a clickable URL and opens it.
    url = input('Enter the song page URL: ')
    path = input('Enter the folder to save the video in: ')
    html = getHtml(url)
    videos = list(getVideo(html, path))  # consuming the generator triggers the downloads
    print(videos)
    write(videos)
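The extraction step above hinges on a single regular expression over the rendered HTML. A tiny offline check of that pattern against a made-up <video> tag (the real page markup may differ):

import re

# Made-up HTML standing in for PhantomJS's page_source output.
html = '<div><video controls="controls" src="http://example.com/clip.mp4"></video></div>'

reg = 'src="(.+?)"></video>'  # same pattern as getVideo() above
print(re.findall(reg, html))  # ['http://example.com/clip.mp4']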

# Searches Baidu for "python小屋" with mechanicalsoup and prints only the result
# links whose titles closely match the article list of the WeChat official account.
import mechanicalsoup

with open('d:/微信公众号“python小屋文章”清单.txt') as f:
    articles = f.readlines()
articles = tuple(map(str.strip, articles))

# Open Baidu, fill in the search keyword and submit the form.
browser = mechanicalsoup.StatefulBrowser()
browser.open(r'http://www.baidu.com')
browser.select_form('#form')
browser['wd'] = 'python小屋'
browser.submit_selected()

# Collect the links to result pages 2-10 (page 1 is the page we are already on).
top10Urls = []
for link in browser.get_current_page().select('a'):
    if link.text in tuple(map(str, range(2, 11))):
        top10Urls.append(r'http://www.baidu.com' + link.attrs['href'])


def check(text):
    '''Compare a link title with the official-account article titles and
    return True if it is very similar to one of them.'''
    for article in articles:
        # Slice off both ends because some sites repost articles with slightly
        # truncated titles, e.g. dropping the first two characters ('使用')
        # of '使用python+pillow绘制矩阵盖尔圆'.
        if article[2:-2].lower() in text.lower():
            return True
    return False


def getLinks():
    '''Print only the closely related links on the current result page.'''
    for link in browser.get_current_page().select('a'):
        text = link.text
        if 'python小屋' in text or '董付国' in text or check(text):
            print(link.text, '------->', link.attrs['href'])


# Page 1 of the results.
getLinks()
# Result pages 2-10.
for url in top10Urls:
    browser.open(url)
    getLinks()
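The check() helper above does the fuzzy matching: it trims a couple of characters off each known title so that reposts with slightly shortened titles still match. A small offline sketch of that idea, using made-up titles in place of the real article list and Baidu results:

# Made-up article list and link texts, standing in for the real file and search results.
articles = ('使用python+pillow绘制矩阵盖尔圆', 'Python字符串方法详解')


def check(text):
    # Trim both ends of each known title so a slightly truncated repost still matches.
    for article in articles:
        if article[2:-2].lower() in text.lower():
            return True
    return False


print(check('python+pillow绘制矩阵盖尔圆（转载）'))  # True -- the repost dropped the first two characters
print(check('完全不相关的标题'))                      # False -- no known title matches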