# coding:utf-8
"""Scrape dogedoge search results for a keyword and store them in MySQL."""
import hashlib
import datetime

import pymysql
import requests
from lxml import etree
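# The script references these MYSQL_* settings without defining them anywhere.
# The values below are placeholders only (an assumption, not the author's real
# configuration); point them at your own MySQL instance before running.
MYSQL_HOST = 'localhost'
MYSQL_DBNAME = 'search_results'
MYSQL_USER = 'root'
MYSQL_PASSWD = 'password'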
def search_data(kw, n):
    """Fetch page ``n`` of dogedoge results for keyword ``kw`` and save them."""
    results = []
    if n > 1:
        res = requests.get('https://www.dogedoge.com/results?q={}&p={}'.format(kw, n))
    else:
        res = requests.get('https://www.dogedoge.com/results?q={}'.format(kw))
    con = etree.HTML(res.text)
    entries = con.xpath('//div[@class="result results_links_deep highlight_d result--url-above-snippet"]')
    for entry in entries:
        title = ''.join(entry.xpath('./div/h2/a//text()'))
        url = ''.join(entry.xpath('./div/div/div/a/span//text()'))
        # Keep only the host part of the URL, skipping the scheme if present.
        if url.find('http') != -1:
            domain = url.split('/')[2]
        else:
            domain = url.split('/')[0]
        md5 = hashlib.md5(url.encode('utf-8')).hexdigest()
        item = {
            'keywd': kw,
            'domain': domain,
            'title': title,
            'md5': md5,
            'url': url,
            'searcher': 'dogedoge',
        }
        results.append(item)
    save(results)
    # xpath() returns an empty list when the next-page marker is absent,
    # so no try/except is needed here.
    next_page = con.xpath('//div[@id="rld-2"]')
    if not next_page:
        print('No more pages.')
        return ''
    return next_page
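# Example usage (a minimal sketch, not part of the original script): fetch and
# store the first results page only. Assumes the XPaths above still match
# dogedoge's current markup.
# search_data('python', 1)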
def main(kw):
    """Crawl result pages for ``kw`` until search_data() finds no next page."""
    n = 1
    while True:
        next_page = search_data(kw, n)
        if not next_page:
            break
        n += 1
def save(items):
    """Insert the scraped result items into the ``weixintb`` table."""
    db = pymysql.connect(
        host=MYSQL_HOST,
        db=MYSQL_DBNAME,
        user=MYSQL_USER,
        passwd=MYSQL_PASSWD,
        charset='utf8',
        use_unicode=True)
    cursor = db.cursor()
    for item in items:
        try:
            # Insert one result row into the database.
            cursor.execute(
                "insert into weixintb(md5, keyword, title, url, `date`, `domain`, browser) "
                "values (%s, %s, %s, %s, %s, %s, %s)",
                (item['md5'],
                 item['keywd'],
                 item['title'],
                 item['url'],
                 datetime.datetime.now(),
                 item['domain'],
                 item['searcher']))
            # Commit the insert.
            db.commit()
        except Exception as error:
            # Log the failure and roll back the failed insert.
            print(error)
            db.rollback()
    cursor.close()
    db.close()
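# save() assumes a ``weixintb`` table already exists. Its schema is not part of
# the original source; the statement below is only a sketch with guessed column
# types, inferred from the values the insert writes.
CREATE_TABLE_SQL = """
CREATE TABLE IF NOT EXISTS weixintb (
    md5      CHAR(32),
    keyword  VARCHAR(255),
    title    VARCHAR(512),
    url      VARCHAR(1024),
    `date`   DATETIME,
    `domain` VARCHAR(255),
    browser  VARCHAR(64)
) DEFAULT CHARSET=utf8;
"""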
if __name__ == '__main__':
    main('keyword to crawl')  # placeholder: replace with the keyword to search