360的搜索指数排行榜

  • 百度指数的数值以图片形式呈现,难以直接抓取,因此改用 360 的搜索指数
  • 以下爬取代码基于 Python 3.x
#!/usr/bin/env python
#-*- encoding: utf-8 -*-
# refer to http://blog.csdn.net/wangtaoking1/article/details/18308635 
import http.cookiejar
from urllib import request
from urllib.parse import quote

HTTP_PROXY = '10.13.61.118:6666'

def getOpener(head, enable_proxy=False):
    """Build a urllib opener with cookie handling and default headers.

    Args:
        head: Mapping of header name to header value; its items become the
            opener's default headers.
        enable_proxy: When true, send both http and https traffic through
            the module-level ``HTTP_PROXY`` address.

    Returns:
        The configured ``OpenerDirector``. It is not installed globally.
    """
    # Each opener gets its own in-memory cookie jar.
    handlers = [request.HTTPCookieProcessor(http.cookiejar.CookieJar())]

    # The proxy handler is always constructed but only used on request.
    via_proxy = request.ProxyHandler(
        {scheme: HTTP_PROXY for scheme in ("http", "https")}
    )
    if enable_proxy:
        handlers.append(via_proxy)

    # debuglevel=1 turns on http.client debug output; use 0 to silence it.
    handlers.append(request.HTTPHandler(debuglevel=1))

    opener = request.build_opener(*handlers)
    opener.addheaders = [(name, value) for name, value in head.items()]
    return opener
    
    
import os,json  
def _parse_month_index(payload):
    """Extract ``data[0]['data']['month_index']`` from a JSON response body.

    Args:
        payload: Response body as text.

    Returns:
        The value stored under ``month_index``, or -1 when the text is not
        valid JSON or does not have the expected nesting.
    """
    try:
        return json.loads(payload).get('data')[0]['data']['month_index']
    except (ValueError, TypeError, KeyError, IndexError, AttributeError):
        # Malformed JSON or an unexpected structure: report "no index".
        return -1


def main(school="江苏经贸职业技术学院"):
    """Query the index.so.com overview endpoint for one keyword.

    Args:
        school: Keyword to look up; it is URL-quoted before being sent.

    Returns:
        A string ``"<school>=<index>"``. The index is -1 when the response
        cannot be parsed or the extracted value is not an int.

    Raises:
        Network errors from ``opener.open`` and ``UnicodeDecodeError`` from
        decoding the body are not caught here and propagate to the caller.
    """
    # No Accept-Encoding header on purpose: urllib does not decompress
    # responses, so advertising gzip/deflate could yield a body that cannot
    # be decoded as UTF-8 text.
    header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:45.0) Gecko/20100101 Firefox/45.0',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'zh-cn,zh;q=0.8,en-us;q=0.5,en;q=0.3',
    }
    opener = getOpener(header)
    url = "http://index.so.com/index.php?a=overviewJson&q=%s&area=%s" % (quote(school), quote("全国"))
    # Close the response as soon as the body has been read.
    with opener.open(url) as response:
        data = response.read().decode('utf-8')
    index = _parse_month_index(data)
    return '%s=%d' % (school, index if isinstance(index, int) else -1)

import time    
if __name__ == '__main__':
    # Batch driver: look up every keyword listed in school_list.txt (one per
    # line) and write "<keyword>=<index>" lines to index.txt.
    #
    # The list is read before index.txt is opened for writing, so a missing
    # or unreadable list does not truncate an existing result file.
    # NOTE(review): the list is read with the platform default encoding
    # while the output is written as UTF-8 -- confirm the input encoding.
    with open("school_list.txt") as names_file:
        schools = names_file.read().splitlines()

    with open("index.txt", 'w', encoding='utf-8') as fp:
        for line in schools:
            if not line.strip():
                # A blank line would only trigger an empty query.
                continue
            time.sleep(1)  # pause one second before each request
            fp.write(main(line) + '\n')
            # Flush per line so partial results survive an interrupted run.
            fp.flush()

查看排行

 cat index.txt | sort -t= -k2 -nr | less
posted @ 2016-05-11 09:37  bregman  阅读(363)  评论(0)  编辑  收藏  举报