爬取股票信息

#目标:获取上交所和深交所所有股票的名称和交易信息
#输出:保存到文件中
#技术路线:requests-bs4-re

#候选网站选取原则:
#                    股票信息静态存在于HTML页面中,非js代码生成,没有robots协议限制
#选取心态:
#        不要纠结于某个网站,多找信息源进行尝试

#程序结构设计
#步骤1:从东方财富网获取股票列表
#步骤2:根据股票列表逐个到百度股票获取个股信息
#步骤3:将结果存储到文件中


import requests
from bs4 import BeautifulSoup
import traceback
import re

def getHtTMLText(url, timeout=30):
    """Fetch *url* and return the response body as text.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up; passed
            straight to ``requests.get``. Without a timeout a stalled
            connection would block the whole crawl indefinitely.

    Returns:
        The decoded page text, or ``''`` when the request fails (network
        error, timeout, invalid URL) or the server answers with an HTTP
        error status.
    """
    try:
        r = requests.get(url, timeout=timeout)
        r.raise_for_status()
        # Decode with the encoding requests detects from the body content.
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        # Best-effort fetch: callers treat '' as "page unavailable".
        # Only request-related failures are swallowed, so Ctrl-C and
        # programming errors still surface.
        return ''

def getStockList(lst, stockURL):
    """Append the stock codes found in the links of *stockURL* to *lst*.

    Every ``<a>`` element of the downloaded page is inspected; when its
    ``href`` contains a code of the form ``sh``/``sz`` followed by six
    digits, the first such code is appended to *lst*. Links without an
    ``href`` or without a code are skipped. Duplicates are kept, in page
    order.

    Args:
        lst: List that receives the codes; it is modified in place.
        stockURL: Address of the page listing the stocks.

    Returns:
        None. If the page could not be downloaded, *lst* is left unchanged.
    """
    html = getHtTMLText(stockURL)
    soup = BeautifulSoup(html, 'html.parser')
    # Compile once instead of re-resolving the pattern for every link.
    code_pattern = re.compile(r'[s][hz]\d{6}')
    for link in soup.find_all('a'):
        href = link.get('href')
        if not isinstance(href, str):
            # Anchor without a usable href attribute.
            continue
        match = code_pattern.search(href)
        if match:
            lst.append(match.group(0))


def _parseStockInfo(html):
    """Extract the stock name and its key/value fields from a detail page.

    Returns:
        A ``(name, pairs)`` tuple, where *pairs* is a list of
        ``(key, value)`` strings taken from the ``<dt>``/``<dd>`` elements
        inside the ``stock-bets`` block, or ``None`` when the page has no
        such block or no stock name.
    """
    soup = BeautifulSoup(html, 'html.parser')
    stockInfo = soup.find('div', attrs={'class': 'stock-bets'})
    if stockInfo is None:
        return None
    nameTag = stockInfo.find(attrs={'class': 'bets-name'})
    nameParts = nameTag.text.split() if nameTag is not None else []
    if not nameParts:
        return None
    keyList = stockInfo.find_all('dt')
    valueList = stockInfo.find_all('dd')
    # zip() stops at the shorter list, so a missing <dd> cannot raise.
    # get_text() also copes with elements that contain nested markup,
    # where .string would be None.
    pairs = [(k.get_text().strip(), v.get_text().strip())
             for k, v in zip(keyList, valueList)]
    return nameParts[0], pairs


def getStockInfo(lst, stockURL, fpath):
    """Download each stock's detail page and append its data to *fpath*.

    For every code in *lst* the page ``stockURL + code + '.html'`` is
    fetched and parsed. Each successfully parsed stock is echoed to stdout
    and written to *fpath* as one line holding the ``str()`` of its info
    dict. A progress percentage is printed after every stock, whether or
    not it produced a record.

    Args:
        lst: Stock codes to look up.
        stockURL: Base address of the detail pages.
        fpath: Output file; opened in append mode with UTF-8 encoding.

    Returns:
        None.
    """
    total = len(lst)
    for count, stock in enumerate(lst, start=1):
        url = stockURL + stock + '.html'
        html = getHtTMLText(url)
        try:
            parsed = _parseStockInfo(html) if html else None
            if parsed is not None:
                name, pairs = parsed
                infoDict = {'股票名称': name}
                print('\n' + url)
                print({'股票名称': name})
                for key, val in pairs:
                    infoDict[key] = val
                    print('\t' + key + ':' + val)
                with open(fpath, 'a', encoding='utf-8') as f:
                    f.write(str(infoDict) + '\n')
        except Exception:
            # Best-effort crawl: one bad page or failed write must not stop
            # the remaining stocks, but the failure is reported rather than
            # silently dropped. KeyboardInterrupt is not caught, so Ctrl-C
            # still stops the run.
            traceback.print_exc()
        # Advance progress for every stock, including skipped ones, so the
        # percentage reaches 100 at the end of the list.
        print('\r当前进度:{:.2f}%'.format(count * 100 / total), end='')
    
def main(stock_list_url='http://quote.eastmoney.com/stocklist.html',
         stock_info_url='http://gupiao.baidu.com/stock/',
         output_file='D://BaiduStockInfo.txt'):
    """Run the crawl: collect stock codes, then save each stock's details.

    Args:
        stock_list_url: Page whose links contain the stock codes.
        stock_info_url: Base address of the per-stock detail pages.
        output_file: File that receives one line per parsed stock. The
            default is a Windows drive path; pass another path to run
            elsewhere.

    Returns:
        None.
    """
    slist = []
    getStockList(slist, stock_list_url)
    getStockInfo(slist, stock_info_url, output_file)

# Start the crawl. NOTE: this call is unconditional, so it also runs when the
# file is imported, not only when it is executed as a script.
main()

 

posted on 2018-02-05 16:37  ZhangのBlog  阅读(725)  评论(0)  编辑  收藏  举报