[lottery analyser]lottery analyser

抓取网页，获得获奖信息

#!/usr/bin/python
                                                                                                                                                                                                                                              
import urllib2
import re
import time


def spider(url):
    """Download a lottery result page and extract the latest draw.

    Three pieces are pulled out of the page with regular expressions:

    * the draw date, from ``<span id="jq_short_openTime">MM-DD</span>``,
      prefixed with the current local year;
    * every two-digit number found inside the ``jq_openResult`` div;
    * the first five-digit number found between the
      ``jq_last10_issue_no`` select tag and its first ``</option>``.

    Args:
        url: address of the result page to fetch.

    Returns:
        ``[date, numbers, issue]`` where ``date`` is a ``"YYYY-MM-DD"``
        string, ``numbers`` is a list of ints and ``issue`` is a
        five-digit string.  ``None`` is returned when any piece could not
        be extracted; a diagnostic line is printed for each such piece.
    """
    fq = urllib2.urlopen(url)
    try:
        fq_cts = fq.read()
    finally:
        # Python 2 response objects are not context managers, so close
        # explicitly to avoid leaking the socket.
        fq.close()

    # phase date
    fq_phase_ymd = None
    fq_phase = re.findall(
        r'<span id="jq_short_openTime">\d\d-\d\d</span>', fq_cts)
    if len(fq_phase) == 1:
        month_day = re.search(r"\d\d-\d\d", fq_phase[0]).group(0)
        # NOTE: the page only carries month-day, so the current local year
        # is assumed for the draw.
        fq_phase_ymd = "%d-%s" % (time.localtime().tm_year, month_day)
    else:
        print("find %d fq_phase, fq_phase: %r" % (len(fq_phase), fq_phase))

    # number list
    fq_num = None
    fq_result = re.findall(r'div id="jq_openResult".*?/div>', fq_cts, re.S)
    if len(fq_result) == 1:
        fq_num = [int(x) for x in re.findall(r"\d\d", fq_result[0])]
    else:
        # Report the raw matches: the parsed number list does not exist on
        # this path (referencing it used to raise UnboundLocalError).
        print("find %d number_list, numberlist: %r"
              % (len(fq_result), fq_result))

    # phase issue
    fq_issue_number = None
    fq_issue = re.findall(
        r'<select id="jq_last10_issue_no".*?</option>', fq_cts, re.S)
    if len(fq_issue) == 1:
        issue_match = re.search(r"\d{5}", fq_issue[0])
        if issue_match is not None:
            fq_issue_number = issue_match.group(0)
        else:
            print("find no issue number in fq_issue: %r" % (fq_issue,))
    else:
        print("find %d fq_issue, fq_issue: %r" % (len(fq_issue), fq_issue))

    if fq_phase_ymd is None or fq_num is None or fq_issue_number is None:
        return None
    return [fq_phase_ymd, fq_num, fq_issue_number]

#将上面产生的数据写入数据库

def tosql(numlist):
    """Insert one draw into ``cp.bingo`` unless its issue is already stored.

    Args:
        numlist: ``[release_date, numbers, issue]`` (the shape ``spider``
            returns).  ``numbers`` must hold at least seven values; the
            first five go to columns ``r1``..``r5`` and the next two to
            ``b1``, ``b2``.  ``issue`` is stored in the ``phase`` column.

    Raises:
        TypeError, IndexError: if ``numlist`` is ``None`` or too short.
        MySQLdb.Error: on connection or query failure.
    """
    # Imported locally: this script's import header does not bring in
    # MySQLdb, so the name would otherwise be undefined here.
    import MySQLdb

    # NOTE(review): credentials are hard-coded in source; consider moving
    # them to a config file or environment variables.
    host = '10.192.0.5'
    user = 'root'
    password = 'fengmao'
    port = 3306
    database = 'cp'

    # Unpack before connecting so malformed input fails without opening a
    # database connection.
    release_date, numbers, issue = numlist[0], numlist[1], numlist[2]
    row = (issue, release_date) + tuple(numbers[i] for i in range(7))

    # Values are passed as query parameters (never interpolated into the
    # SQL text) because they originate from a scraped web page.
    sql_check = "select count(*) from cp.bingo where phase=%s"
    sql_insert = ("insert into cp.bingo "
                  "( phase,releasedate,r1,r2,r3,r4,r5,b1,b2) "
                  "values (%s,%s,%s,%s,%s,%s,%s,%s,%s);")

    db = MySQLdb.connect(host=host, user=user, passwd=password,
                         db=database, port=port)
    try:
        cursor = db.cursor()
        cursor.execute(sql_check, (issue,))
        (check_num,) = cursor.fetchone()
        print(str(check_num))
        if int(check_num) == 0:
            cursor.execute(sql_insert, row)
            db.commit()
            print("update successfully!")
        else:
            print("this issue is in database, no action needed!")
    finally:
        db.close()




# Scrape the latest draw from the result page and show what was parsed
# (None is printed when the page could not be parsed).
url = "http://sina.aicai.com/kaijiang/tcdlt/"
data_receive = spider(url)
print(data_receive)

扩展知识点：

a. 格式化输出，我知道在输出字符串的时候，使用%d, %s, %f 分别是输出int，string，float值，但对于原样输出list或者dict，使用%r,可以实现。

b. re：findall 返回所有匹配结果组成的列表（模式中含有分组时，返回的是各分组匹配到的内容），而 search 返回 MatchObject 的实例，如果没有找到匹配的位置，则返回 None。可以对 search 的结果使用分组，例如 re.search("(xxx)", s).group(1)：group(0) 是整个匹配到的字符串，group(1) 才是第一个括号匹配到的内容，groups() 返回所有括号分组匹配到的内容（多个括号情况下）。

c. re，search与findall的区别请看：http://www.crifan.com/python_re_search_vs_re_findall/

d. re的跨行匹配，findall(reg, str, re.S)， re.M , re.S

　　re.M 在这种情况下，^字符不仅仅匹配string的开头，还匹配string中每一行的开头

　　re.S 这种情况下，点号匹配包括换行符在内的任意字符；没有该标志的时候，点号匹配除换行符外的所有字符。（因此 .* 这样的模式可以跨行匹配）

导入历史数据到数据库

#!/usr/bin/python                                                                                                                                                                                                                             

import MySQLdb
# Connection settings for the MySQL server that holds the ``cp`` database.
# NOTE(review): credentials are hard-coded in source; consider moving them
# to a config file or environment variables.
host = '10.192.0.5'
user = 'root'
password = 'fxxx'
port = 3306
database = 'cp'
db = MySQLdb.connect(host=host, user=user, passwd=password,
                     db=database, port=port)
cursor = db.cursor()

# History dump; each data line is read as "<phase> <n1,...,n7> <date>"
# separated by whitespace.
path = '../hist_2007-01-01_2014-03-24.txt'

# Values are bound as parameters instead of being formatted into the SQL.
sql = ("insert into cp.bingo ( phase,releasedate,r1,r2,r3,r4,r5,b1,b2) "
       "values (%s,%s,%s,%s,%s,%s,%s,%s,%s);")

try:
    with open(path) as histd:
        for line in histd:
            fields = line.split()  # split once instead of once per column
            if not fields:
                continue  # blank line
            balls = fields[1].split(',') if len(fields) >= 3 else []
            if len(balls) < 7:
                print("skip malformed line: %r" % line)
                continue
            try:
                params = ((fields[0], fields[2])
                          + tuple(int(n) for n in balls[:7]))
            except ValueError:
                print("skip malformed line: %r" % line)
                continue

            print("%s %r" % (sql, params))
            try:
                cursor.execute(sql, params)
                db.commit()
            except MySQLdb.Error as err:
                # Keep importing the remaining rows, but report the
                # failure instead of hiding it.
                print("insert failed for phase %s: %s" % (fields[0], err))
                db.rollback()
finally:
    # Always release the connection, even if reading the file fails.
    db.close()

创建数据库的脚本

-- name: bingo.sql
-- Creates the `cp` database and the `bingo` table that the Python scripts
-- insert into (phase, releasedate, r1..r5, b1, b2).
CREATE DATABASE cp;
USE cp;

CREATE TABLE bingo (
    phase       CHAR(8) NOT NULL,
    releasedate DATE,
    r1          INT,
    r2          INT,
    r3          INT,
    r4          INT,
    r5          INT,
    b1          INT,
    b2          INT,
    -- surrogate key; generated by the server for every inserted row
    index_id    INT NOT NULL PRIMARY KEY AUTO_INCREMENT
);

-- Start the surrogate key sequence at 1.
ALTER TABLE bingo AUTO_INCREMENT = 1;

posted @ 2014-03-20 15:55 silence.li 阅读(236) 评论(0) 收藏举报

刷新页面返回顶部

silence.li

[lottery analyser]lottery analyser

抓取网页，获得获奖信息

导入历史数据到数据库

公告