[lottery analyser]lottery analyser

-
抓取网页,获得获奖信息
#!/usr/bin/python
import re
import time

try:
    import urllib2
except ImportError:  # Python 3: same urlopen() API lives in urllib.request
    import urllib.request as urllib2


def spider(url):
    """Fetch a lottery result page and extract the latest draw.

    Three pieces are scraped from the page markup:

    * the draw date from ``<span id="jq_short_openTime">MM-DD</span>``,
    * the drawn numbers (every two-digit run inside the
      ``jq_openResult`` div),
    * the issue number (first five-digit run in the first ``<option>`` of
      the ``jq_last10_issue_no`` select).

    Args:
        url: Address of the result page; anything ``urlopen`` accepts.

    Returns:
        ``[date, numbers, issue]`` where ``date`` is a ``"YYYY-MM-DD"``
        string, ``numbers`` is a list of ints and ``issue`` is a string.
        ``None`` when any of the three pieces cannot be located exactly
        once; a diagnostic line is printed for each missing piece.
    """
    fq = urllib2.urlopen(url)
    try:
        fq_cts = fq.read()
    finally:
        # Release the connection even if reading fails.
        fq.close()
    if not isinstance(fq_cts, str):
        # Python 3 hands back bytes; the patterns below are ASCII-only, so
        # undecodable bytes can safely be replaced.
        fq_cts = fq_cts.decode("utf-8", "replace")

    fq_phase_ymd = None
    fq_num = None
    fq_issue_number = None

    # phase date
    fq_phase = re.findall(
        r'<span id="jq_short_openTime">\d\d-\d\d</span>', fq_cts)
    if len(fq_phase) == 1:
        fq_phase_md = re.search(r"(\d\d-\d\d)", fq_phase[0])
        # The page only carries month-day, so the year is taken from the
        # local clock.
        fq_phase_ymd = "%d-" % time.localtime().tm_year + fq_phase_md.group(0)
    else:
        print("find %d fq_phase, fq_phase: %r" % (len(fq_phase), fq_phase))

    # number list
    fq_result = re.findall(r'div id="jq_openResult".*?\/div>', fq_cts, re.S)
    if len(fq_result) == 1:
        fq_num = [int(x) for x in re.findall(r"\d\d", fq_result[0])]
    else:
        # Report the raw matches; the parsed list does not exist on this path.
        print("find %d number_list, numberlist: %r"
              % (len(fq_result), fq_result))

    # phase issue
    fq_issue = re.findall(
        r'<select id="jq_last10_issue_no".*?</option>', fq_cts, re.S)
    issue_match = None
    if len(fq_issue) == 1:
        issue_match = re.search(r"(\d\d\d\d\d)", fq_issue[0])
    if issue_match is not None:
        fq_issue_number = issue_match.group(0)
    else:
        print("find %d fq_issue, fq_issue: %r" % (len(fq_issue), fq_issue))

    if fq_phase_ymd is None or fq_num is None or fq_issue_number is None:
        return None
    return [fq_phase_ymd, fq_num, fq_issue_number]
# Write the data produced above into the database.
def tosql(numlist):
    """Insert one draw into ``cp.bingo`` unless its issue is already stored.

    Args:
        numlist: ``[release_date, numbers, issue]``; ``numbers`` must hold at
            least seven values, stored as r1..r5, b1, b2 in that order.

    Raises:
        IndexError: If ``numlist`` or its number list is too short.
        MySQLdb.Error: If connecting or executing a statement fails.
    """
    # Imported here because the surrounding script's import line does not
    # bring MySQLdb into scope.
    import MySQLdb

    # NOTE(review): credentials are hard-coded; consider moving them to
    # configuration outside the source file.
    host = '10.192.0.5'
    user = 'root'
    password = 'fengmao'
    port = '3306'
    database = 'cp'

    # Extract the row before connecting so malformed input cannot leave a
    # connection open.
    issue = numlist[2]
    row = (issue, numlist[0]) + tuple(numlist[1][i] for i in range(7))

    # The values come from a scraped web page, so they are bound as query
    # parameters rather than interpolated into the SQL text.
    sql_check = "select count(*) from cp.bingo where phase=%s"
    sql_insert = (
        "insert into cp.bingo ( phase,releasedate,r1,r2,r3,r4,r5,b1,b2) "
        "values (%s,%s,%s,%s,%s,%s,%s,%s,%s)"
    )

    db = MySQLdb.connect(host, user, password, database, port=int(port))
    try:
        cursor = db.cursor()
        try:
            cursor.execute(sql_check, (issue,))
            (check_num,) = cursor.fetchone()
            print(str(check_num))
            if int(check_num) == 0:
                cursor.execute(sql_insert, row)
                db.commit()
                print("update successfully!")
            else:
                print("this issue is in database, no action needed!")
        finally:
            cursor.close()
    finally:
        db.close()
# Script driver: scrape the latest draw from the results page and print
# whatever spider() returns.
url = "http://sina.aicai.com/kaijiang/tcdlt/"
data_receive = spider(url)
print(data_receive)
扩展知识点:
a. 格式化输出,我知道在输出字符串的时候,使用%d, %s, %f 分别是输出int,string,float值,但对于原样输出list或者dict,使用%r,可以实现。
b. re:findall 返回所有匹配的字符串组成的列表,而 search 返回 MatchObject 的实例,如果没有找到匹配的位置,则返回 None。可以对 search 的结果使用分组:re.search("(xxx)", s).group(0) 返回整个匹配到的字符串,group(1) 才是第一个括号匹配到的内容,groups() 返回所有括号分组匹配到的内容(多个括号情况下)
c. re,search与findall的区别请看:http://www.crifan.com/python_re_search_vs_re_findall/
d. re的跨行匹配,findall(reg, str, re.S), re.M , re.S
re.M 在这种情况下,^字符不仅仅匹配string的开头,还匹配string中每一行的开头
re.S 这种情况下,点号匹配包括换行符在内的任意字符;没有该标志的时候,点号匹配除换行符外的所有字符。(也即可以跨行匹配,注意与 re.M 的“多行模式”不同)
-
导入历史数据到数据库
#!/usr/bin/python
# Bulk-load historical draws from a text file into cp.bingo.
#
# Each input line is whitespace-separated: the first field is stored as
# ``phase``, the second is a comma-separated list of seven numbers
# (r1..r5, b1, b2) and the third is stored as ``releasedate``.
import MySQLdb

# NOTE(review): credentials are hard-coded; consider moving them to
# configuration outside the source file.
host = '10.192.0.5'
user = 'root'
password = 'fxxx'
port = '3306'
database = 'cp'

db = MySQLdb.connect(host, user, password, database, port=int(port))
cursor = db.cursor()

path = '../hist_2007-01-01_2014-03-24.txt'

# Values are bound as parameters so the driver handles quoting.
sql = (
    "insert into cp.bingo ( phase,releasedate,r1,r2,r3,r4,r5,b1,b2) "
    "values (%s,%s,%s,%s,%s,%s,%s,%s,%s)"
)

try:
    with open(path) as histd:
        for line in histd:
            fields = line.split()
            if not fields:
                # Blank line: nothing to import.
                continue
            numbers = fields[1].split(',') if len(fields) >= 3 else []
            if len(numbers) < 7:
                print("skip malformed line: %r" % line)
                continue

            params = (fields[0], fields[2]) + tuple(numbers[:7])
            print("%s %r" % (sql, params))
            try:
                cursor.execute(sql, params)
                db.commit()
            except MySQLdb.Error as err:
                # Keep loading the remaining rows, but report the failure
                # instead of hiding it.
                print("insert failed for %r: %s" % (params, err))
                db.rollback()
finally:
    # Close the connection even when the file is missing or a row blows up.
    db.close()
创建数据库的脚本
#name: bingo.sql
-- Create the schema used by the lottery scripts.
CREATE DATABASE cp;
USE cp;

-- One row per draw: issue identifier, release date and the seven numbers.
CREATE TABLE bingo (
    phase       CHAR(8) NOT NULL,
    releasedate DATE,
    r1          INT,
    r2          INT,
    r3          INT,
    r4          INT,
    r5          INT,
    b1          INT,
    b2          INT,
    index_id    INT NOT NULL PRIMARY KEY AUTO_INCREMENT
);

-- Start the surrogate key at 1.
ALTER TABLE bingo AUTO_INCREMENT = 1;

浙公网安备 33010602011771号