
Regular expressions (contains vs. does not contain)

http://www.cnblogs.com/dongzhiquan/archive/2009/12/12/1994691.html
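
The linked post is about writing one pattern that matches strings containing a keyword and another that matches only strings without it. A minimal sketch of the two cases (the keyword `error` and the sample strings are placeholders of my own): a plain `re.search` covers "contains", and a negative lookahead anchored at the start of the string covers "does not contain".

# -*- coding: utf-8 -*-
import re

#"contains": plain search for the keyword
contains_pat = re.compile(r'error')
#"does not contain": negative lookahead anchored at the start of the string
not_contains_pat = re.compile(r'^(?!.*error).*$')

samples = ['disk error on sda', 'all systems normal']
for s in samples:
    print s, '->', bool(contains_pat.search(s)), bool(not_contains_pat.match(s))
#disk error on sda -> True False
#all systems normal -> False True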

 

#coding:utf-8
'''
Created on 2014-03-20

@author: ZSH
'''
import urllib
import json
from bs4 import BeautifulSoup

def get_year_range(code):
    #read the "year" drop-down list on Sina's historical-quotes page
    #to find out which years are available for this stock code
    url = 'http://vip.stock.finance.sina.com.cn/corp/go.php/vMS_MarketHistory/stockid/%s.phtml' % code
    content = urllib.urlopen(url).read()
    #content = urllib.request.urlopen(url).read()   #Python 3 form, see the note below
    soup = BeautifulSoup(content)
    selects = soup.findAll('select', attrs={'name': 'year'})
    optionTags = selects[0].findAll('option')
    yearlist = []
    for tag in optionTags:
        yearlist.append(tag.string)
    return yearlist

def get_data(code):
    yearlist = get_year_range(code)
    print "len(yearlist)", len(yearlist)

    codestr = str(code)
    #for year in range(2, len(yearlist)):
    for year in range(0, 1):
        for season in range(1, 3):
            try:
                jidu = str(season)
                #original skip condition: only quarter 1 for the first year in the list
                if year == 0 and season > 1:
                    continue
                url = ('http://vip.stock.finance.sina.com.cn/corp/go.php/vMS_MarketHistory/stockid/'
                       + codestr + '.phtml?year=' + yearlist[year] + '&jidu=' + jidu)
                print url
                html = urllib.urlopen(url).read()
                soup = BeautifulSoup(html, from_encoding='GB2312')
                #the daily quotes sit in the table with id FundHoldSharesTable:
                #row 0 is the caption, row 1 the column headers, the rest are data rows
                tablesoup = soup.find_all('table', attrs={'id': 'FundHoldSharesTable'})
                rows = tablesoup[0].findAll('tr')
                colume = rows[1].findAll('td')
                d1 = {}
                for row in rows[2:]:
                    data = row.findAll('td')
                    #collect every column's cell under its header text
                    for i in range(7):
                        d1.setdefault(colume[i].get_text(), []).append(data[i].get_text(strip=True))
                #write one JSON file per year/quarter
                path = (r'C:\e_disk\work\python\quat' + '\\' + rows[0].get_text(strip=True)
                        + yearlist[year] + u'年' + jidu + u'季度.json')
                print "path", path
                encodejson = open(path, 'w')
                encodejson.write(json.dumps(d1, ensure_ascii=False).encode('utf-8'))
                encodejson.close()
                #print('finished ' + rows[0].get_text(strip=True) + yearlist[year] + u'年' + jidu + u'季度.json')
            except Exception:
                print('Error happens')
                continue
    print('completed')

get_data(600000)
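
The commented-out `urllib.request.urlopen` call above shows the Python 3 form of the same fetch. For reference, a minimal sketch of that variant with the Python 3 standard library, assuming nothing beyond what the snippet already does (the stock code 600000 is simply the one used in the example call):

#Python 3 equivalent of urllib.urlopen(url).read() used above
from urllib.request import urlopen

url = 'http://vip.stock.finance.sina.com.cn/corp/go.php/vMS_MarketHistory/stockid/600000.phtml'
content = urlopen(url).read()               #raw bytes
text = content.decode('gb2312', 'replace')  #the page is served as GB2312
print(len(text))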

 

 

 

'''
Copyright belongs to the author.
For commercial reproduction please contact the author for authorization; for non-commercial reproduction please credit the source.
Author: 池云
Link: https://www.zhihu.com/question/32074399/answer/54617604
Source: Zhihu
'''

#The code to fetch the HTML source of a given page is as follows:
import urllib2

def getPage(pageURL):
    '''
    Fetch the HTML source of the given URL.
    '''
    request = urllib2.Request(pageURL)
    response = urllib2.urlopen(request)
    result = response.read()
    return result

print getPage(r'http://www.sse.com.cn/disclosure/listedinfo/credibility/change/')

#Below is the code for sending email
#coding:utf8
import smtplib
from email.mime.multipart import MIMEMultipart
from email.mime.text import MIMEText


def sendMail(fromEmail, username, password, serverAddress, subject, htmlContent, toEmail):
    '''
    fromEmail: the address to send from
    username: username for logging in to the mail server, usually the same as fromEmail
    password: login password
    serverAddress: mail server address (including the port)
    subject: email subject
    htmlContent: email body, written as HTML
    toEmail: list of recipient addresses; more than one may be given
    '''
    msg = MIMEMultipart('alternative')
    msg['Subject'] = subject
    msg['From'] = fromEmail
    msg['To'] = ', '.join(toEmail)
    msg["Accept-Language"] = "zh-CN"            #declare a Chinese locale
    msg["Accept-Charset"] = "ISO-8859-1,utf-8"  #declare the charset to avoid mojibake
    part = MIMEText(htmlContent, 'html', 'UTF-8')
    msg.attach(part)

    s = smtplib.SMTP(serverAddress)
    print "Try to login"
    s.login(username, password)
    print "login successfully, try to send"
    s.sendmail(fromEmail, toEmail, msg.as_string())
    print "send successfully"
    s.quit()

sendMail(fromEmail='fromsomebody@163.com',
         username="fromsomebody@163.com",
         password="somebodyspass",
         serverAddress='smtp.163.com:25',
         subject='Python邮件代码测试',
         htmlContent="plain text content",
         toEmail=['tosomebody@xxx.com'])

 

#Code that fetches and parses the disclosure page of the Shanghai Stock Exchange.
#Scheduled runs, change detection and sending the email are not implemented here;
#a rough sketch of that part follows after this snippet.
import urllib2
import time
import smtplib
from email.mime.multipart import MIMEMultipart
from email.mime.text import MIMEText
import re

def testMatch():
    remoteURL = r'http://www.sse.com.cn/disclosure/listedinfo/credibility/change/'
    request = urllib2.Request(remoteURL)
    response = urllib2.urlopen(request)
    result = response.read()   #HTML of the response

    #regular expression matching the table header
    titlePatternStr = r'<table class="tablestyle">.*?<th>(\S*)</th>\n\s*<th>(\S*)</th>.*?<th>(\S*)</th>.*?<th>(\S*)</th>.*?<th>(\S*)</th>.*?<th>(\S*)</th>.*?<th>(\S*)</th>.*?<th>(\S*)</th>.*?<th>(\S*)</th>.*?<th>(\S*)</th>.*?<th>(\S*)</th>.*?<th>(\S*)</th>.*?<th>(\S*)</th>'

    #regular expression matching the table body
    valuePatternStr = r' <tr style="background-color.*?<td class="nowrap">(\d*?)</td>.*?<td class="nowrap">(\S*)</td>.*?<td class="nowrap">(\S*)</td>.*?<td class="nowrap">(\S*)</td>.*?<td class="nowrap">(\S*)</td>.*?<td class="nowrap">(\S*)</td>.*?document.write\(\$.format\(\'([\.,\d]*)\'.*?document.write\(\$.format\(\'([\.,\d]*)\'.*?document.write\(\$.format\(\'([\.,\d]*)\'.*?document.write\(\$.format\(\'([\.,\d]*)\'.*?<td class="nowrap">(\S*)</td>.*?<td class="nowrap">(\S*)</td>.*?<td class="nowrap">(\S*)</td>'
    valuePattern = re.compile(valuePatternStr, re.S)
    titlePattern = re.compile(titlePatternStr, re.S)

    #match the table header
    titles = re.findall(titlePattern, result)

    #match the table body
    items = re.findall(valuePattern, result)

    #rebuild an HTML table from the matches
    tableStr = "<table>"
    titleStr = "<tr>"
    for title in titles:
        for tt in title:
            titleStr += '<th>' + str(tt) + '</th>'
    titleStr += "</tr>"
    tableStr += titleStr
    for item in items:
        tableStr += "<tr>"
        for name in item:
            tableStr += "<td>%s</td>" % (name)
        tableStr += "</tr>"
    tableStr += "</table>"
    print tableStr


testMatch()
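
The pieces flagged above as not implemented (timed polling, change detection, email notification) could be wired up roughly as below. This is only a sketch under two assumptions: `testMatch` is modified to `return tableStr` instead of printing it, and the `sendMail` function defined earlier in this post is available; the polling interval and the addresses are placeholders.

import hashlib
import time

def watch(interval=600):
    '''Poll the SSE page and mail the rebuilt table whenever its content changes.'''
    last_digest = None
    while True:
        tableStr = testMatch()                      #assumes testMatch() returns tableStr
        digest = hashlib.md5(tableStr).hexdigest()  #fingerprint of the current table
        if last_digest is not None and digest != last_digest:
            sendMail(fromEmail='fromsomebody@163.com',
                     username='fromsomebody@163.com',
                     password='somebodyspass',
                     serverAddress='smtp.163.com:25',
                     subject='SSE disclosure page changed',
                     htmlContent=tableStr,
                     toEmail=['tosomebody@xxx.com'])
        last_digest = digest
        time.sleep(interval)                        #wait before polling again

#watch()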


