程序开发常用轮子
这是一篇转载文章
【python】MD5生成
|
1
2
3
4
5
|
import hashlib


def md5_hex(src):
    """Return the 32-character hex MD5 digest of *src*.

    Accepts either ``str`` (encoded as UTF-8 first) or ``bytes``;
    the original Python 2 snippet assumed byte strings.
    """
    if isinstance(src, str):
        src = src.encode("utf-8")
    m2 = hashlib.md5()
    m2.update(src)
    return m2.hexdigest()
【python】CSV读写
|
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
|
# coding: utf-8
"""CSV read/write helpers (Python 3 port of the original Python 2 snippet).

The original used the removed ``file()`` builtin and binary modes
('wb'/'rb'); Python 3's csv module wants text mode with ``newline=''``.
"""
import csv


def write_csv(path, header, rows):
    """Write *header* (one row) followed by *rows* to a CSV file at *path*."""
    # newline='' prevents the csv module from writing blank lines on Windows.
    with open(path, "w", newline="", encoding="utf-8") as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(header)
        writer.writerows(rows)


def read_csv(path):
    """Return every row of the CSV file at *path* as a list of string lists."""
    with open(path, newline="", encoding="utf-8") as csvfile:
        return list(csv.reader(csvfile))
【shell】shell批量添加文件后缀名
|
1
|
# Append ".txt" to every regular file in the current directory.
# Improvements over the original one-liner:
#   [ -f "$i" ]            skips directories (the original renamed them too)
#   "${i%.txt}" = "$i"     skips files that already end in .txt (safe re-run)
#   mv --                  protects filenames that begin with a dash
for i in *; do
    [ -f "$i" ] && [ "${i%.txt}" = "$i" ] && mv -- "$i" "$i.txt"
done
【python】提取中文
|
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
|
# coding: utf-8
"""Extract Chinese text from a mixed-language string.

Python 3 port: the original Python 2 version needed ``reload(sys)`` /
``setdefaultencoding`` / ``unicode()`` gymnastics that are obsolete now.
"""
import re

# One or more consecutive characters from the CJK Unified Ideographs block.
# Note: this range excludes kana, hangul and CJK punctuation on purpose.
RE_CHINESE = re.compile(u"[\u4e00-\u9fa5]+")


def extract_chinese(s):
    """Return a list of every contiguous run of Chinese characters in *s*."""
    return RE_CHINESE.findall(s)
【python】Mysql读写
|
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
|
#### Write to MySQL ####
import time

import MySQLdb
import MySQLdb as mdb


def write_bd_m_uv(sql_bd_m_uv):
    """Insert one row (today's date plus the joined values) into bd_m_uv.

    *sql_bd_m_uv* is a sequence of SQL value fragments joined with commas,
    exactly as in the original script.  Commits on success, rolls back on
    any database error, and always closes the connection.
    """
    today = time.strftime('%Y-%m-%d', time.localtime(time.time()))
    # Fixed: the original connect() call had garbled smart quotes
    # ("localhost","root","”,”seo_data") — arguments are
    # (host, user, password, database).
    db = MySQLdb.connect("localhost", "root", "", "seo_data", charset="utf8")
    try:
        cursor = db.cursor()
        # NOTE(review): string-built SQL; acceptable only because the values
        # are generated internally — use parameterized queries for any
        # externally supplied data.
        sql = '''INSERT INTO bd_m_uv VALUES ("%s",%s)''' % (today, ','.join(sql_bd_m_uv))
        try:
            cursor.execute(sql)
            db.commit()
            print('done')
        except MySQLdb.Error:
            # Was a bare "except:"; keep the best-effort rollback but only
            # for database errors so programming bugs still surface.
            db.rollback()
    finally:
        db.close()


#### Read from MySQL ####
def read_hx():
    """Print the first column of every row of table ``hx`` and return the rows."""
    con = mdb.connect('localhost', 'root', '', 'url_push', charset='utf8')
    # "with con" commits/rolls back the transaction (MySQLdb connection
    # context manager); it does not close the connection.
    with con:
        cur = con.cursor()
        cur.execute("select * from hx")
        rows = cur.fetchall()
        for row in rows:
            print(row[0])
    return rows
【python】判断当前字符串是否全部为中文
|
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
|
# coding:utf-8
def check_contain_chinese(check_str):
    """Return 1 if every character of *check_str* is Chinese, else 0.

    Accepts ``str`` or UTF-8 ``bytes`` (the original Python 2 code called
    ``.decode('utf-8')`` unconditionally).  Keeps the original 1/0 return
    convention for backward compatibility; an empty string yields 1, as
    the original's counters (m == 0) did.
    """
    if isinstance(check_str, bytes):
        check_str = check_str.decode('utf-8')
    # all() over an empty sequence is True, matching the original behavior.
    return 1 if all(u'\u4e00' <= ch <= u'\u9fff' for ch in check_str) else 0
【python】计算正文字数
|
1
2
3
4
5
6
|
import re


def count_words(newcontent):
    """Return the character count of *newcontent*'s text body.

    Two passes, as in the original snippet: first strip whitespace and
    CJK/ASCII punctuation, then strip any remaining HTML tags.  The Python 2
    ``.decode("utf8")`` calls are gone — Python 3 strings are already unicode.
    """
    # Whitespace + common ASCII and full-width Chinese punctuation.
    text = re.sub(r"[\s+\.\!\/_,$%^*(+\"\']+|[+——!,::。?、~@#¥%……&*()“”《》]+", "", newcontent)
    # Drop whatever HTML tags survived the punctuation pass.
    text2 = re.sub(r'<[^>]*?>', '', text)
    return len(text2)
【python】html块去除杂乱标签
|
1
2
3
4
5
6
7
8
9
10
11
|
import re


def clean_html(text):
    """Strip messy markup from an HTML body fragment, keeping only content.

    Same five regex passes as the original snippet (which operated on a free
    variable ``text``): drop <script> and <style> blocks, drop brace blocks
    (inline JS/CSS), remove every tag except <p>, <img> and </p>, trim
    leading/trailing whitespace, and normalize <p ...> to a bare <p>.
    """
    a = re.sub(r'<script.*?>([\s\S]*?)<\/script>', '', text)
    b = re.sub(r'<style.*?>([\s\S]*?)</style>', '', a)
    c = re.sub(r'{[\s\S]*}', '', b)
    # Keep only p/img tags; strip surrounding blank lines.
    d = re.sub(r'<(?!p|img|/p)[^<>]*?>', '', c).strip()
    # Normalize <p class="..."> etc. to plain <p>.
    e = re.sub(r'<p[^>]*?>', '<p>', d)
    return e
【python】将反斜杠u类型(\uXXXX)的字符串,转换为对应的unicode的字符串
|
1
2
3
4
|
def decode_slash_u(slash_u_str):
    """Convert literal ``\\uXXXX`` escape sequences into real unicode chars.

    e.g. ``"\\u5728"`` -> ``"在"``.  Python 3 ``str`` has no ``.decode()``,
    so we round-trip through bytes; the input must consist of ASCII escape
    sequences (which ``\\uXXXX`` text always does).
    """
    return slash_u_str.encode("ascii").decode("unicode-escape")


if __name__ == '__main__':
    slashUStr = "\\u0063\\u0072\\u0069\\u0066\\u0061\\u006E\\u0020\\u5728"
    print("decodedUniChars=", decode_slash_u(slashUStr))
【python】json、dict转化
|
1
2
3
4
5
6
7
8
9
|
# The third-party ``simplejson`` package is unnecessary on modern Python:
# the standard-library ``json`` module exposes the same loads/dumps API.
import json


def json_to_dict(user):
    """Parse a JSON string into the corresponding Python object (usually a dict)."""
    return json.loads(user)


def dict_to_jsonstr(d):
    """Serialize a Python dict (or any JSON-serializable object) to a JSON string."""
    return json.dumps(d)
【python】通过UA识别wap还是pc来访
|
1
2
3
4
5
6
7
8
9
10
11
|
import re

# Device-detection patterns (detectmobilebrowsers.com style).
# Fixed: the original combined raw strings with DOUBLED backslashes
# (r"bb\\d+", r"ac(er|oo|s\\-)", ...), which makes the regex engine look for
# a literal backslash before 'd'/'-'/'.'/'/', so those alternatives could
# never match.  Compiled once at module level instead of on every call.
_REG_B = re.compile(r"(android|bb\d+|meego).+mobile|avantgo|bada\/|blackberry|blazer|compal|elaine|fennec|hiptop|iemobile|ip(hone|od)|iris|kindle|lge |maemo|midp|mmp|mobile.+firefox|netfront|opera m(ob|in)i|palm( os)?|phone|p(ixi|re)\/|plucker|pocket|psp|series(4|6)0|symbian|treo|up\.(browser|link)|vodafone|wap|windows ce|xda|xiino", re.I | re.M)
_REG_V = re.compile(r"1207|6310|6590|3gso|4thp|50[1-6]i|770s|802s|a wa|abac|ac(er|oo|s\-)|ai(ko|rn)|al(av|ca|co)|amoi|an(ex|ny|yw)|aptu|ar(ch|go)|as(te|us)|attw|au(di|\-m|r |s )|avan|be(ck|ll|nq)|bi(lb|rd)|bl(ac|az)|br(e|v)w|bumb|bw\-(n|u)|c55\/|capi|ccwa|cdm\-|cell|chtm|cldc|cmd\-|co(mp|nd)|craw|da(it|ll|ng)|dbte|dc\-s|devi|dica|dmob|do(c|p)o|ds(12|\-d)|el(49|ai)|em(l2|ul)|er(ic|k0)|esl8|ez([4-7]0|os|wa|ze)|fetc|fly(\-|_)|g1 u|g560|gene|gf\-5|g\-mo|go(\.w|od)|gr(ad|un)|haie|hcit|hd\-(m|p|t)|hei\-|hi(pt|ta)|hp( i|ip)|hs\-c|ht(c(\-| |_|a|g|p|s|t)|tp)|hu(aw|tc)|i\-(20|go|ma)|i230|iac( |\-|\/)|ibro|idea|ig01|ikom|im1k|inno|ipaq|iris|ja(t|v)a|jbro|jemu|jigs|kddi|keji|kgt( |\/)|klon|kpt |kwc\-|kyo(c|k)|le(no|xi)|lg( g|\/(k|l|u)|50|54|\-[a-w])|libw|lynx|m1\-w|m3ga|m50\/|ma(te|ui|xo)|mc(01|21|ca)|m\-cr|me(rc|ri)|mi(o8|oa|ts)|mmef|mo(01|02|bi|de|do|t(\-| |o|v)|zz)|mt(50|p1|v )|mwbp|mywa|n10[0-2]|n20[2-3]|n30(0|2)|n50(0|2|5)|n7(0(0|1)|10)|ne((c|m)\-|on|tf|wf|wg|wt)|nok(6|i)|nzph|o2im|op(ti|wv)|oran|owg1|p800|pan(a|d|t)|pdxg|pg(13|\-([1-8]|c))|phil|pire|pl(ay|uc)|pn\-2|po(ck|rt|se)|prox|psio|pt\-g|qa\-a|qc(07|12|21|32|60|\-[2-7]|i\-)|qtek|r380|r600|raks|rim9|ro(ve|zo)|s55\/|sa(ge|ma|mm|ms|ny|va)|sc(01|h\-|oo|p\-)|sdk\/|se(c(\-|0|1)|47|mc|nd|ri)|sgh\-|shar|sie(\-|m)|sk\-0|sl(45|id)|sm(al|ar|b3|it|t5)|so(ft|ny)|sp(01|h\-|v\-|v )|sy(01|mb)|t2(18|50)|t6(00|10|18)|ta(gt|lk)|tcl\-|tdg\-|tel(i|m)|tim\-|t\-mo|to(pl|sh)|ts(70|m\-|m3|m5)|tx\-9|up(\.b|g1|si)|utst|v400|v750|veri|vi(rg|te)|vk(40|5[0-3]|\-v)|vm40|voda|vulc|vx(52|53|60|61|70|80|81|83|85|98)|w3c(\-| )|webc|whit|wi(g |nc|nw)|wmlb|wonu|x700|yas\-|your|zeto|zte\-", re.I | re.M)


def getUA(ua):
    """Classify a User-Agent string: return 'wap' for mobile devices, 'pc' otherwise.

    First pattern scans the whole UA for mobile keywords; the second checks
    only the first four characters for known device prefixes, as in the
    original detectmobilebrowsers heuristic.
    """
    if _REG_B.search(ua) or _REG_V.search(ua[0:4]):
        return 'wap'
    return 'pc'
【linux】根据文件名杀死进程
|
1
|
# Kill every process whose command line matches 'zq2.py'.
# pkill -f replaces the fragile ps|grep|grep -v grep|awk|xargs pipeline and
# cannot accidentally match its own grep.  SIGKILL (-9) kept from the
# original; drop it to allow graceful shutdown.
pkill -9 -f 'zq2.py'
【python】日期遍历
|
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
|
# Fixed: the original read "import datatime,time" — a typo for "datetime"
# that made the whole snippet raise ImportError ("time" was never used).
import datetime


def date_range(start, end, only_monday=False, input_format='%y%m%d', output_format='%y%m%d'):
    """Return every date from *start* to *end* inclusive, formatted as strings.

    e.g. date_range(140130, 140202) -> ['140130', '140131', '140201', '140202']

    start, end    -- dates as int or str, parsed with *input_format*
    only_monday   -- if True, keep only Mondays
    """
    start_dt = datetime.datetime.strptime(str(start), input_format)
    end_dt = datetime.datetime.strptime(str(end), input_format)
    one_day = datetime.timedelta(days=1)
    range_ = []
    d = start_dt
    while d <= end_dt:
        # '%w' gives '1' for Monday, matching the original check.
        if not only_monday or d.strftime('%w') == '1':
            range_.append(d.strftime(output_format))
        d += one_day
    return range_
【nginx】针对PC来访返回404
|
1
2
3
4
5
6
7
8
9
10
11
12
13
14
|
# Return 404 when the visitor is neither a mobile device nor a spider/bot.
set $mobile_rewrite do_not_perform;

# Pass 1: substring match over the full User-Agent for mobile keywords,
# plus spider/bot tokens so crawlers are never blocked.
if ($http_user_agent ~* "(android|bb\d+|meego).+mobile|avantgo|bada\/|blackberry|blazer|compal|elaine|fennec|hiptop|iemobile|ip(hone|od)|iris|kindle|lge |maemo|midp|mmp|mobile.+firefox|netfront|opera m(ob|in)i|palm( os)?|phone|p(ixi|re)\/|plucker|pocket|psp|series(4|6)0|symbian|treo|up\.(browser|link)|vodafone|wap|windows ce|xda|xiino|spider|Spider|bot|Bot") {
    set $mobile_rewrite perform;
}

# Pass 2: anchored prefix match against known 4-char device codes.
# (Rejoined the "sp(01|h\-|v\-|v )" alternative that was broken across a
# line in the pasted original.)
if ($http_user_agent ~* "^(1207|6310|6590|3gso|4thp|50[1-6]i|770s|802s|a wa|abac|ac(er|oo|s\-)|ai(ko|rn)|al(av|ca|co)|amoi|an(ex|ny|yw)|aptu|ar(ch|go)|as(te|us)|attw|au(di|\-m|r |s )|avan|be(ck|ll|nq)|bi(lb|rd)|bl(ac|az)|br(e|v)w|bumb|bw\-(n|u)|c55\/|capi|ccwa|cdm\-|cell|chtm|cldc|cmd\-|co(mp|nd)|craw|da(it|ll|ng)|dbte|dc\-s|devi|dica|dmob|do(c|p)o|ds(12|\-d)|el(49|ai)|em(l2|ul)|er(ic|k0)|esl8|ez([4-7]0|os|wa|ze)|fetc|fly(\-|_)|g1 u|g560|gene|gf\-5|g\-mo|go(\.w|od)|gr(ad|un)|haie|hcit|hd\-(m|p|t)|hei\-|hi(pt|ta)|hp( i|ip)|hs\-c|ht(c(\-| |_|a|g|p|s|t)|tp)|hu(aw|tc)|i\-(20|go|ma)|i230|iac( |\-|\/)|ibro|idea|ig01|ikom|im1k|inno|ipaq|iris|ja(t|v)a|jbro|jemu|jigs|kddi|keji|kgt( |\/)|klon|kpt |kwc\-|kyo(c|k)|le(no|xi)|lg( g|\/(k|l|u)|50|54|\-[a-w])|libw|lynx|m1\-w|m3ga|m50\/|ma(te|ui|xo)|mc(01|21|ca)|m\-cr|me(rc|ri)|mi(o8|oa|ts)|mmef|mo(01|02|bi|de|do|t(\-| |o|v)|zz)|mt(50|p1|v )|mwbp|mywa|n10[0-2]|n20[2-3]|n30(0|2)|n50(0|2|5)|n7(0(0|1)|10)|ne((c|m)\-|on|tf|wf|wg|wt)|nok(6|i)|nzph|o2im|op(ti|wv)|oran|owg1|p800|pan(a|d|t)|pdxg|pg(13|\-([1-8]|c))|phil|pire|pl(ay|uc)|pn\-2|po(ck|rt|se)|prox|psio|pt\-g|qa\-a|qc(07|12|21|32|60|\-[2-7]|i\-)|qtek|r380|r600|raks|rim9|ro(ve|zo)|s55\/|sa(ge|ma|mm|ms|ny|va)|sc(01|h\-|oo|p\-)|sdk\/|se(c(\-|0|1)|47|mc|nd|ri)|sgh\-|shar|sie(\-|m)|sk\-0|sl(45|id)|sm(al|ar|b3|it|t5)|so(ft|ny)|sp(01|h\-|v\-|v )|sy(01|mb)|t2(18|50)|t6(00|10|18)|ta(gt|lk)|tcl\-|tdg\-|tel(i|m)|tim\-|t\-mo|to(pl|sh)|ts(70|m\-|m3|m5)|tx\-9|up(\.b|g1|si)|utst|v400|v750|veri|vi(rg|te)|vk(40|5[0-3]|\-v)|vm40|voda|vulc|vx(52|53|60|61|70|80|81|83|85|98)|w3c(\-| )|webc|whit|wi(g |nc|nw)|wmlb|wonu|x700|yas\-|your|zeto|zte\-)") {
    set $mobile_rewrite perform;
}

if ($mobile_rewrite != perform) {
    return 404;
}
【python】新闻页正文抽取
|
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
|
# -*- coding: utf-8 -*-
'''GoGo闯@流量贩子:新闻页正文抽取v1.0'''
# News-page main-content extractor (Python 2): fetches a URL through a proxy
# and picks the densest text region of <body> with a sliding-window
# line-length heuristic.
import requests,multiprocessing,re,sys
import MySQLdb as mdb
reload(sys)
sys.setdefaultencoding('utf-8')

# Debug switch: non-zero prints intermediate values below.
DBUG = 0

# Pre-compiled patterns: <body> content, <script> blocks, <style> blocks,
# brace blocks (inline JS/CSS), and (reCOMM) HTML comments.
reBODY =re.compile( r'<body.*?>([\s\S]*?)<\/body>', re.I)
reBODY2 =re.compile( r'<script.*?>([\s\S]*?)<\/script>', re.I)
reBODY3 = re.compile(r'<style.*?>([\s\S]*?)</style>',re.I)
reBODY4 = re.compile(r'{[\s\S]*}',re.I)
reCOMM = r'<!--.*?-->'

def search(req,html):
    # Return group(1) of the first match of pattern `req` in `html`,
    # or the literal string 'no' when nothing matches.
    text = re.search(req,html)
    if text:
        data = text.group(1)
    else:
        data = 'no'
    return data

class Extractor():
    '''Fetch one URL and extract (title, main text); also picks a first image.'''

    def __init__(self, url = "", blockSize=3, timeout=5, image=False):
        self.url = url
        self.blockSize = blockSize  # sliding-window size, in lines
        # NOTE(review): timeout is stored but getRawPage() hard-codes
        # timeout=10 in the requests.get call — confirm which is intended.
        self.timeout = timeout
        self.saveImage = image
        self.rawPage = ""
        self.ctexts = []   # page split into lines
        self.cblocks = []  # summed text length per sliding window

    def getRawPage(self):
        '''Download self.url via the proxy tunnel; return (status_code, html).'''
        # Host header = domain part of the URL (scheme stripped first).
        host = search('^([^/]*?)/',re.sub(r'(https|http)://','',self.url))
        headers = {
            "Accept":"text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
            "Accept-Encoding":"gzip, deflate, sdch",
            "Accept-Language":"zh-CN,zh;q=0.8,en;q=0.6",
            "Cache-Control":"no-cache",
            "Connection":"keep-alive",
            "Host":host,
            "Pragma":"no-cache",
            "Upgrade-Insecure-Requests":"1",
            "User-Agent":"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36",
        }
        proxyHost = "proxy.abuyun.com"
        proxyPort = "9010"
        # Proxy tunnel auth credentials (placeholders in the original post).
        proxyUser = "天王盖地虎"
        proxyPass = "裤衩遮不住"
        proxyMeta = "http://%(user)s:%(pass)s@%(host)s:%(port)s" % {
            "host" : proxyHost,
            "port" : proxyPort,
            "user" : proxyUser,
            "pass" : proxyPass,
        }
        # NOTE(review): proxies is built but never passed to requests.get
        # below — the proxy is effectively unused as written.
        proxies = {
            "http" : proxyMeta,
            "https" : proxyMeta,
        }
        try:
            f = requests.get(self.url,headers=headers,timeout=10)
        except Exception as e:
            raise e
        code = f.status_code
        content = f.content
        '''修改python2这个王八蛋使用request对网页编码误识别为iso-8859-1的BUG'''
        # Workaround for requests misdetecting the page encoding as
        # iso-8859-1 on Python 2: if the declared/detected charset is not
        # utf-8, best-effort re-encode the body from gbk to utf-8.
        if f.encoding.lower() != 'utf-8':
            charset = re.compile(r'content="text/html;.?charset=(.*?)"').findall(content)
            coding = f.encoding.lower()
            print coding, f.headers['content-type']
            try:
                if len(charset) > 0 and charset[0].lower() != coding:
                    content = content.decode('gbk').encode('utf-8')
                elif coding == 'gbk' or coding == 'gb2312' or coding == 'iso-8859-1':
                    content = content.decode('gbk').encode('utf-8')
            except:
                # best-effort: keep the raw bytes if transcoding fails
                pass
        self.title = search("<title>([\s\S]*?)</title>",content).strip()
        return code,content

    def processTags(self):
        '''Strip markup from self.body, keeping only <p> tags and text;
        also extract the first <img> src into self.img.'''
        self.body = re.sub(reBODY, "", self.body)
        self.body = re.sub(reBODY2, "", self.body)
        self.body = re.sub(reBODY3,"", self.body)
        self.body = re.sub(reBODY4,"", self.body)
        self.body = re.sub(reCOMM, "", self.body)
        # Drop every tag except p/ /p, and everything after a
        # "下一篇" ("next article") marker.
        self.body = re.sub(r'<(?!p|/p)[^<>]*?>|下一篇.*','',self.body)
        # Normalize <p attr=...> to a bare <p>.
        self.body = re.sub(r'<p[^>]*?>','<p>',self.body)
        #self.body = re.sub(reTAG, "", self.body)
        # Remove layout whitespace but keep newlines (processBlocks splits on \n).
        self.body = re.sub(r'[\t\r\f\v]','',self.body)
        '''抽取图片'''
        # Extract the first image URL; make it absolute when it is relative.
        self.img = search(r'<img[\s\S]*?src=[\'|"]([\s\S]*?)[\'|"][\s\S]*?>',self.body)
        if 'http' not in self.img:
            self.img = '<img src="%s%s" >' % (search('^([^/]*?)/',re.sub(r'(https|http)://','',self.url)),self.img)

    def processBlocks(self):
        '''Return the main text: the run of lines around the window with the
        greatest total text length (classic block-distribution extraction).'''
        self.ctexts = self.body.split("\n")
        self.textLens = [len(text) for text in self.ctexts]
        self.cblocks = [0]*(len(self.ctexts) - self.blockSize - 1)
        lines = len(self.ctexts)
        # cblocks[j] = sum of line lengths in the window starting at line j.
        for i in range(self.blockSize):
            self.cblocks = list(map(lambda x,y: x+y, self.textLens[i : lines-1-self.blockSize+i], self.cblocks))
        maxTextLen = max(self.cblocks)
        if DBUG: print(maxTextLen)
        # Grow the region around the densest window until density falls to
        # the page minimum on both sides.
        self.start = self.end = self.cblocks.index(maxTextLen)
        while self.start > 0 and self.cblocks[self.start] > min(self.textLens):
            self.start -= 1
        while self.end < lines - self.blockSize and self.cblocks[self.end] > min(self.textLens):
            self.end += 1
        content = "".join(self.ctexts[self.start:self.end])
        return content

    def getContext(self):
        '''Fetch, clean and extract; return (title, main_text).'''
        code, self.rawPage = self.getRawPage()
        # NOTE(review): raises IndexError when the page has no <body> match.
        self.body = re.findall(reBODY, self.rawPage)[0]
        if DBUG: print(code, self.rawPage)
        self.processTags()
        return self.title,self.processBlocks()

def getZwIndex(url):
    # Convenience wrapper: extract (title, main_text) for one URL.
    # if __name__ == '__main__':
    ext = Extractor(url=url,blockSize=1, image=False)
    return ext.getContext()
    # pool = multiprocessing.Pool(processes=3)
    # for url in url_list:
    #     pool.apply_async(getIndex, (url, ))
    # pool.close()
    # pool.join()

浙公网安备 33010602011771号