程序开发常用轮子

这是一篇转载文章
【python】MD5生成

1
2
3
4
5
# Compute the MD5 hex digest of a string (Python 2: md5.update accepts str).
import hashlib
m2 = hashlib.md5()
# NOTE(review): `src` is not defined in this snippet -- the surrounding
# context must supply it as a byte string before this runs.
m2.update(src)
 
print m2.hexdigest()

【python】CSV读写

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
@@@@ 写入并生成csv文件 @@@@
# coding: utf-8
# Write rows to a new CSV file (Python 2: file() builtin, binary 'wb' mode
# so the csv module controls line endings itself).
import csv
 
csvfile = file('csv_test.csv', 'wb')
writer = csv.writer(csvfile)
# Header row: name, age, phone (kept in Chinese -- runtime data).
writer.writerow(['姓名', '年龄', '电话'])
data = [
    ('小河', '25', '1234567'),
    ('小芳', '18', '789456')]
 
writer.writerows(data)
csvfile.close()
 
@@@@ 读取csv文件 @@@@
 
# coding: utf-8
# Read the CSV back and print every row (Python 2).
import csv
csvfile = file('csv_test.csv', 'rb')
reader = csv.reader(csvfile)
 
# Each `line` is a list of column strings for one CSV row.
for line in reader:
    print line
 
csvfile.close()

【shell】shell批量添加文件后缀名

1
# Append ".txt" to every regular file in the current directory.
# The [ -f ] test skips directories (the original renamed those too);
# "--" protects names starting with "-"; quoting "$i" keeps names with
# spaces intact.
for i in *; do [ -f "$i" ] && mv -- "$i" "$i.txt"; done

【python】提取中文

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
#coding: utf-8
 
import sys,re
# Python 2 hack: reinstate sys.setdefaultencoding (removed at startup) so
# implicit str<->unicode conversions use UTF-8 instead of ASCII.
reload(sys)
sys.setdefaultencoding('utf8')
 
# Mixed-language sample text used to demonstrate CJK extraction.
s =   """
    en: Regular expression is a powerful tool for manipulating text.
    zh: 汉语是世界上最优美的语言,正则表达式是一个很有用的工具
    jp: 正規表現は非常に役に立つツールテキストを操作することです。
    jp-char: あアいイうウえエおオ
    kr:정규 표현식은 매우 유용한 도구 텍스트를 조작하는 것입니다.
    """
 
# Decode to unicode so the \u4e00-\u9fa5 range compares code points.
s = unicode(s)
# Matches runs of CJK Unified Ideographs (the common Chinese range).
re_words = re.compile(u"[\u4e00-\u9fa5]+")
m =  re_words.search(s,0)
print "unicode 中文"
print "--------"
print m
# NOTE(review): m.group() would raise AttributeError if there were no match
# (m is None); safe here because the sample text contains Chinese.
print m.group()
 
# Collect every Chinese run, not just the first match.
res = re.findall(re_words, s) 
     
# Print all matched substrings.
if res:
    print "There are %d parts:\n" % len(res)
    for r in res:
        print r
print "--------\n"

【python】Mysql读写

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
@@@@ 写入Mysql @@@@
 
import sys,time,os,smtplib,MySQLdb
today = time.strftime('%Y-%m-%d',time.localtime(time.time()))
db = MySQLdb.connect("localhost","root","”,”seo_data",charset="utf8")
 
cursor = db.cursor()
sql = '''INSERT INTO bd_m_uv VALUES ("%s",%s)''' % (today,','.join(sql_bd_m_uv))
 
try:
    cursor.execute(sql)
    db.commit()
    print 'done'
except:
    db.rollback()
 
@@@@ 读取Mysql @@@@
import csv,re,sys
import MySQLdb as mdb
 
reload(sys)
sys.setdefaultencoding('utf8')
 
con = mdb.connect('localhost','root','','url_push',charset='utf8');
with con:
    cur = con.cursor()
    cur.execute("select * from hx")
    numrows = int(cur.rowcount)
    for i in range(numrows):
        row = cur.fetchone()
        print row[0]

【python】判断当前字符串是否全部为中文

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
#coding:utf-8
 
import sys
# Python 2 hack: restore sys.setdefaultencoding so implicit str/unicode
# conversions default to UTF-8 instead of ASCII.
reload(sys)
sys.setdefaultencoding('utf8')
 
def check_contain_chinese(check_str):
    """Return 1 if every character of the UTF-8 encoded `check_str` is a
    CJK ideograph (U+4E00..U+9FFF), else 0.

    An empty string yields 1, matching the original "no non-Chinese
    character found" rule.  Returns bool (True == 1, False == 0), which is
    backward compatible with the original int results.  The two manual
    counters are replaced by a short-circuiting all().
    """
    decoded = check_str.decode('utf-8')
    return all(u'\u4e00' <= ch <= u'\u9fff' for ch in decoded)

【python】计算正文字数

1
2
3
4
5
6
'''Count the characters of the article body text.'''
# Strip ASCII and full-width Chinese punctuation, then remove HTML tags,
# and count what is left.  NOTE(review): `re` and `newcontent` must be
# provided by the surrounding code -- they are not defined in this snippet.
text = re.sub("[\s+\.\!\/_,$%^*(+\"\']+|[+——!,::。?、~@#¥%……&*()“”《》]+".decode("utf8"), "".decode("utf8"),newcontent)
 
text2 = re.sub('<[^>]*?>','',text) 
 
# len() on a unicode string counts characters, not bytes.
words_number = len(text2)

【python】html块去除杂乱标签

1
2
3
4
5
6
7
8
9
10
11
# `text` is the article-body variable (assumed defined by surrounding code).
 
# Strip <script>...</script> blocks, including their contents.
a = re.sub(r'<script.*?>([\s\S]*?)<\/script>','',text)
 
# Strip <style>...</style> blocks.
b = re.sub(r'<style.*?>([\s\S]*?)</style>','',a)
 
# Strip brace-delimited leftovers (inline JS/CSS).  NOTE(review): the greedy
# [\s\S]* removes everything from the first '{' to the last '}'.
c = re.sub(r'{[\s\S]*}','',b)
 
d = re.sub(r'<(?!p|img|/p)[^<>]*?>','',c).strip()   # Drop every tag except <p>, <img> and </p>, and trim surrounding whitespace.  NOTE(review): the original comment also claimed a single-to-double quote conversion, which this code does not perform.
 
e = re.sub(r'<p[^>]*?>','<p>',d)     # Normalize <p ...> open tags to a bare <p>.

【python】将反斜杠u类型(\uXXXX)的字符串,转换为对应的unicode的字符串

1
2
3
4
# A str containing literal backslash-u escape sequences ("\uXXXX").
slashUStr = "\\u0063\\u0072\\u0069\\u0066\\u0061\\u006E\\u0020\\u5728"
# Python 2: str.decode("unicode-escape") interprets the \uXXXX sequences and
# returns the corresponding unicode string.
decodedUniChars = slashUStr.decode("unicode-escape")
 
print "decodedUniChars=",decodedUniChars

【python】json、dict转化

1
2
3
4
5
6
7
8
9
# NOTE(review): simplejson is a third-party package; the stdlib `json`
# module offers the same loads/dumps API.
import simplejson
 
# Parse a JSON string into a dict.  `user` must be a JSON document supplied
# by surrounding code -- it is not defined in this snippet.
json_2_dict = simplejson.loads(user)
print json_2_dict
 
# Serialize the dict back into a JSON string.
dict_2_jsonstr = simplejson.dumps(json_2_dict)
print dict_2_jsonstr

【python】通过UA识别wap还是pc来访

1
2
3
4
5
6
7
8
9
10
11
def getUA(ua):
    """Classify a User-Agent string: return 'wap' for mobile, 'pc' otherwise.

    Based on the detectmobilebrowsers.com pattern pair: reg_b is searched in
    the whole UA, reg_v only against its first four characters (device
    prefixes).

    Fixed: the original used DOUBLE backslashes inside raw strings
    (r"...bb\\d+...", r"...ac(er|oo|s\\-)..."), which makes the regex match a
    literal backslash followed by 'd'/'-' etc., so most device tokens could
    never match.  Raw strings need single backslashes (compare the working
    single-backslash copies in the nginx config version of this check).
    """
    reg_b = re.compile(r"(android|bb\d+|meego).+mobile|avantgo|bada\/|blackberry|blazer|compal|elaine|fennec|hiptop|iemobile|ip(hone|od)|iris|kindle|lge |maemo|midp|mmp|mobile.+firefox|netfront|opera m(ob|in)i|palm( os)?|phone|p(ixi|re)\/|plucker|pocket|psp|series(4|6)0|symbian|treo|up\.(browser|link)|vodafone|wap|windows ce|xda|xiino", re.I|re.M)

    reg_v = re.compile(r"1207|6310|6590|3gso|4thp|50[1-6]i|770s|802s|a wa|abac|ac(er|oo|s\-)|ai(ko|rn)|al(av|ca|co)|amoi|an(ex|ny|yw)|aptu|ar(ch|go)|as(te|us)|attw|au(di|\-m|r |s )|avan|be(ck|ll|nq)|bi(lb|rd)|bl(ac|az)|br(e|v)w|bumb|bw\-(n|u)|c55\/|capi|ccwa|cdm\-|cell|chtm|cldc|cmd\-|co(mp|nd)|craw|da(it|ll|ng)|dbte|dc\-s|devi|dica|dmob|do(c|p)o|ds(12|\-d)|el(49|ai)|em(l2|ul)|er(ic|k0)|esl8|ez([4-7]0|os|wa|ze)|fetc|fly(\-|_)|g1 u|g560|gene|gf\-5|g\-mo|go(\.w|od)|gr(ad|un)|haie|hcit|hd\-(m|p|t)|hei\-|hi(pt|ta)|hp( i|ip)|hs\-c|ht(c(\-| |_|a|g|p|s|t)|tp)|hu(aw|tc)|i\-(20|go|ma)|i230|iac( |\-|\/)|ibro|idea|ig01|ikom|im1k|inno|ipaq|iris|ja(t|v)a|jbro|jemu|jigs|kddi|keji|kgt( |\/)|klon|kpt |kwc\-|kyo(c|k)|le(no|xi)|lg( g|\/(k|l|u)|50|54|\-[a-w])|libw|lynx|m1\-w|m3ga|m50\/|ma(te|ui|xo)|mc(01|21|ca)|m\-cr|me(rc|ri)|mi(o8|oa|ts)|mmef|mo(01|02|bi|de|do|t(\-| |o|v)|zz)|mt(50|p1|v )|mwbp|mywa|n10[0-2]|n20[2-3]|n30(0|2)|n50(0|2|5)|n7(0(0|1)|10)|ne((c|m)\-|on|tf|wf|wg|wt)|nok(6|i)|nzph|o2im|op(ti|wv)|oran|owg1|p800|pan(a|d|t)|pdxg|pg(13|\-([1-8]|c))|phil|pire|pl(ay|uc)|pn\-2|po(ck|rt|se)|prox|psio|pt\-g|qa\-a|qc(07|12|21|32|60|\-[2-7]|i\-)|qtek|r380|r600|raks|rim9|ro(ve|zo)|s55\/|sa(ge|ma|mm|ms|ny|va)|sc(01|h\-|oo|p\-)|sdk\/|se(c(\-|0|1)|47|mc|nd|ri)|sgh\-|shar|sie(\-|m)|sk\-0|sl(45|id)|sm(al|ar|b3|it|t5)|so(ft|ny)|sp(01|h\-|v\-|v )|sy(01|mb)|t2(18|50)|t6(00|10|18)|ta(gt|lk)|tcl\-|tdg\-|tel(i|m)|tim\-|t\-mo|to(pl|sh)|ts(70|m\-|m3|m5)|tx\-9|up(\.b|g1|si)|utst|v400|v750|veri|vi(rg|te)|vk(40|5[0-3]|\-v)|vm40|voda|vulc|vx(52|53|60|61|70|80|81|83|85|98)|w3c(\-| )|webc|whit|wi(g |nc|nw)|wmlb|wonu|x700|yas\-|your|zeto|zte\-", re.I|re.M)

    b = reg_b.search(ua)
    v = reg_v.search(ua[0:4])
    if b or v:
        return 'wap'
    else:
        return 'pc'

【linux】根据文件名杀死进程

1
# Kill (SIGKILL) every process whose command line mentions zq2.py.
# pkill -f replaces the fragile ps|grep|grep -v grep|awk|xargs pipeline:
# no self-match to filter out, and no bare `kill -9` invocation (which
# errors) when nothing matches, as plain xargs without -r would produce.
pkill -9 -f 'zq2.py'

【python】日期遍历

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
import datetime

def date_range(start, end, only_monday=False, input_format='%y%m%d', output_format='%y%m%d'):
    '''Return the list of dates from `start` to `end` (inclusive), each
    formatted with `output_format`.

    e.g. date_range(140130, 140202) -> ['140130', '140131', '140201', '140202']

    If only_monday is True, only Mondays are kept.  `start`/`end` may be
    ints or strings in `input_format`.

    Fixes vs. the original: the misspelled module name ("import datatime",
    an ImportError) and the docstring being indented one space deeper than
    the function body (an IndentationError in CPython).
    '''
    begin = datetime.datetime.strptime(str(start), input_format)
    finish = datetime.datetime.strptime(str(end), input_format)
    one_day = datetime.timedelta(days=1)

    range_ = []
    d = begin
    while d <= finish:
        # '%w' is the weekday number with Sunday=0, so Monday == '1'.
        if not only_monday or d.strftime('%w') == '1':
            range_.append(d.strftime(output_format))
        d += one_day
    return range_

【nginx】针对PC来访返回404

1
2
3
4
5
6
7
8
9
10
11
12
13
14
# Return HTTP 404 to visitors that are neither mobile devices nor spiders.
set $mobile_rewrite do_not_perform;
 
# Mobile/spider detection on the full User-Agent string (case-insensitive).
if ($http_user_agent ~* "(android|bb\d+|meego).+mobile|avantgo|bada\/|blackberry|blazer|compal|elaine|fennec|hiptop|iemobile|ip(hone|od)|iris|kindle|lge |maemo|midp|mmp|mobile.+firefox|netfront|opera m(ob|in)i|palm( os)?|phone|p(ixi|re)\/|plucker|pocket|psp|series(4|6)0|symbian|treo|up\.(browser|link)|vodafone|wap|windows ce|xda|xiino|spider|Spider|bot|Bot") {
  set $mobile_rewrite perform;
}
 
# Second pass: device-prefix tokens anchored to the start of the User-Agent.
if ($http_user_agent ~* "^(1207|6310|6590|3gso|4thp|50[1-6]i|770s|802s|a wa|abac|ac(er|oo|s\-)|ai(ko|rn)|al(av|ca|co)|amoi|an(ex|ny|yw)|aptu|ar(ch|go)|as(te|us)|attw|au(di|\-m|r |s )|avan|be(ck|ll|nq)|bi(lb|rd)|bl(ac|az)|br(e|v)w|bumb|bw\-(n|u)|c55\/|capi|ccwa|cdm\-|cell|chtm|cldc|cmd\-|co(mp|nd)|craw|da(it|ll|ng)|dbte|dc\-s|devi|dica|dmob|do(c|p)o|ds(12|\-d)|el(49|ai)|em(l2|ul)|er(ic|k0)|esl8|ez([4-7]0|os|wa|ze)|fetc|fly(\-|_)|g1 u|g560|gene|gf\-5|g\-mo|go(\.w|od)|gr(ad|un)|haie|hcit|hd\-(m|p|t)|hei\-|hi(pt|ta)|hp( i|ip)|hs\-c|ht(c(\-| |_|a|g|p|s|t)|tp)|hu(aw|tc)|i\-(20|go|ma)|i230|iac( |\-|\/)|ibro|idea|ig01|ikom|im1k|inno|ipaq|iris|ja(t|v)a|jbro|jemu|jigs|kddi|keji|kgt( |\/)|klon|kpt |kwc\-|kyo(c|k)|le(no|xi)|lg( g|\/(k|l|u)|50|54|\-[a-w])|libw|lynx|m1\-w|m3ga|m50\/|ma(te|ui|xo)|mc(01|21|ca)|m\-cr|me(rc|ri)|mi(o8|oa|ts)|mmef|mo(01|02|bi|de|do|t(\-| |o|v)|zz)|mt(50|p1|v )|mwbp|mywa|n10[0-2]|n20[2-3]|n30(0|2)|n50(0|2|5)|n7(0(0|1)|10)|ne((c|m)\-|on|tf|wf|wg|wt)|nok(6|i)|nzph|o2im|op(ti|wv)|oran|owg1|p800|pan(a|d|t)|pdxg|pg(13|\-([1-8]|c))|phil|pire|pl(ay|uc)|pn\-2|po(ck|rt|se)|prox|psio|pt\-g|qa\-a|qc(07|12|21|32|60|\-[2-7]|i\-)|qtek|r380|r600|raks|rim9|ro(ve|zo)|s55\/|sa(ge|ma|mm|ms|ny|va)|sc(01|h\-|oo|p\-)|sdk\/|se(c(\-|0|1)|47|mc|nd|ri)|sgh\-|shar|sie(\-|m)|sk\-0|sl(45|id)|sm(al|ar|b3|it|t5)|so(ft|ny)|sp(01|h\-|v\-|v )|sy(01|mb)|t2(18|50)|t6(00|10|18)|ta(gt|lk)|tcl\-|tdg\-|tel(i|m)|tim\-|t\-mo|to(pl|sh)|ts(70|m\-|m3|m5)|tx\-9|up(\.b|g1|si)|utst|v400|v750|veri|vi(rg|te)|vk(40|5[0-3]|\-v)|vm40|voda|vulc|vx(52|53|60|61|70|80|81|83|85|98)|w3c(\-| )|webc|whit|wi(g |nc|nw)|wmlb|wonu|x700|yas\-|your|zeto|zte\-)") {
  set $mobile_rewrite perform;
}
 
# Anything still not classified as mobile/spider gets a 404.
if ($mobile_rewrite != perform) {
    return 404;
}

【python】新闻页正文抽取

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
# -*- coding: utf-8 -*-
 
'''News-page article body extractor v1.0 (GoGo闯@流量贩子)'''
 
import requests,multiprocessing,re,sys
import MySQLdb as mdb
 
# Python 2 hack: restore sys.setdefaultencoding so implicit str/unicode
# conversions default to UTF-8.
reload(sys)
sys.setdefaultencoding('utf-8')
 
DBUG   = 0  # set to 1 to enable debug printing in the Extractor class
 
# Pre-compiled cleanup patterns: <body> extraction, then script/style/brace
# stripping.  NOTE(review): reBODY4 is greedy -- it removes everything from
# the first '{' to the last '}'.  reCOMM (HTML comments) stays a plain
# pattern string, compiled on use.
reBODY =re.compile( r'<body.*?>([\s\S]*?)<\/body>', re.I)
reBODY2 =re.compile( r'<script.*?>([\s\S]*?)<\/script>', re.I)
reBODY3 = re.compile(r'<style.*?>([\s\S]*?)</style>',re.I)
reBODY4 = re.compile(r'{[\s\S]*}',re.I)
reCOMM = r'<!--.*?-->'
 
def search(req,html):
    """Search `html` for pattern `req`; return capture group 1 of the first
    match, or the literal string 'no' when nothing matches."""
    found = re.search(req, html)
    return found.group(1) if found else 'no'
 
class Extractor():
    """Heuristic news-article body extractor (Python 2 only).

    Downloads a page, strips boilerplate markup, then picks the densest run
    of text lines as the article body (text-density heuristic controlled by
    blockSize).  Depends on the module-level search() helper, the
    reBODY*/reCOMM patterns and the DBUG flag.
    """
    def __init__(self, url = "", blockSize=3, timeout=5, image=False):
        # blockSize: number of consecutive lines summed when scoring text
        # density.  NOTE(review): `timeout` and `image`/saveImage are stored
        # but never used -- getRawPage() hard-codes timeout=10.
        self.url       = url
        self.blockSize = blockSize
        self.timeout   = timeout
        self.saveImage = image
        self.rawPage   = ""
        self.ctexts    = []
        self.cblocks   = []
 
    def getRawPage(self):
        """Fetch self.url; return (status_code, content) and set self.title."""
 
        # Host portion of the URL, used for the Host request header.
        host = search('^([^/]*?)/',re.sub(r'(https|http)://','',self.url))
 
        headers = {
            "Accept":"text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
            "Accept-Encoding":"gzip, deflate, sdch",
            "Accept-Language":"zh-CN,zh;q=0.8,en;q=0.6",
            "Cache-Control":"no-cache",
            "Connection":"keep-alive",
            "Host":host,
            "Pragma":"no-cache",
            "Upgrade-Insecure-Requests":"1",
            "User-Agent":"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36",
        }
 
        proxyHost = "proxy.abuyun.com"
        proxyPort = "9010"
 
        # Proxy tunnel credentials (placeholder strings in the original).
        proxyUser = "天王盖地虎"
        proxyPass = "裤衩遮不住"
 
        proxyMeta = "http://%(user)s:%(pass)s@%(host)s:%(port)s" % {
          "host" : proxyHost,
          "port" : proxyPort,
          "user" : proxyUser,
          "pass" : proxyPass,
        }
 
        # NOTE(review): `proxies` is built but never passed to requests.get
        # below, so the proxy configuration is effectively unused.
        proxies = {
            "http"  : proxyMeta,
            "https" : proxyMeta,
        }
 
        try:
            f = requests.get(self.url,headers=headers,timeout=10)
        except Exception as e:
            raise e
 
        code = f.status_code
        content = f.content
 
        # Work around requests (under Python 2) mis-detecting the page
        # encoding as iso-8859-1: re-decode GBK-family pages to UTF-8.
        '''修改python2这个王八蛋使用request对网页编码误识别为iso-8859-1的BUG'''
        if f.encoding.lower() != 'utf-8':
            charset = re.compile(r'content="text/html;.?charset=(.*?)"').findall(content)
            coding = f.encoding.lower()
            print coding, f.headers['content-type']
            try:
                if len(charset) > 0 and charset[0].lower() != coding:
                    content = content.decode('gbk').encode('utf-8')
                elif coding == 'gbk' or coding == 'gb2312' or coding == 'iso-8859-1':
                    content = content.decode('gbk').encode('utf-8')
            except:
                # Best-effort: keep the raw bytes if the GBK re-decode fails.
                pass
        self.title = search("<title>([\s\S]*?)</title>",content).strip()
        return code,content
 
    def processTags(self):
        """Strip scripts, styles, comments and most tags from self.body,
        keeping <p> tags; also pull the first <img> URL into self.img."""
 
        self.body = re.sub(reBODY, "", self.body)
        self.body = re.sub(reBODY2, "", self.body)
        self.body = re.sub(reBODY3,"", self.body)
        self.body = re.sub(reBODY4,"", self.body)
        self.body = re.sub(reCOMM, "", self.body)
        # Drop every tag except <p>/</p>, plus everything after the
        # "下一篇" (next-post) link text.
        self.body = re.sub(r'<(?!p|/p)[^<>]*?>|下一篇.*','',self.body)
        # Normalize <p ...> open tags to a bare <p>.
        self.body = re.sub(r'<p[^>]*?>','<p>',self.body)
        #self.body = re.sub(reTAG, "", self.body)
        self.body = re.sub(r'[\t\r\f\v]','',self.body)
 
        # Extract the first image URL; prefix the host when it is relative.
        '''抽取图片'''
        self.img = search(r'<img[\s\S]*?src=[\'|"]([\s\S]*?)[\'|"][\s\S]*?>',self.body)
        if 'http' not in self.img:
            self.img = '<img src="%s%s" >' % (search('^([^/]*?)/',re.sub(r'(https|http)://','',self.url)),self.img)
 
    def processBlocks(self):
        """Score each window of blockSize consecutive lines by total text
        length; return the contiguous region around the densest window."""
        self.ctexts   = self.body.split("\n")
        self.textLens = [len(text) for text in self.ctexts]
 
        # cblocks[i] accumulates the length of lines i .. i+blockSize-1.
        self.cblocks  = [0]*(len(self.ctexts) - self.blockSize - 1)
        lines = len(self.ctexts)
        for i in range(self.blockSize):
            self.cblocks = list(map(lambda x,y: x+y, self.textLens[i : lines-1-self.blockSize+i], self.cblocks))
 
        maxTextLen = max(self.cblocks)
 
        if DBUG: print(maxTextLen)
 
        # Grow the window outwards while the density stays above the
        # shortest line's length.
        self.start = self.end = self.cblocks.index(maxTextLen)
        while self.start > 0 and self.cblocks[self.start] > min(self.textLens):
            self.start -= 1
        while self.end < lines - self.blockSize and self.cblocks[self.end] > min(self.textLens):
            self.end += 1
 
        content = "".join(self.ctexts[self.start:self.end])
        return content
 
    def getContext(self):
        """Fetch the page and return (title, extracted body text)."""
        code, self.rawPage = self.getRawPage()
        # Take the first <body>...</body> section as the working text;
        # raises IndexError when the page has no <body> tag.
        self.body = re.findall(reBODY, self.rawPage)[0]
 
        if DBUG: print(code, self.rawPage)
 
        self.processTags()
        return self.title,self.processBlocks()
 
def getZwIndex(url):
    """Convenience wrapper: extract (title, body) from `url` using an
    Extractor configured with a block size of 1."""
    # if __name__ == '__main__':
    ext = Extractor(url=url,blockSize=1, image=False)
    return ext.getContext()
 
 
# pool = multiprocessing.Pool(processes=3)
# for url in url_list:
#     pool.apply_async(getIndex, (url, ))
# pool.close()
# pool.join()

 

posted @ 2017-04-13 17:07  l4617  阅读(278)  评论(0)    收藏  举报