什么乱七八糟的备份

|
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
|
#-------------------------shell-------------------------#

# Merge two files column-wise (space-separated).
paste -d " " file1 file2 > mergefile

# Sum the numbers in column E of lines whose column N matches a regex.
# FIX: the second awk program must be single-quoted — unquoted, the shell
# word-splits it and awk never sees the END block.
cat file|awk '{if($N~"regex")print $E}'|awk '{sum+=$0}END{print sum}'
# or: group by column 1 and sum column 2 per key
cat file|awk '{s[$1]+=$2}END{for(i in s){print i,s[i]}}'

# curl shows mojibake for Chinese pages (caused by gzip compression):
# request gzip explicitly and decompress the response yourself.
curl -H "Accept-Encoding: gzip" www.domain.net|gunzip|more

# Split a file into 8000-line chunks with numeric 1-digit suffixes (word0, word1, ...).
split -8000 file -d -a 1 word

# Join two 2-column files on column 1 into a 3-column file,
# padding values missing in either file with 0.
awk 'FNR==NR{a[$1]=$2}FNR<NR{a[$1]?a[$1]=a[$1]" "$2:a[$1]=a[$1]" 0 "$2}END{for(i in a)print i,a[i]}' file1 file2 > file3
# or (comma-separated input):
awk -F"," 'NR==FNR{a[$1]=$2;next}{print $0","a[$2]}' file1 file2|sed -E 's/,$/,0/g'

# Strip newline / carriage-return characters (\n \r \n\r).
perl -p -e 's/\n//' filename
|
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
|
#-------------------------aliyun/mysql-------------------------#

# Log in to the host with a private key file.
ssh -i {keyfile} -l work ***.***.***.***
# Copy a file to the host over SSH (instead of FTP).
scp -i {keyfile} (unknown) work@***.***.***.***:/{directory}

# Log in to MySQL.
mysql -u{***} -p{***} -h{***}
show databases;          # list all databases
use kanzhun;             # switch to the kanzhun database
show tables;             # list tables in the current database
describe table_name;     # show the structure of a table

# Randomly sample 5000 ids.
select id from company order by rand() limit 5000;

# Multi-table join (company / city / company_salary) with a random sample of 5000 rows.
select company.full_name,city.name from city,company,company_salary where company_salary.company_id = company.id and company_salary.city_code = city.code order by rand() limit 5000;

# Regex search.
# FIX: the original used curly "smart" quotes (‘^&’), which MySQL rejects —
# string literals must use straight quotes.
select infor from daoru where infor regexp '^&';

# Replace full-width commas with ASCII commas.
# (表名 / 字段名 are placeholders for the table and column names.)
update 表名 set 字段名=replace(字段名,',',',');

# Strip newline characters (CR, LF, CRLF) from a column.
# FIX: '+' is numeric addition in MySQL, not string concatenation —
# char(13)+char(10) evaluated as a number, never matching CRLF. Use concat().
update 表名 set 字段名= replace(字段名,concat(char(13),char(10)),'');
update 表名 set 字段名= replace(字段名,char(13),'');
update 表名 set 字段名= replace(字段名,char(10),'');
|
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
|
#-------------------------python-------------------------
#---Web page scraping---
# NOTE(review): this is Python 2 code (print statements, urllib2, StringIO,
# u'' literals) pasted from a blog; it will not run under Python 3 as-is.
import pycurl,StringIO,sys,chardet,random,requests,urllib,time,urllib2
def getHead():
    """Return a randomly chosen User-Agent string from uaList."""
    uaList = [
    'Mozilla/4.0+(compatible;+MSIE+6.0;+Windows+NT+5.1;+SV1;+.NET+CLR+1.1.4322;+TencentTraveler)',
    # NOTE(review): '......' below is a blog-paste placeholder for more
    # UA strings — it is not valid Python and must be replaced before running.
    ......
    ]
    headers = random.choice(uaList)
    return headers
# Check whether a proxy IP is usable.
def proxycheckone(proxy):
    """Probe a proxy by fetching a Baidu search page through it.

    Returns the proxy string on success, or [] on failure / captcha page.
    NOTE(review): success and failure return different types (str vs list);
    callers must handle both.
    """
    url='http://www.baidu.com/s?wd=python'
    proxy_url = proxy
    proxy_support = urllib2.ProxyHandler({'http': proxy_url}) # route the request through the proxy
    # NOTE(review): opener is built but never installed or used — urlopen
    # below uses the default opener, so the proxy is likely not applied.
    opener = urllib2.build_opener(proxy_support, urllib2.HTTPHandler)
    r=urllib2.Request(url)
    #r.add_header("Accept-Language","utf-8") # extra headers can help avoid 403 errors
    #r.add_header("Content-Type","text/html; charset=utf-8")
    #r.add_header("User-Agent","Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.2; .NET CLR 1.1.4322)")
    trycount=1
    while trycount<=2: # try at most twice
        try:
            T0=time.time() # start time of the connection attempt
            f=urllib2.urlopen(r,timeout=2)
            data=f.read()
            # The Baidu marker string in the body means the proxy reached
            # Baidu successfully; the verify.baidu.com URL means a captcha.
            if '百度搜索' in data and 'http://verify.baidu.com' not in data:
                T=time.time()-T0 # total connection time
                break
            else:return []
        except:
            time.sleep(1)
            trycount=trycount+1
    if trycount>2:
        return []
    else:
        print '地址:'+proxy+' 连接速度:'+str(T) # logged to a text file by the caller
        return proxy
def daili_ip(dailistr):
    """Pick a random proxy from a comma-separated proxy string."""
    daili_list = dailistr.split(',')
    ip = random.choice(daili_list)
    return ip
def getHtml(url,headers,ip):
    """Fetch url via pycurl through proxy `ip` with User-Agent `headers`.

    Retries forever on any error; returns the raw response body.
    """
    while 1:
        try:
            c = pycurl.Curl()
            c.setopt(pycurl.MAXREDIRS,5)
            c.setopt(pycurl.REFERER, url)
            c.setopt(pycurl.FOLLOWLOCATION, True)
            c.setopt(pycurl.CONNECTTIMEOUT, 60)
            c.setopt(pycurl.TIMEOUT,120)
            c.setopt(pycurl.ENCODING, 'gzip,deflate')
            c.setopt(pycurl.USERAGENT,headers)
            c.setopt(c.PROXY,ip)
            c.fp = StringIO.StringIO()
            c.setopt(pycurl.URL, url)
            #c.setopt(pycurl.HTTPHEADER,["Accept-Encoding:gzip,deflate,sdch"])
            #c.setopt(pycurl.HTTPHEADER,header_list)
            c.setopt(c.WRITEFUNCTION, c.fp.write)
            c.perform()
            #code = c.getinfo(c.HTTP_CODE)  # would return the HTTP status code
            html = c.fp.getvalue()
            return html
        except:
            continue
# NOTE(review): daili_list and daili_list_str are never defined at module
# level in this paste — presumably a proxy list and an accumulator list were
# initialized in code the blog omitted.
for ip in daili_list:
    ceshi_ip_a = proxycheckone(ip)
    daili_list_str.append(ceshi_ip_a)
daili_str = ','.join(daili_list_str)
# NOTE(review): dailistr below is also undefined here — likely meant daili_str.
for url in open('url.txt'):
    content = getHtml(url,getHead(),daili_ip(dailistr))
    typeEncode = sys.getfilesystemencoding()
    # Detect the page encoding, then transcode to the filesystem encoding for printing.
    infoencode = chardet.detect(content).get('encoding','utf-8')
    html = content.decode(infoencode,'ignore').encode(typeEncode)
    print html
#---Image download---
import urllib,os
filepath=os.getcwd()
# NOTE(review): os.getcwd() always exists, so this mkdir never runs —
# presumably filepath was meant to be a subdirectory.
if os.path.exists(filepath) is False:
    os.mkdir(filepath)
x=1
print u'爬虫准备就绪...'
# Each line of logo_url.txt is comma-separated; field 1 is the id, field 2 the image URL.
for line in open('logo_url.txt'):
    line = line.strip()
    id = line.split(',')[1]
    imgurl = line.split(',')[2]
    temp= '%s.jpg' % id
    print u'正在下载第%s张图片' % x
    print imgurl
    try:
        urllib.urlretrieve(imgurl,temp)
        x+=1
    except:
        continue
print u'图片下载完毕,保存路径为'+filepath
来自于闯哥www.kaopuseo.com

浙公网安备 33010602011771号