对数据html文本 的处理
对数据 HTML 文本的处理：提取文字、图片、分句
'''
Scratch SQL notes for exporting the rows this script parses.
Nothing in this string is executed by the visible Python code.

The literal ',xiaole20180410SPLIT,' column matches the separator string the
script splits on (xlsplit_str).

NOTE(review): every TOP n below has no ORDER BY, so which rows are returned is
not guaranteed to be stable between runs.

-- Sample two flagged rows (LIMIT syntax; the remaining queries use TOP).
SELECT * FROM Info_Roles WHERE Flag=1 LIMIT 2;

-- Paging with TOP: page x with page size y, skipping the first (x-1)*y keys.
select top y * from <table> where <primary_key> not in(select top (x-1)*y <primary_key> from <table>)
-- If the table has no primary key, copy it into a temporary table with an
-- added identity column. x and y can be variables.
select id=identity(int,1,1),* into #tb from <table>
select * from #tb where id between (x-1)*y and x*y-1
-- NOTE(review): the identity column starts at 1, so for x=1 the range above is
-- 0..y-1 and matches only y-1 rows; `between (x-1)*y+1 and x*y` yields full pages.

-- 1000 Info_IDs from Info_Roles.
select top 1000 Info_ID from Info_Roles
-- 2000 content rows whose Info_ID is NOT among those 1000.
-- NOTE(review): NOT IN returns no rows if the subquery yields a NULL Info_ID;
-- confirm the column is non-nullable or switch to NOT EXISTS.
select top 2000 Info_ID,',xiaole20180410SPLIT,',content from Info_Content where Info_ID not in( select top 1000 Info_ID from Info_Roles ) ;
-- Upper-cased content, and the position of 'IMG' inside it (0 = not found).
select top 399 Info_ID,',xiaole20180410SPLIT,',UPPER(content) from Info_Content ;
select top 399 CHARINDEX('IMG',UPPER(content)) from Info_Content ;
-- Only rows whose content mentions 'IMG' (case-insensitive via UPPER).
select top 15 Info_ID,',xiaole20180410SPLIT,',content from Info_Content where CHARINDEX('IMG',UPPER(content))>0;
-- Same filter, restricted to Info_IDs present in Info_Roles.
select top 15 Info_ID,',xiaole20180410SPLIT,',content from Info_Content where
Info_ID in( select top 1000 Info_ID from Info_Roles ) and
CHARINDEX('IMG',UPPER(content))>0
;
-- Same again, restricted to roles with Flag = 1.
SELECT
TOP 15 Info_ID,
',xiaole20180410SPLIT,',
content
FROM
Info_Content
WHERE
Info_ID IN (
SELECT
TOP 1000 Info_ID
FROM
Info_Roles
WHERE
Flag = 1
)
AND CHARINDEX('IMG', UPPER(content)) > 0;
-- Larger export; matches on '<IMG' (with the opening bracket) instead of any 'IMG'.
SELECT
TOP 200 Info_ID,
',xiaole20180410SPLIT,',
content
FROM
Info_Content
WHERE
Info_ID IN (
SELECT
TOP 90000 Info_ID
FROM
Info_Roles
)
AND CHARINDEX('<IMG', UPPER(content)) > 0;
'''
from bs4 import BeautifulSoup
from selenium import webdriver
# Marker that separates the id from the HTML content on the first line of
# each exported record.
xlsplit_str = ',xiaole20180410SPLIT,'
f_db_txt, uid_d = 'db.uid.para.txt', {}

# Build uid_d as {uid: {'html': [fragment, ...]}} from the dump file.
# A line containing the separator starts a new record; every other line is
# treated as a continuation of the current record's HTML.
uid = None
with open(f_db_txt, 'r', encoding='utf-8') as fr:
    for line in fr:
        line = line.replace('\t', '').replace('\n', '')
        if xlsplit_str in line:
            # maxsplit=1: everything after the first separator is content,
            # even if the HTML itself happens to contain the marker again.
            raw_uid, first_fragment = line.split(xlsplit_str, 1)
            uid = raw_uid.replace(' ', '')
            uid_d[uid] = {'html': [first_fragment]}
        elif uid is not None:
            uid_d[uid]['html'].append(line)
        # Otherwise no record has started yet (e.g. a header or blank line at
        # the top of the export), so there is nothing to attach the line to.
# Site-wide images appended to the image list of every kept record.
incr_l = ['http://www.51g3.net/templates/images/logo.jpg',
          'http://www.51g3.net/attached/image/20171206104541_20247.jpg',
          'http://www.51g3.net/attached/image/20171129183441_78749.png',
          'http://www.51g3.net/templates/images/agentimg.jpg']

# r_d collects only the records whose HTML contains at least one <img>.
# Each kept record (the same dict object as in uid_d) gains:
#   'img' - the src of every <img> that has one, followed by incr_l
#   'txt' - the document text with ASCII spaces removed
# The uid of every record without any <img> is printed instead.
r_d = {}
for uid, record in uid_d.items():
    # Parse the joined fragments directly; BeautifulSoup accepts a string, so
    # no scratch HTML file has to be written and read back per record.
    soup = BeautifulSoup(''.join(record['html']), 'html.parser')
    img_l = soup.find_all('img')
    if not img_l:
        print(uid)
        continue
    # Skip <img> tags without a src attribute instead of raising KeyError.
    record['img'] = [img['src'] for img in img_l if img.has_attr('src')] + incr_l
    # NOTE(review): the original chained three identical-looking
    # .replace(' ', '') calls; if they once targeted different space
    # characters (e.g. non-breaking or full-width), restore them here.
    record['txt'] = soup.text.replace(' ', '')
    r_d[uid] = record

浙公网安备 33010602011771号