对数据html文本的处理

对数据html文本的处理：
提取文字、图片、分句
'''
SELECT * FROM Info_Roles WHERE Flag=1 LIMIT 2;


 select   top   y   *   from   表   where   主键   not   in(select   top   (x-1)*y   主键   from   表)



  如果表中无主键,可以用临时表,加标识字段解决.这里的x,y可以用变量.

  select   id=identity(int,1,1),*     into   #tb   from   表
  select   *   from   #tb   where   id   between   (x-1)*y+1   and   x*y




 select   top   1000   Info_ID   from   Info_Roles
 select   top   2000   Info_ID,',xiaole20180410SPLIT,',content   from   Info_Content   where   Info_ID   not   in( select   top   1000   Info_ID   from   Info_Roles   )   ;
 select   top   399   Info_ID,',xiaole20180410SPLIT,',UPPER(content)   from   Info_Content      ;
 select   top   399   CHARINDEX('IMG',UPPER(content))   from   Info_Content      ;
 select   top   15   Info_ID,',xiaole20180410SPLIT,',content   from   Info_Content   where  CHARINDEX('IMG',UPPER(content))>0;
 select   top   15   Info_ID,',xiaole20180410SPLIT,',content   from   Info_Content   where
 Info_ID      in( select   top   1000   Info_ID   from   Info_Roles   )  and
  CHARINDEX('IMG',UPPER(content))>0
 ;



SELECT
	TOP 15 Info_ID,
	',xiaole20180410SPLIT,',
	content
FROM
	Info_Content
WHERE
	Info_ID IN (
		SELECT
			TOP 1000 Info_ID
		FROM
			Info_Roles
		WHERE
			Flag = 1
	)
AND CHARINDEX('IMG', UPPER(content)) > 0;





SELECT
	TOP 200 Info_ID,
	',xiaole20180410SPLIT,',
	content
FROM
	Info_Content
WHERE
	Info_ID IN (
		SELECT
			TOP 90000 Info_ID
		FROM
			Info_Roles
	)
AND CHARINDEX('<IMG', UPPER(content)) > 0;



'''

from bs4 import BeautifulSoup
from selenium import webdriver

# Marker that the SQL export places between the Info_ID column and the HTML
# content column of every exported row.
xlsplit_str = ',xiaole20180410SPLIT,'
f_db_txt, uid_d = 'db.uid.para.txt', {}

# Build uid_d as {uid: {'html': [fragment, ...]}} from the export file.
# A line that contains the marker opens a new record ("<uid><marker><html>");
# any other line is treated as a continuation of the most recent record,
# because the HTML content itself may span several physical lines.
uid = None
with open(f_db_txt, 'r', encoding='utf-8') as fr:
    for line in fr:
        line = line.replace('\t', '').replace('\n', '')
        if xlsplit_str in line:
            # partition() splits on the first marker only, so the rest of the
            # line is kept intact even if the marker text shows up again.
            raw_uid, _, first_fragment = line.partition(xlsplit_str)
            uid = raw_uid.replace(' ', '')
            uid_d[uid] = {'html': [first_fragment]}
        elif uid is not None:
            uid_d[uid]['html'].append(line)
        # Lines seen before the first record header (e.g. a header row or a
        # blank line) have no record to belong to; they are skipped instead of
        # failing with a NameError on the unbound uid.

# r_d collects the records that contain at least one <img> tag:
# {uid: {'html': [...], 'img': [src, ...], 'txt': stripped_text}}.
# Each r_d[uid] is the very same dict object as uid_d[uid].
r_d = {}

# Fixed image URLs appended to the image list of every record kept in r_d.
incr_l = ['http://www.51g3.net/templates/images/logo.jpg',
          'http://www.51g3.net/attached/image/20171206104541_20247.jpg',
          'http://www.51g3.net/attached/image/20171129183441_78749.png',
          'http://www.51g3.net/templates/images/agentimg.jpg']

# Scratch file that each record's HTML is written to before being parsed.
fhtml = 'qqzong.vedio.allinone.tmp.html'

for uid in uid_d:
    str_ = ''.join(uid_d[uid]['html'])
    with open(fhtml, 'w', encoding='utf-8') as fw:
        fw.write(str_)
    with open(fhtml, 'r', encoding='utf-8') as fo:
        soup = BeautifulSoup(fo, 'html.parser')

    img_l = soup.find_all('img')
    if not img_l:
        # No images in this record: report its uid and leave it out of r_d.
        print(uid)
        continue

    # Tag.get() returns None for an <img> without a src attribute; such tags
    # are skipped instead of raising KeyError and aborting the whole run.
    src_l = [img.get('src') for img in img_l]
    uid_d[uid]['img'] = [src for src in src_l if src is not None]
    uid_d[uid]['txt'] = soup.text.replace(' ','').replace('    ','').replace(' ','')
    r_d[uid] = uid_d[uid]
    # += extends the record's own list in place; incr_l itself is not modified.
    r_d[uid]['img'] += incr_l
posted @ 2018-04-10 16:23 papering 阅读(338) 评论(0) 收藏举报
刷新页面返回顶部
papering

对数据html文本 的处理

对数据html文本的处理