python图片下载&本地图片对比

  管理系统中部分用户头像显示为“未识别图像”:CDN 路径下的图片文件本身可以正常访问,但其中较多是大小约 5 KB 的小图片,现需找出这些图片对应的用户。

  注:本博只是为避免将来重复造轮子,不做额外赘述

  思路

  下载 id - img 一一对应的图片到本地

  通过 OpenCV 将图片文件读取为 Mat 矩阵,转为灰度图并计算 dhash;发现“未识别图像”之间的 dhash 汉明距离不超过 1(代码中以 `< 2` 判断)

  故,基于dhash完成图片对应用户id的筛选即可

  图片爬取

  页面爬取

#! /usr/bin/env python
# -*- coding:utf-8 -*-
# __author__ = "NYA"

import urllib
import urllib.request
from bs4 import BeautifulSoup
import sys
import chardet


def download(url, path):
    """Download every image listed in the HTML table served at *url*.

    The page is fetched, decoded with the encoding guessed by chardet and
    parsed with BeautifulSoup.  The first row matched by ``table > tr`` is
    skipped as a header; for each remaining row the text of the first cell
    is used as the local file name and the text of the fifth cell as the
    image URL.

    Args:
        url: Address of the page that contains the table.
        path: Prefix prepended verbatim to each file name, so it must
            already end with a path separator.
    """
    req = urllib.request.Request(url)
    # Close the HTTP response deterministically instead of leaking it.
    with urllib.request.urlopen(req) as response:
        content = response.read()

    # chardet returns {'encoding': None} when it cannot guess, so a plain
    # dict default is not enough; fall back to UTF-8 explicitly.
    encoding = chardet.detect(content).get('encoding') or 'utf-8'
    # BeautifulSoup accepts str directly; re-encoding to the filesystem
    # encoding could raise UnicodeEncodeError and gains nothing.
    html = content.decode(encoding, 'ignore')

    soup = BeautifulSoup(html, 'html.parser')
    table_tr = soup.select('table > tr')
    print(len(table_tr))

    # Skip the header row.
    for row in table_tr[1:]:
        cells = row.findAll('td')
        if len(cells) < 5:
            print('skip row without image url column')
            continue
        name = cells[0].getText().strip()
        image_url = cells[4].getText().strip()
        if not name or not image_url:
            print('skip row with empty name or url')
            continue
        local_path = path + name
        print(image_url)
        urllib.request.urlretrieve(image_url, local_path)

# Page that lists the images ('aaa' is presumably a placeholder for the
# real address -- replace before running).
url = 'aaa'
# Every backslash is escaped: a bare '\i' is an invalid escape sequence and
# triggers a SyntaxWarning on current Python versions.  The runtime value
# is unchanged.
path = 'G:\\images\\used\\'
download(url, path)

  csv爬取

#!/usr/bin/python3
# -*- coding:utf8 -*-

import imghdr
import urllib.request

# Download every image referenced in a CSV file and record the ones that
# are not recognised as an image.
#
# Each data line is split on ','; column 0 is used as the file name and
# column 2 as the image URL.  The first line is treated as a header.
path = 'G:\\images\\used\\'
csv_path = 'G:\\images\\robot1.csv'
error_log_path = 'G:\\images\\check_error.txt'

data = []
error_images = []
counter = 0
# The CSV and the error log use separate handles.  Previously both were
# bound to the same name, so the log handle leaked and every log write went
# to the read-only CSV handle, failed, and was swallowed by a bare except.
with open(csv_path, 'r', encoding="utf-8") as csv_file, \
        open(error_log_path, 'w', encoding="utf-8") as error_log:
    header = csv_file.readline().split(',')  # consume the header line
    for line in csv_file:
        try:
            fields = line.split(",")
            name = fields[0].strip()
            # Strip before use: a trailing newline must not reach urlretrieve.
            url = fields[2].strip()
            local_path = path + name + '.jpg'
            print(url)
            print(local_path)
            urllib.request.urlretrieve(url, local_path)
            # imghdr.what() returns None when the file type is not recognised.
            # NOTE(review): imghdr is deprecated (removed in Python 3.13).
            check = imghdr.what(local_path)
            if check is None:
                error_log.write(','.join((name, url, local_path)) + '\n')
                error_images.append(url)
        except Exception as exc:
            # Best effort: report the failing line and keep going.  Unlike a
            # bare ``except:`` this still lets Ctrl-C stop the run.
            print(line)
            print(exc)
        print(counter)
        counter = counter + 1
print(counter)
print(error_images)

  dhash筛选

/**
 * Scans a directory of downloaded images and writes out the numeric file
 * names whose dhash is within Hamming distance 1 of a reference dhash.
 */
public class ImageCheck {

    static {
        // The OpenCV native library must be loaded before any Mat/Imgcodecs call.
        System.loadLibrary(Core.NATIVE_LIBRARY_NAME);
    }

    /** Directory holding the downloaded images. */
    private static final String IMAGE_DIR = "G:\\images\\used";

    /** File that receives the matching ids, one per line, in ascending order. */
    private static final String RESULT_FILE = "G:\\images\\error1.txt";

    /**
     * Reference dhash every image is compared against.
     * NOTE(review): presumably measured from a sample "unrecognized image" -- confirm.
     */
    private static final Integer REFERENCE_DHASH = 268423200;

    /**
     * Entry point.
     *
     * @param args unused
     * @throws IOException if the image directory cannot be listed or the
     *                     result file cannot be written
     */
    public static void main(String[] args) throws IOException {
        File[] entries = new File(IMAGE_DIR).listFiles();
        // listFiles() returns null when the path is not a readable directory.
        if (entries == null) {
            throw new IOException("Cannot list directory: " + IMAGE_DIR);
        }

        List<Integer> res = new ArrayList<>();
        for (File entry : entries) {
            if (!entry.isFile()) {
                continue;
            }
            // getName() avoids splitting the path on a Windows-only separator.
            String now = entry.getName().replace(".jpg", "");
            System.out.println(now);
            Mat mat = Imgcodecs.imread(entry.getPath());
            try {
                // imread() yields an empty Mat when the file cannot be decoded.
                if (mat.empty()) {
                    System.out.println("Cannot read image: " + entry.getPath());
                    continue;
                }
                Integer dhashGRAY28 = DhashDetector.getDhashGRAY28(mat);
                if (DhashDetector.calcHammingDistance(REFERENCE_DHASH, dhashGRAY28) < 2) {
                    // Non-numeric file names fail here and are reported below.
                    res.add(Integer.parseInt(now));
                }
            } catch (Exception e) {
                // Best effort: report the problem and continue with the next file.
                System.out.println(e);
            } finally {
                mat.release();
            }
        }

        res.sort(Integer::compareTo);
        System.out.println(res);

        // try-with-resources closes (and flushes) the writer even on failure.
        try (BufferedWriter writer = new BufferedWriter(new FileWriter(RESULT_FILE))) {
            for (Integer id : res) {
                writer.write(String.valueOf(id));
                writer.newLine();
            }
        }
    }

}

 

posted @ 2019-05-06 17:26  来兮子宁  阅读(589)  评论(0)  编辑  收藏  举报