GISAID序列数据-验证码识别API

GISAID的验证码很烦人吗?

用下面这段“python代码+字符集”解决它(识别率95%)


API: CharOcr的Ocr(filename),对filename指定的png图片进行识别。

需要字符集文件夹“../charsPrecise”提供识别资源。

 

字符集文件“charsPrecise”要放在代码文件的同级目录;

代码底部有示例。

1.

import time
from threading import Thread, Lock
from skimage import io
import os
import PIL.Image as Image


#  二值化
def Towb(image):
    threshold = 100
    for i in range(len(image)):
        for j in range(len(image[i])):
            for k in range(len(image[i, j])):
                if image[i, j, k] < threshold:
                    image[i, j, k] = 0
                else:
                    image[i, j, k] = 255


#  裁剪到不留白
def cutData(data):
    xmin, ymin, xmax, ymax = 100000, 100000, -10000, -10000
    datax = len(data)
    datay = len(data[0])
    #  获取边界值
    for i in range(datax):
        for j in range(datay):
            if data[i, j, 0] == 0:
                #  黑色像素点
                if i < xmin:
                    xmin = i
                if i > xmax:
                    xmax = i
                if j < ymin:
                    ymin = j
                if j > ymax:
                    ymax = j
    #  裁剪
    data = data[xmin:xmax + 1, ymin:ymax + 1]
    return data


#  字母顺序先后规则
def CharOrder(char1, char2):
    rawList = ['2', '3', '4', '5', '6', '7', '8', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'k', 'm', 'n', 'p', 'r', 'w',
               'x', 'y']
    easyList = ['2', '3', '7', 'c', 'n', 'r', 'x', 'f']
    thirdEasyList = ['r']

    rightChar = str('rightChar')
    #  首先确保字符是两个列表中的
    if char1 not in rawList or char2 not in rawList:
        return '字符不在检查范围内!'

    def ifIn(char1, char2, lis):
        return char1 in lis and char2 not in lis

    def ise(c1, c2):
        if ifIn(c1, c2, thirdEasyList) or ifIn(c1, c2, easyList):
            return True

    if ise(char1, char2):
        rightChar = char2
    elif ise(char2, char1):
        rightChar = char1
    else:
        rightChar = char1

    return rightChar


def transChar(c):
    c = c[0]
    return c


class CharOcr:
    def __init__(self, charsdir='./charsPrecise'):
        self.charsdir = charsdir
        if not os.path.exists(self.charsdir):
            os.mkdir(self.charsdir)
        self.load()

    #  字符数组列表 [[name,array],]
    CharArrayList = list()

    #  储存字符图片
    def store(self, CharName, CharArray):
        imgToStore = Image.fromarray(CharArray)
        imgToStore.save(self.charsdir + CharName + '.png')

    #  加载指定名字的字符数组到CharArrayList
    def loadOne(self, CharName):
        CharArray = io.imread(self.charsdir + '/' + CharName + '.png')
        #  二值化数组
        Towb(CharArray)
        #  裁剪不留白
        CharArray = cutData(CharArray)
        self.CharArrayList.append([CharName, CharArray])

    #  加载目录下的所有图片为字符数组
    def load(self):
        for file in os.listdir(self.charsdir):
            filename = os.path.splitext(file)
            if filename[-1] == '.png':
                name = filename[0]
                self.loadOne(name)

    #   匹配一个字符,并描成黄色
    def findOneChar(self, TargetArray, CharArray):
        # plt.subplot(1,2,1)
        # plt.imshow(CharArray)
        answerList = list()
        data = CharArray
        datax = len(data)
        datay = len(data[0])

        #  对比两个相同大小数组中,thedata中是否包含data
        def oneCheck(thedata):
            #  允许的最大不同像素数
            maxWrong = 0
            counter = 0
            for i in range(datax):
                for j in range(datay):
                    if data[i, j, 0] == 0:
                        #  字符数据为黑色
                        if thedata[i, j, 0] == 255:
                            #  而目标数据为白色
                            if counter < maxWrong:
                                counter += 1
                            else:
                                return False
            return True

        # 对验证码图片特定部位进行扫描
        for ti in [23, 24, 30, 31]:
            # 第24、25、31、32
            for tj in range(11, 138 - datay + 1):
                #  12~138
                ifSame = oneCheck(TargetArray[ti:ti + datax, tj:tj + datay])
                if ifSame:
                    answerList.append((ti, tj))

        def onedraw(datatocheck, x, y):
            for i in range(datax):
                for j in range(datay):
                    if data[i, j, 0] == 0:
                        #  绘制成绿色
                        datatocheck[x + i, y + j, 0] = 0
                        datatocheck[x + i, y + j, 1] = 255
                        datatocheck[x + i, y + j, 2] = 0

        for cord in answerList:
            onedraw(TargetArray, cord[0], cord[1])

        return answerList

    #  找到所有可能的字符
    def findChar(self, TargetArray):
        #  结果字典{x:name,}
        AnswerDict = dict()
        lock = Lock()

        #  单个字符扫描
        def DoOneChar(Name, CharArray):
            answerList = self.findOneChar(TargetArray, CharArray)
            if len(answerList) != 0:
                for y, x in answerList:
                    #  如果有结果,将[name,x]加入theAnswer
                    lock.acquire()
                    if x in AnswerDict:
                        AnswerDict[x] = CharOrder(AnswerDict[x], Name[0])
                    else:
                        AnswerDict[x] = Name[0]
                    lock.release()

        #  每个字符都试一遍
        for Name, CharArray in self.CharArrayList:
            thread = Thread(target=DoOneChar, args=(Name, CharArray))
            thread.start()
            thread.join()

        #  结果横坐标储存列表
        xList = list()

        #  横坐标计算列表
        ansList = list()
        for x in AnswerDict:
            ansList.append(x)
        ansList.sort()

        #  最终字符
        answer = ''

        def getFinalStr(maxSep=15):
            answer = str('')
            #  计算正确排序
            for i in range(len(ansList)):
                # print('xList:'+str(xList))
                if i == 0:
                    #  第一个识别出来的元素,默认为正常识别
                    answer += AnswerDict[ansList[0]]
                    xList.append(ansList[0])
                elif ansList[i] - xList[-1] > maxSep:
                    #  如果和前一个字符距离大于定值,为正常识别,加入最终字符串
                    answer += AnswerDict[ansList[i]]
                    xList.append(ansList[i])
                else:
                    #  和前一个字符距离小于定值,算法解决
                    char1 = AnswerDict[ansList[i]]
                    char2 = AnswerDict[xList[-1]]
                    finalchar = CharOrder(char2, char1)
                    if finalchar == char1 and char1 != char2:
                        #  新字符是正确的
                        xList[-1] = ansList[i]
                        answer = answer[:-1]
                        answer += char1
            return answer

        if len(AnswerDict) <= 5:
            print('len<5')
            answer = getFinalStr()
        else:
            maxSep = 15
            for i in range(3):
                answer = getFinalStr(maxSep)
                length = len(answer)
                if length == 5:
                    break
                elif length > 5:
                    maxSep += 1
                    answer = getFinalStr(maxSep)
                else:
                    maxSep -= 1
                    answer = getFinalStr(maxSep)

        return answer

    #  识别图片中的字符
    def Ocr(self, TargetFileName):
        TargetArray = io.imread(TargetFileName)
        #  二值化目标图片
        Towb(TargetArray)
        #  裁剪目标图片到不留白
        # TargetArray = cutData(TargetArray)决定不裁剪
        answer = self.findChar(TargetArray)
        return answer


if __name__ == '__main__':
    #  测试
    ocrr = CharOcr()
    t1 = time.time()
    ans = ocrr.Ocr('test4.png')
    t2 = time.time()
    print('用时:', t2 - t1, 's')
    print(ans)
View Code

2.

字符数据文档压缩包地址:https://files.cnblogs.com/files/blogs/670686/charsPrecise.zip

 

原理是像素匹配;

优化了逻辑,一张图片1秒不到就能识别出来。

字符集是我自己用PIL等包加tkinter逐个提取的,忙了好几天。

缺点是字符集提取得还是不够好,导致有"r, n"等的图片可能识别失败(剩下5%的错误率就是这么来的),但已经够用了。

 

更多资源请访问我的主页,还有gisaid fasta文件、病例数据爬虫。

 

posted @ 2021-11-10 12:53  小鱼圆又圆  阅读(320)  评论(0)    收藏  举报