利用神经网络识别12306验证码——（二）收集文字验证码图片

原创已于 2024-05-18 17:40:46 修改 · 807 阅读

CC 4.0 BY-SA版权

文章标签：

于 2021-03-28 21:50:52 首次发布

惊喜.jpg——给大家分享一份我整理的粗糙的数据集
网盘链接
提取码：n51k
这部分数据样本有点少，拿去训练的话准确率比较低，只能达到50%左右。
12306验证码图片包含两部分：一部分是上边的文字，一部分是下面的图片（见下图）。
在这里插入图片描述
对于文字部分，可以直接从上面的验证码图片抠出来。所以，先爬个几千张验证码图片，程序如下。

import requests
import base64
import time
import random
import os

# Download 5000 captcha images from 12306 into `path`, one numbered JPEG each.
path = 'D:/captcha/'
# exist_ok lets the script be re-run without a FileExistsError.
os.makedirs(path, exist_ok=True)
# Random pauses (seconds) between requests so the server is not hammered.
sleep_lst = [0.5, 0.8, 0.7, 0.6, 1, 1.5, 2]
# Direct endpoint; the previous literal carried a mirror/proxy host prefix
# in front of the real 12306 address.
cap_url = 'https://kyfw.12306.cn/passport/captcha/captcha-image64'
for i in range(1, 5001):
    try:
        # A timeout keeps one stalled connection from hanging the whole run.
        response = requests.get(cap_url, timeout=10)
        if response.status_code == 200:
            payload = response.json()
            # The 'image' field holds the base64-encoded picture bytes.
            img = base64.b64decode(payload['image'])
            picture_name = str(i) + '.jpg'
            with open(os.path.join(path, picture_name), 'wb') as f:
                f.write(img)
    except (requests.RequestException, ValueError, KeyError) as err:
        # Skip this index instead of aborting the remaining downloads.
        # ValueError covers malformed JSON and malformed base64.
        print('下载失败：', i, err)
    time.sleep(random.choice(sleep_lst))

我这里也给大家分享一份之前下载的9000多张验证码。不过还是建议先写程序试试。
12306验证码网盘链接
提取码：n52k
图片下载好之后，就可以开始抠文字了。文字部分有两种情况：1、一个词；2、两个词。可以根据文字区域的长度来判断是哪种情况，程序如下。

from PIL import Image
import numpy as np
import os
#import shutil

def judge_image_bk(raw_image):
    """Decide how many words the captcha's text strip contains.

    The region (118, 0)-(230, 28) is cropped, converted to palette mode
    ('P'), and the mean of rows 24..27 of the resulting index array is
    compared against 200.

    Args:
        raw_image: A file path (opened with ``Image.open``) or an
            already-opened image object that provides ``crop``.

    Returns:
        1 when the mean is above 200, 2 otherwise, or None when the
        region cannot be cropped.
    """
    if isinstance(raw_image, str):
        raw_image = Image.open(raw_image)
    try:
        image = raw_image.crop((118, 0, 230, 28))
    except Exception:
        # Best-effort: an uncroppable input is reported as "unknown" (None).
        # Unlike a bare ``except``, this lets KeyboardInterrupt/SystemExit
        # propagate.
        return None
    image_array = np.asarray(image.convert('P'))

    # Only the bottom four rows of the 28-row strip are inspected.
    bottom_rows = image_array[24:28]
    return 1 if np.mean(bottom_rows) > 200 else 2

def split_image_text(raw_image, image_shape, name, save_dir, mode=1):
    """Crop the word(s) out of a captcha's text strip and save them as JPEGs.

    The strip (118, 0)-(230, 28) is cropped first. Column means over rows
    6..21 are then thresholded to locate the horizontal extent of each word:

    * ``mode == 1``: RGB values are averaged; columns with mean < 240 count
      as text, and a single span from the first to the last such column is
      produced.
    * ``mode == 2``: the strip is converted to palette mode; columns with
      mean < 190 count as text, and every run of consecutive text columns
      wider than 10 yields one span.

    Any other ``mode`` produces no spans, so nothing is saved.

    Args:
        raw_image: A file path (opened with ``Image.open``) or an opened image.
        image_shape: Target size passed to ``resize`` for every cropped word.
        name: Sequence of file-name stems, one per expected word. Items are
            converted with ``str()``, so ints and strings are both accepted.
        save_dir: Directory, relative to the current working directory,
            that receives the ``<stem>.jpg`` files.
        mode: 1 for a single word, 2 for two words.

    Raises:
        IndexError: if no column falls below the threshold, or if more
            words are found than ``name`` has entries.
    """
    if isinstance(raw_image, str):
        raw_image = Image.open(raw_image)

    image = raw_image.crop((118, 0, 230, 28))
    res = []
    if mode == 1:
        image_array = np.asarray(image)
        image_array = image_array[6:22]
        # Average the colour channels, then the rows: one value per column.
        image_array = np.mean(image_array, axis=2)
        image_array = np.mean(image_array, axis=0)
        image_array = np.reshape(image_array, [-1])
        indices = np.where(image_array < 240)
        res.append([indices[0][0], indices[0][-1]])

    if mode == 2:
        image_p = image.convert('P')
        image_array = np.asarray(image_p)
        image_array = image_array[6:22]
        image_array = np.mean(image_array, axis=0)
        avg_image = np.reshape(image_array, [-1])
        indices = np.where(avg_image < 190)
        # Walk the text columns and group consecutive indices into runs.
        start = indices[0][0] - 1
        end = indices[0][0] - 1
        for i in indices[0]:
            if i == end + 1:
                end = i
            else:
                # A gap closes the current run; keep it only if wide enough.
                if end - start > 10:
                    res.append([start + 1, end])
                start = i
                end = i
        if end - start > 10:
            res.append([start + 1, end])
    text = [image.crop((x1, 0, x2, 28)).resize(image_shape) for x1, x2 in res]

    for i, piece in enumerate(text):
        # str() is required: callers pass integer counters as names, and
        # int + '.jpg' raises TypeError.
        piece.save(os.path.join(os.getcwd(), save_dir, str(name[i]) + '.jpg'))
    
# Crop the word image(s) out of every downloaded captcha and number them.
save_dir = 'text_captcha'  # folder that receives the cropped word images
# exist_ok lets the script be re-run without a FileExistsError.
os.makedirs(os.path.join(os.getcwd(), save_dir), exist_ok=True)

items = os.listdir('D:/captcha')  # the captcha images downloaded earlier
count = 0
for item in items:
    count += 1
    try:
        mode = judge_image_bk('D:/captcha/' + item)
        print('当前图片词数量：', mode)
        if mode is None:
            # The text strip could not be cropped; do not consume a second
            # counter for an image that yields no output.
            print('图片错误：', item)
            continue
        # File-name stems are passed as strings: one for a single word,
        # two consecutive numbers for a two-word captcha.
        name = [str(count)] if mode == 1 else [str(count), str(count + 1)]
        count = count if mode == 1 else count + 1
        split_image_text('D:/captcha/' + item, (64, 64), name, save_dir, mode)
    except Exception as err:
        # Keep going on a bad image, but show what actually went wrong.
        print('图片错误：', item, err)

不出意外的话，文字图片已经抠出来了。下图是我标注过的部分文字图片。
在这里插入图片描述
另外，当文本是两个词的时候，图片底色是灰色（比如下面的验证码图片），和文本的颜色十分接近。这时可以采用二值化的方法去除图片底色，不过因为底色和字体颜色很接近，二值化的效果也不是很好。下面介绍两种方法。
在这里插入图片描述
第一种，利用cv2的自适应二值化，代码如下

# This fragment continues the body of split_image_text: it relies on that
# function's `mode`, `res`, `image` and `image_shape`, and needs `import cv2`.
if mode == 2:
    text = []
    for x1, x2 in res:
        img = image.crop((x1, 0, x2, 28)).resize(image_shape)
        # Convert the cropped word to a grayscale array for thresholding.
        img = cv2.cvtColor(np.asarray(img), cv2.COLOR_RGB2GRAY)
        # Gaussian adaptive threshold with block size 25 and constant 6.
        binary = cv2.adaptiveThreshold(img, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 25, 6)
        text.append(binary)
        # To turn the result back into a PIL image:
        #     Image.fromarray(binary)
        # NOTE(review): the original hint wrapped `binary` in
        # cv2.cvtColor(..., cv2.COLOR_BGR2RGB), which expects a 3-channel
        # input while `binary` is single-channel. TODO confirm.

效果如下
在这里插入图片描述

可以看到，二值化后有不少噪声，下面通过8邻域降噪的方法对上面二值化后的文本图片进行降噪。

# Reference: https://www.cnblogs.com/xuchunlin/p/9234455.html
def clearNosie(image, N, z):
    """Denoise a binarized image with an 8-neighbourhood filter, in place.

    For every interior pixel, the number of its eight neighbours holding the
    same value is counted. A pixel with fewer than ``N`` matching neighbours
    is treated as noise and set to white (255). Pixels are updated as the
    scan proceeds, so earlier changes influence later counts.

    Args:
        image: 2-D array of the binarized image; it is modified in place.
        N: Threshold on the number of equal-valued neighbours.
        z: Number of denoising passes.

    Returns:
        The same ``image`` object after denoising.
    """
    rows, cols = image.shape
    # The top and bottom rows are unconditionally painted white.
    image[0, :] = 255
    image[rows - 1, :] = 255

    # Relative positions of the eight surrounding pixels.
    neighbour_offsets = (
        (-1, -1), (-1, 0), (-1, 1),
        (0, -1), (0, 1),
        (1, -1), (1, 0), (1, 1),
    )
    for _ in range(z):
        for r in range(1, rows - 1):
            for c in range(1, cols - 1):
                centre = image[r, c]
                matches = sum(
                    1
                    for dr, dc in neighbour_offsets
                    if image[r + dr, c + dc] == centre
                )
                if matches < N:
                    image[r, c] = 255
    return image

当N = 4，z = 2，效果如下
在这里插入图片描述

第二种，先对灰底文本图片进行直方图均衡，进行图像增强，然后再将其二值化，代码如下

import cv2
# Enhance a gray-background word image with CLAHE, then binarize it.
img_path = 'D:\\python\\ant.jpg'
img = cv2.imread(img_path)
# cv2.imread returns None instead of raising when the file is missing or
# cannot be decoded; fail here with a clear message rather than in cvtColor.
if img is None:
    raise FileNotFoundError(img_path)
gray_img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
# Adaptive histogram equalization (CLAHE).
clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
cl1 = clahe.apply(gray_img)
# Pixels with a gray value above 100 become 255, the rest become 0.
# A threshold of 100 gave the best result in the author's trials.
ret, binary = cv2.threshold(cl1, 100, 255, cv2.THRESH_BINARY)