# 赛题

A. 数据规模和内容覆盖

B.数据内容示例：

json 文件内容规范：

{

"image1": "陆万捌千零贰拾伍元整",

"image2": "付经管院工资",

"image3": ""

}

# 具体方案

## 构建数据集

import os
import json

# Official competition data sets.
image_path_amount = "./data/train/amount/images"
image_path_date = "./data/train/date/images"
# Augmented (GAN-synthesized) data sets.
image_path_test = './data/gan_test_15000/images/0'
image_path_train = './data/gan_train_15500_0/images/0'

# Bug fix: the original listed image_path_amount twice and built
# new_test_list / new_train_list from the *amount* directory instead of
# the augmented-data directories.
amount_list = os.listdir(image_path_amount)
new_amount_list = [image_path_amount + "/" + filename for filename in amount_list]

date_list = os.listdir(image_path_date)
new_date_list = [image_path_date + "/" + filename for filename in date_list]

gan_test_files = os.listdir(image_path_test)
new_test_list = [image_path_test + "/" + filename for filename in gan_test_files]

gan_train_files = os.listdir(image_path_train)
new_train_list = [image_path_train + "/" + filename for filename in gan_train_files]

image_path_amount和image_path_date是官方给定的数据集路径。

image_path_test和image_path_train是增强的数据集（在后面会讲如何做增强）

amount_json = "./data/train/amount/gt.json"
date_json = "./data/train/date/gt.json"
train_json = "train_data.json"
test_json = "test_data.json"

# Merge every ground-truth file (filename -> label text) into one dict.
# Bug fix: the original `with open(...)` statements had empty bodies, so
# all_dic was never populated and every lookup below raised KeyError.
all_dic = {}
for gt_path in (amount_json, date_json, train_json, test_json):
    with open(gt_path, "r", encoding='utf-8') as f:
        all_dic.update(json.load(f))

# Aggregate every image path (official + augmented).
all_list = new_amount_list + new_date_list + new_test_list + new_train_list
from sklearn.model_selection import train_test_split
# Split into training and validation sets (85% / 15%, fixed seed).
train_list, test_list = train_test_split(all_list, test_size=0.15, random_state=42)

# Write "<image_path> <label>" per line.
# Bug fix: the original wrote a literal letter "n" instead of "\n",
# which would have produced a single giant line.
with open('train.txt', 'w') as f:
    for line in train_list:
        f.write(line + " " + all_dic[line.split('/')[-1]] + "\n")
with open('val.txt', 'w') as f:
    for line in test_list:
        f.write(line + " " + all_dic[line.split('/')[-1]] + "\n")

# 获取class

import json

amount_json = "./data/train/amount/gt.json"
date_json = "./data/train/date/gt.json"

# Bug fix: actually load both label files — the original `with open(...)`
# statements had empty bodies, so all_dic stayed empty and the charset
# file came out blank.
all_dic = {}
with open(amount_json, "r", encoding='utf-8') as f:
    all_dic.update(json.load(f))
with open(date_json, "r", encoding='utf-8') as f:
    all_dic.update(json.load(f))

# Collect every distinct character appearing in any label, in first-seen order.
list_key = []
for keyline in all_dic.values():
    for key in keyline:
        if key not in list_key:
            list_key.append(key)

# One character per line.
# Bug fix: "\n" instead of the literal letter "n".
with open('data/char_std_5990.txt', 'w', encoding='utf-8') as f:
    for line in list_key:
        f.write(line + "\n")

# 改进模型

crnn的卷积部分类似VGG，我对模型的改进主要有以下几个方面：

1、加入激活函数Swish。

2、加入BatchNorm。

3、加入SE注意力机制。

4、适当加深模型。

# CRNN convolutional backbone (VGG-style) with Swish activations,
# BatchNorm after every conv, and SE channel-attention blocks.
# Inline shape comments give channels x H x W after each stage.
# NOTE(review): the shapes appear to assume a 32-pixel-high input
# (imgH=32 elsewhere in this write-up) — confirm against the caller.
self.cnn = nn.Sequential(
nn.Conv2d(nc, 64, 3, 1, 1), Swish(), nn.BatchNorm2d(64),
nn.MaxPool2d(2, 2),  # 64x16x50
nn.Conv2d(64, 128, 3, 1, 1), Swish(), nn.BatchNorm2d(128),
nn.MaxPool2d(2, 2),  # 128x8x25
nn.Conv2d(128, 256, 3, 1, 1), nn.BatchNorm2d(256), Swish(),  # 256x8x25
nn.Conv2d(256, 256, 3, 1, 1), nn.BatchNorm2d(256), Swish(),  # 256x8x25
SELayer(256, 16),
# Asymmetric pooling: halve height, keep width (pad W by 1) so the
# horizontal resolution survives for sequence decoding.
nn.MaxPool2d((2, 2), (2, 1), (0, 1)),  # 256x4x25
nn.Conv2d(256, 512, 3, 1, 1), nn.BatchNorm2d(512), Swish(),  # 512x4x25
nn.Conv2d(512, 512, 1), nn.BatchNorm2d(512), Swish(),
nn.Conv2d(512, 512, 3, 1, 1), nn.BatchNorm2d(512), Swish(),  # 512x4x25
SELayer(512, 16),
nn.MaxPool2d((2, 2), (2, 1), (0, 1)),  # 512x2x25
# Final 2x2 conv with no padding collapses the height to 1.
nn.Conv2d(512, 512, 2, 1, 0), nn.BatchNorm2d(512), Swish())  # 512x1x25

SE和Swish

class SELayer(nn.Module):
    """Squeeze-and-Excitation channel attention.

    Squeezes each channel to a scalar by global average pooling, passes it
    through a two-layer bottleneck MLP ending in a sigmoid, and rescales the
    input feature map channel-wise by the resulting gate in (0, 1).

    Args:
        channel: number of input (and output) channels.
        reduction: bottleneck reduction ratio for the excitation MLP.
    """

    def __init__(self, channel, reduction=16):
        super(SELayer, self).__init__()
        # Bug fix: self.avg_pool was used in forward() but never defined,
        # which raised AttributeError on the first forward pass.
        self.avg_pool = nn.AdaptiveAvgPool2d(1)
        self.fc = nn.Sequential(
            nn.Linear(channel, channel // reduction, bias=True),
            nn.LeakyReLU(inplace=True),
            nn.Linear(channel // reduction, channel, bias=True),
            nn.Sigmoid(),
        )

    def forward(self, x):
        b, c, _, _ = x.size()
        # Squeeze: (b, c, h, w) -> (b, c).
        y = self.avg_pool(x).view(b, c)
        # Excite: per-channel gate in (0, 1), broadcast back to (b, c, h, w).
        y = self.fc(y).view(b, c, 1, 1)
        return x * y.expand_as(x)

class Swish(nn.Module):
    """Swish activation: f(x) = x * sigmoid(x)."""

    def forward(self, x):
        gate = torch.sigmoid(x)
        return gate * x

## 训练

# Command-line hyper-parameters for training the CRNN/attention model.
parser = argparse.ArgumentParser()
parser.add_argument('--batchSize', type=int, default=4, help='input batch size')
parser.add_argument('--imgH', type=int, default=32, help='the height of the input image to network')
parser.add_argument('--imgW', type=int, default=512, help='the width of the input image to network')
parser.add_argument('--nh', type=int, default=512, help='size of the lstm hidden state')
parser.add_argument('--niter', type=int, default=300, help='number of epochs to train for')
parser.add_argument('--lr', type=float, default=0.00005, help='learning rate for Critic, default=0.00005')
parser.add_argument('--ngpu', type=int, default=1, help='number of GPUs to use')
# Empty default means "train from scratch"; pass a checkpoint path to resume.
parser.add_argument('--encoder', type=str, default='', help="path to encoder (to continue training)")
parser.add_argument('--decoder', type=str, default='', help='path to decoder (to continue training)')
parser.add_argument('--experiment', default='./expr/attentioncnn', help='Where to store samples and models')
parser.add_argument('--displayInterval', type=int, default=100, help='Interval to be displayed')
parser.add_argument('--valInterval', type=int, default=1, help='Interval to be displayed')
parser.add_argument('--saveInterval', type=int, default=1, help='Interval to be displayed')
# NOTE(review): default=True combined with action='store_true' means these
# two flags can never be turned off from the command line — they are
# effectively always True. Intentional per the write-up, but worth knowing.
parser.add_argument('--keep_ratio',default=True, action='store_true', help='whether to keep ratio for image resize')
parser.add_argument('--random_sample', default=True, action='store_true', help='whether to sample the dataset with random sampler')
parser.add_argument('--teaching_forcing_prob', type=float, default=0.5, help='where to use teach forcing')
parser.add_argument('--max_width', type=int, default=129, help='the width of the featuremap out from cnn')
opt = parser.parse_args()

trainlist：训练集，默认是train.txt。

vallist：验证集路径，默认是val.txt。

batchSize：批大小，根据显存大小设置。

imgH：图片的高度，crnn模型默认为32，这里不需要修改。

imgW：图片宽度，我在这里设置为512。

keep_ratio：设置为True，设置为True后，程序会保持图片的比率，然后在一个batch内统一尺寸，这样训练的模型精度更高。

lr：学习率，设置为0.00005，这里要注意，不要太大，否则不收敛。

## 推理

import os
import json

# Official training data (used here to inspect label statistics).
image_path_amount = "./data/train/amount/images"
image_path_date = "./data/train/date/images"

amount_list = os.listdir(image_path_amount)
new_amount_list = [image_path_amount + "/" + filename for filename in amount_list]

date_list = os.listdir(image_path_date)
new_date_list = [image_path_date + "/" + filename for filename in date_list]

amount_json = "./data/train/amount/gt.json"
date_json = "./data/train/date/gt.json"

# Bug fix: load both label files — the original `with open(...)` statements
# had empty bodies, so all_dic stayed empty and maxLen was always 0.
all_dic = {}
with open(amount_json, "r", encoding='utf-8') as f:
    all_dic.update(json.load(f))
with open(date_json, "r", encoding='utf-8') as f:
    all_dic.update(json.load(f))

all_list = new_amount_list + new_date_list
from sklearn.model_selection import train_test_split

# Longest label length — used to size the decoder's max output sequence.
maxLen = 0
for label in all_dic.values():
    if len(label) > maxLen:
        maxLen = len(label)
print(maxLen)

import os
import glob

encoder_path = './expr/attentioncnn/encoder_22.pth'
decoder_path = './expr/attentioncnn/decoder_22.pth'

# Bug fix: count and result_dict were used below but never initialized,
# which raised NameError on the first iteration.
count = 0          # number of low-confidence (< 0.8) predictions
result_dict = {}   # image filename -> {'result': text, 'confidence': prob}

# NOTE(review): the original snippet's indentation was flattened; this
# assumes every image is recorded in result_dict and only low-confidence
# ones are counted — confirm against the original script.
for path in tqdm(glob.glob('./data/测试集/date/images/*.jpg')):
    text, prob = test(path)
    if prob < 0.8:
        count += 1
    result_dict[os.path.basename(path)] = {
        'result': text,
        'confidence': prob
    }

for path in tqdm(glob.glob('./data/测试集/amount/images/*.jpg')):
    text, prob = test(path)
    if prob < 0.8:
        count += 1
    result_dict[os.path.basename(path)] = {
        'result': text,
        'confidence': prob
    }

# 数据增强

## 一、工具简介

Style-Text数据合成工具是基于百度和华科合作研发的文本编辑算法《Editing Text in the Wild》https://arxiv.org/abs/1908.03047

## 二、环境配置

2. 进入StyleText目录，下载模型，并解压：
cd StyleText
unzip style_text_models.zip

bg_generator:
pretrain: style_text_models/bg_generator
...
text_generator:
pretrain: style_text_models/text_generator
...
fusion_generator:
pretrain: style_text_models/fusion_generator

## 合成单张图

python3 tools/synth_image.py -c configs/config.yml --style_image examples/style_images/2.jpg --text_corpus PaddleOCR --language en
• 注1：语言选项和语料相对应，目前支持英文(en)、简体中文(ch)和韩语(ko)。
• 注2：如果输入图像尺寸相差过多，合成效果可能不佳。
• 注3：可以通过修改配置文件configs/config.yml中的use_gpu(true或者false)参数来决定是否使用GPU进行预测。

fake_text.jpg：是用提供的字符串，仿照风格参考图中文字的风格，生成在灰色背景上的文字图片。

### 批量合成

1. configs/dataset_config.yml中配置目标场景风格图像和语料的路径，具体如下：

• Global

• output_dir:：保存合成数据的目录。
• StyleSampler

• image_home：风格图片目录；
• label_file：风格图片路径列表文件，如果所用数据集有label，则label_file为label文件路径；
• with_label：标志label_file是否为label文件。
• CorpusGenerator

• method：语料生成方法，目前有FileCorpus和EnNumCorpus可选。如果使用EnNumCorpus，则不需要填写其他配置，否则需要修改corpus_file和language；
• language：语料的语种，目前支持英文(en)、简体中文(ch)和韩语(ko)；
• corpus_file: 语料文件路径。语料文件应使用文本文件。语料生成器首先会将语料按行切分，之后每次随机选取一行。