diff --git a/dataset_zoo/sroie/metafile.yml b/dataset_zoo/sroie/metafile.yml new file mode 100644 index 000000000..804530eb3 --- /dev/null +++ b/dataset_zoo/sroie/metafile.yml @@ -0,0 +1,31 @@ +Name: 'Scanned Receipts OCR and Information Extraction' +Paper: + Title: ICDAR2019 Competition on Scanned Receipt OCR and Information Extraction + URL: https://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=8977955 + Venue: ICDAR + Year: '2019' + BibTeX: '@INPROCEEDINGS{8977955, + author={Huang, Zheng and Chen, Kai and He, Jianhua and Bai, Xiang and Karatzas, Dimosthenis and Lu, Shijian and Jawahar, C. V.}, + booktitle={2019 International Conference on Document Analysis and Recognition (ICDAR)}, + title={ICDAR2019 Competition on Scanned Receipt OCR and Information Extraction}, + year={2019}, + volume={}, + number={}, + pages={1516-1520}, + doi={10.1109/ICDAR.2019.00244}}' +Data: + Website: https://rrc.cvc.uab.es/?ch=13 + Language: + - English + Scene: + - Document + Granularity: + - Word + Tasks: + - textdet + - textrecog + - textspotting + License: + Type: CC BY 4.0 + Link: https://creativecommons.org/licenses/by/4.0/ + Format: .txt diff --git a/dataset_zoo/sroie/sample_anno.md b/dataset_zoo/sroie/sample_anno.md new file mode 100644 index 000000000..86efab568 --- /dev/null +++ b/dataset_zoo/sroie/sample_anno.md @@ -0,0 +1,9 @@ +**Text Detection, Text Recognition and Text Spotting** + +```text +# x1,y1,x2,y2,x3,y3,x4,y4,trans + +72,25,326,25,326,64,72,64,TAN WOON YANN +50,82,440,82,440,121,50,121,BOOK TA .K(TAMAN DAYA) SDN BND +205,121,285,121,285,139,205,139,789417-W +``` diff --git a/dataset_zoo/sroie/textdet.py b/dataset_zoo/sroie/textdet.py new file mode 100644 index 000000000..78bb399a4 --- /dev/null +++ b/dataset_zoo/sroie/textdet.py @@ -0,0 +1,55 @@ +data_root = 'data/sroie' +cache_path = 'data/cache' + +data_obtainer = dict( + 
type='NaiveDataObtainer', + cache_path=cache_path, + data_root=data_root, + files=[ + dict( + url='https://download.openmmlab.com/mmocr/data/' + 'sroie/0325updated.task1train(626p).zip', + save_name='0325updated.task1train(626p).zip', + md5='16137490f6865caac75772b9111d348c', + split=['train'], + content=['image', 'annotation'], + mapping=[[ + '0325updated/0325updated.task1train(626p)/*.jpg', + 'textdet_imgs/train' + ], + [ + '0325updated/0325updated.task1train(626p)/*.txt', + 'annotations/train' + ]]), + dict( + url='https://download.openmmlab.com/mmocr/data/' + 'sroie/task1&2_test(361p).zip', + save_name='task1&2_test(361p).zip', + md5='1bde54705db0995c57a6e34cce437fea', + split=['test'], + content=['image'], + mapping=[[ + 'task1&2_test(361p)/fulltext_test(361p)', 'textdet_imgs/test' + ]]), + dict( + url='https://download.openmmlab.com/mmocr/data/sroie/text.zip', + save_name='text.zip', + md5='8c534653f252ff4d3943fa27a956a74b', + split=['test'], + content=['annotation'], + mapping=[['text', 'annotations/test']]), + ]) + +data_converter = dict( + type='TextDetDataConverter', + splits=['train', 'test'], + data_root=data_root, + gatherer=dict( + type='pair_gather', + suffixes=['.jpg'], + rule=[r'X(\d+)\.([jJ][pP][gG])', r'X\1.txt']), + parser=dict(type='SROIETextDetAnnParser', encoding='utf-8-sig'), + dumper=dict(type='JsonDumper'), + delete=['text', 'task1&2_test(361p)', '0325updated', 'annotations']) + +config_generator = dict(type='TextDetConfigGenerator', data_root=data_root) diff --git a/dataset_zoo/sroie/textrecog.py b/dataset_zoo/sroie/textrecog.py new file mode 100644 index 000000000..212c7e7d1 --- /dev/null +++ b/dataset_zoo/sroie/textrecog.py @@ -0,0 +1,5 @@ +_base_ = ['textdet.py'] + +data_converter = dict(type='TextRecogCropConverter') + +config_generator = dict(type='TextRecogConfigGenerator') diff --git 
a/dataset_zoo/sroie/textspotting.py b/dataset_zoo/sroie/textspotting.py new file mode 100644 index 000000000..88486337b --- /dev/null +++ b/dataset_zoo/sroie/textspotting.py @@ -0,0 +1,5 @@ +_base_ = ['textdet.py'] + +data_converter = dict(type='TextSpottingDataConverter') + +config_generator = dict(type='TextSpottingConfigGenerator') diff --git a/mmocr/datasets/preparers/data_converter.py b/mmocr/datasets/preparers/data_converter.py index a1d0b1f6b..fc7177e6c 100644 --- a/mmocr/datasets/preparers/data_converter.py +++ b/mmocr/datasets/preparers/data_converter.py @@ -177,6 +177,8 @@ def pair_gather(self, img_path: str, suffixes: List, rule: Sequence, """ files = list() for file in list_files(img_path, suffixes): + if not re.match(rule[0], osp.basename(file)): + continue file2 = re.sub(rule[0], rule[1], osp.basename(file)) file2 = file.replace(osp.basename(file), file2) file2 = file2.replace(self.img_dir, 'annotations') diff --git a/mmocr/datasets/preparers/parsers/__init__.py b/mmocr/datasets/preparers/parsers/__init__.py index cdd08de1f..d5d95cca8 100644 --- a/mmocr/datasets/preparers/parsers/__init__.py +++ b/mmocr/datasets/preparers/parsers/__init__.py @@ -4,6 +4,7 @@ from .icdar_txt_parser import (ICDARTxtTextDetAnnParser, ICDARTxtTextRecogAnnParser) from .naf_parser import NAFAnnParser +from .sroie_parser import SROIETextDetAnnParser from .svt_parser import SVTTextDetAnnParser from .totaltext_parser import TotaltextTextDetAnnParser from .wildreceipt_parser import WildreceiptKIEAnnParser @@ -12,5 +13,5 @@ 'ICDARTxtTextDetAnnParser', 'ICDARTxtTextRecogAnnParser', 'TotaltextTextDetAnnParser', 'WildreceiptKIEAnnParser', 'COCOTextDetAnnParser', 'SVTTextDetAnnParser', 'FUNSDTextDetAnnParser', - 'NAFAnnParser' + 'SROIETextDetAnnParser', 'NAFAnnParser' ] diff --git a/mmocr/datasets/preparers/parsers/sroie_parser.py b/mmocr/datasets/preparers/parsers/sroie_parser.py new file mode 100644 index 000000000..9f97ad437 --- /dev/null +++ 
# Copyright (c) OpenMMLab. All rights reserved.
import warnings
from typing import List, Optional, Tuple

from mmocr.utils import bbox2poly
from ..data_preparer import DATA_PARSERS
from .base import BaseParser


@DATA_PARSERS.register_module()
class SROIETextDetAnnParser(BaseParser):
    """SROIE Txt Format Text Detection Annotation Parser.

    The original annotation format of this dataset is stored in txt files,
    which is formed as the following format:
        x1, y1, x2, y2, x3, y3, x4, y4, transcription

    Args:
        separator (str): The separator between each element in a line. Defaults
            to ','.
        ignore (str): The text to be ignored. Defaults to '###'.
        format (str): The format of the annotation. Defaults to
            'x1,y1,x2,y2,x3,y3,x4,y4,trans'.
        encoding (str): The encoding of the annotation file. Defaults to
            'utf-8-sig'.
        nproc (int): The number of processes to parse the annotation. Defaults
            to 1.
        remove_strs (List[str], Optional): Used to remove redundant strings in
            the transcription. Defaults to None.
        mode (str, optional): The mode of the box converter. Supported modes
            are 'xywh' and 'xyxy'. Defaults to None, in which case the
            coordinates are kept as parsed and no box conversion is applied.
    """

    def __init__(self,
                 separator: str = ',',
                 ignore: str = '###',
                 format: str = 'x1,y1,x2,y2,x3,y3,x4,y4,trans',
                 encoding: str = 'utf-8-sig',
                 nproc: int = 1,
                 remove_strs: Optional[List[str]] = None,
                 mode: Optional[str] = None) -> None:
        # NOTE: ``format`` shadows the builtin, but it is part of the public
        # (config-facing) signature and therefore must keep its name.
        self.sep = separator
        self.format = format
        self.encoding = encoding
        self.ignore = ignore
        self.mode = mode
        self.remove_strs = remove_strs
        super().__init__(nproc=nproc)

    def parse_file(self, file: Tuple, split: str) -> Tuple:
        """Parse single annotation.

        Parsing is best-effort: if reading or converting any line fails, the
        instances collected so far are kept, the rest of the file is skipped
        and a warning is emitted.

        Args:
            file (Tuple): A pair of ``(img_file, txt_file)``.
            split (str): The dataset split. Not used by this parser.

        Returns:
            Tuple: ``(img_file, instances)``, where ``instances`` is a list of
            dicts with the keys ``poly``, ``text`` and ``ignore``.
        """
        img_file, txt_file = file
        instances = []
        try:
            # there might be some illegal symbols in the annotation
            # which cannot be parsed by loader
            # NOTE(review): ``self.loader`` is not defined in this class;
            # presumably it is inherited from ``BaseParser`` - confirm.
            for anno in self.loader(txt_file, self.sep, self.format,
                                    self.encoding):
                fields = list(anno.values())
                if self.remove_strs is not None:
                    for redundant in self.remove_strs:
                        fields = [
                            field.replace(redundant, '') for field in fields
                        ]
                # Every field but the last one is a coordinate.
                poly = list(map(float, fields[:-1]))
                if self.mode is not None:
                    poly = bbox2poly(poly, self.mode)
                    poly = poly.tolist()
                text = fields[-1]
                instances.append(
                    dict(poly=poly, text=text, ignore=text == self.ignore))
        except Exception as e:
            # Keep whatever was parsed before the failure instead of
            # aborting the whole conversion, but make the loss visible.
            warnings.warn(
                f'Failed to fully parse {txt_file}: {e!r}. Only '
                f'{len(instances)} instance(s) parsed before the error are '
                'kept.')

        return img_file, instances