svtrv2_lnconv_nrtr_gtc.yml 4.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168
  1. Global:
  2. device: gpu
  3. epoch_num: 20
  4. log_smooth_window: 20
  5. print_batch_step: 10
  6. output_dir: ./output/rec/svtrv2_lnconv_nrtr_gtc
  7. save_epoch_step: [15, 1]
  8. # evaluation is run every 500 iterations, starting from iteration 0
  9. eval_batch_step: [0, 500]
  10. eval_epoch_step: [0, 1]
  11. cal_metric_during_train: True
  12. pretrained_model:
  13. checkpoints:
  14. use_tensorboard: false
  15. infer_img: ../ltb/img
  16. # for data or label process
  17. character_dict_path: &character_dict_path ./tools/utils/EN_symbol_dict.txt # 96en
  18. # ./tools/utils/ppocr_keys_v1.txt # ch
  19. max_text_length: &max_text_length 25
  20. use_space_char: &use_space_char False
  21. save_res_path: ./output/rec/predicts_smtr.txt
  22. use_amp: True
  23. distributed: true
  24. Optimizer:
  25. name: AdamW
  26. lr: 0.00065
  27. weight_decay: 0.05
  28. filter_bias_and_bn: True
  29. LRScheduler:
  30. name: OneCycleLR
  31. warmup_epoch: 1.5 # pct_start 0.075*20 = 1.5ep
  32. cycle_momentum: False
  33. Architecture:
  34. model_type: rec
  35. algorithm: BGPD
  36. in_channels: 3
  37. Transform:
  38. Encoder:
  39. name: SVTRv2LNConvTwo33
  40. use_pos_embed: False
  41. out_channels: 256
  42. dims: [128, 256, 384]
  43. depths: [6, 6, 6]
  44. num_heads: [4, 8, 12]
  45. mixer: [['Conv','Conv','Conv','Conv','Conv','Conv'],['Conv','Conv','FGlobal','Global','Global','Global'],['Global','Global','Global','Global','Global','Global']]
  46. local_k: [[5, 5], [5, 5], [-1, -1]]
  47. sub_k: [[1, 1], [2, 1], [-1, -1]]
  48. last_stage: false
  49. feat2d: True
  50. Decoder:
  51. name: GTCDecoder
  52. infer_gtc: True
  53. detach: False
  54. gtc_decoder:
  55. name: NRTRDecoder
  56. num_encoder_layers: -1
  57. beam_size: 0
  58. num_decoder_layers: 2
  59. nhead: 12
  60. max_len: *max_text_length
  61. ctc_decoder:
  62. name: RCTCDecoder
  63. Loss:
  64. name: GTCLoss
  65. gtc_loss:
  66. name: ARLoss
  67. PostProcess:
  68. name: GTCLabelDecode
  69. gtc_label_decode:
  70. name: ARLabelDecode
  71. character_dict_path: *character_dict_path
  72. use_space_char: *use_space_char
  73. Metric:
  74. name: RecGTCMetric
  75. main_indicator: acc
  76. is_filter: True
  77. Train:
  78. dataset:
  79. name: RatioDataSet
  80. ds_width: True
  81. # max_ratio: &max_ratio 4
  82. # min_ratio: 1
  83. # base_shape: &base_shape [[64, 64], [96, 48], [112, 40], [128, 32]]
  84. # base_h: &base_h 32
  85. # padding: &padding False
  86. padding: false
  87. # padding_rand: true
  88. # padding_doub: true
  89. data_dir_list: ['../Union14M-L-LMDB-Filtered/filter_train_challenging',
  90. '../Union14M-L-LMDB-Filtered/filter_train_hard',
  91. '../Union14M-L-LMDB-Filtered/filter_train_medium',
  92. '../Union14M-L-LMDB-Filtered/filter_train_normal',
  93. '../Union14M-L-LMDB-Filtered/filter_train_easy',
  94. ]
  95. transforms:
  96. - DecodeImage: # load image
  97. img_mode: BGR
  98. channel_first: False
  99. - PARSeqAug:
  100. - GTCLabelEncode: # Class handling label
  101. gtc_label_encode:
  102. name: ARLabelEncode
  103. character_dict_path: *character_dict_path
  104. use_space_char: *use_space_char
  105. max_text_length: *max_text_length
  106. - KeepKeys:
  107. keep_keys: ['image', 'label', 'length', 'ctc_label', 'ctc_length'] # dataloader will return list in this order
  108. sampler:
  109. name: RatioSampler
  110. scales: [[128, 32]] # w, h
  111. # divided_factor: ensures the width and height are divisible by the downsampling factor
  112. first_bs: &bs 256
  113. fix_bs: false
  114. divided_factor: [4, 16] # w, h
  115. is_training: True
  116. loader:
  117. shuffle: True
  118. batch_size_per_card: *bs
  119. drop_last: True
  120. max_ratio: &max_ratio 4
  121. num_workers: 4
  122. Eval:
  123. dataset:
  124. name: RatioDataSet
  125. ds_width: True
  126. padding: False
  127. data_dir_list: [
  128. '../evaluation/CUTE80',
  129. '../evaluation/IC13_857',
  130. '../evaluation/IC15_1811',
  131. '../evaluation/IIIT5k',
  132. '../evaluation/SVT',
  133. '../evaluation/SVTP',
  134. ]
  135. transforms:
  136. - DecodeImage: # load image
  137. img_mode: BGR
  138. channel_first: False
  139. - GTCLabelEncode: # Class handling label
  140. gtc_label_encode:
  141. name: ARLabelEncode
  142. character_dict_path: *character_dict_path
  143. use_space_char: *use_space_char
  144. max_text_length: *max_text_length
  145. - KeepKeys:
  146. keep_keys: ['image', 'label', 'length', 'ctc_label', 'ctc_length'] # dataloader will return list in this order
  147. sampler:
  148. name: RatioSampler
  149. scales: [[128, 32]] # w, h
  150. # divided_factor: ensures the width and height are divisible by the downsampling factor
  151. first_bs: *bs
  152. fix_bs: false
  153. divided_factor: [4, 16] # w, h
  154. is_training: False
  155. loader:
  156. shuffle: False
  157. drop_last: False
  158. batch_size_per_card: *bs
  159. max_ratio: *max_ratio
  160. num_workers: 4