parseq_decoder.py

# Scene Text Recognition Model Hub
# Copyright 2022 Darwin Bautista
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import math
from itertools import permutations
from typing import Any, Optional

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import Tensor
from torch.nn.modules import transformer
class DecoderLayer(nn.Module):
    """A Transformer decoder layer supporting two-stream attention (XLNet).

    This implements a pre-LN decoder, as opposed to the post-LN default in
    PyTorch.
    """

    def __init__(
        self,
        d_model,
        nhead,
        dim_feedforward=2048,
        dropout=0.1,
        activation='gelu',
        layer_norm_eps=1e-5,
    ):
        super().__init__()
        self.self_attn = nn.MultiheadAttention(d_model,
                                               nhead,
                                               dropout=dropout,
                                               batch_first=True)
        self.cross_attn = nn.MultiheadAttention(d_model,
                                                nhead,
                                                dropout=dropout,
                                                batch_first=True)
        # Implementation of the feedforward model
        self.linear1 = nn.Linear(d_model, dim_feedforward)
        self.dropout = nn.Dropout(dropout)
        self.linear2 = nn.Linear(dim_feedforward, d_model)
        self.norm1 = nn.LayerNorm(d_model, eps=layer_norm_eps)
        self.norm2 = nn.LayerNorm(d_model, eps=layer_norm_eps)
        self.norm_q = nn.LayerNorm(d_model, eps=layer_norm_eps)
        self.norm_c = nn.LayerNorm(d_model, eps=layer_norm_eps)
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)
        self.dropout3 = nn.Dropout(dropout)
        self.activation = transformer._get_activation_fn(activation)

    def __setstate__(self, state):
        if 'activation' not in state:
            state['activation'] = F.gelu
        super().__setstate__(state)
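
    # Two-stream attention, in brief: the *query* stream carries position
    # queries only (no token identity), while the *content* stream carries
    # token embeddings. A position's query may attend to whatever content the
    # attention mask permits, but never to its own content token, so the
    # model learns to predict a token it cannot see. This follows the
    # two-stream formulation introduced by XLNet.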
    def forward_stream(
        self,
        tgt: Tensor,
        tgt_norm: Tensor,
        tgt_kv: Tensor,
        memory: Tensor,
        tgt_mask: Optional[Tensor],
        tgt_key_padding_mask: Optional[Tensor],
    ):
        """Forward pass for a single stream (i.e. content or query).

        tgt_norm is just a LayerNorm'd tgt, passed as a separate parameter
        for efficiency. Both tgt_kv and memory are expected to be LayerNorm'd
        too (memory is LayerNorm'd by the ViT encoder).
        """
        tgt2, sa_weights = self.self_attn(
            tgt_norm,
            tgt_kv,
            tgt_kv,
            attn_mask=tgt_mask,
            key_padding_mask=tgt_key_padding_mask)
        tgt = tgt + self.dropout1(tgt2)
        tgt2, ca_weights = self.cross_attn(self.norm1(tgt), memory, memory)
        self.attn_map = ca_weights
        tgt = tgt + self.dropout2(tgt2)
        tgt2 = self.linear2(
            self.dropout(self.activation(self.linear1(self.norm2(tgt)))))
        tgt = tgt + self.dropout3(tgt2)
        return tgt, sa_weights, ca_weights
    def forward(
        self,
        query,
        content,
        memory,
        query_mask: Optional[Tensor] = None,
        content_mask: Optional[Tensor] = None,
        content_key_padding_mask: Optional[Tensor] = None,
        update_content: bool = True,
    ):
        query_norm = self.norm_q(query)
        content_norm = self.norm_c(content)
        query = self.forward_stream(query, query_norm, content_norm, memory,
                                    query_mask, content_key_padding_mask)[0]
        if update_content:
            content = self.forward_stream(content, content_norm, content_norm,
                                          memory, content_mask,
                                          content_key_padding_mask)[0]
        return query, content
class Decoder(nn.Module):
    __constants__ = ['norm']

    def __init__(self, decoder_layer, num_layers, norm):
        super().__init__()
        self.layers = transformer._get_clones(decoder_layer, num_layers)
        self.num_layers = num_layers
        self.norm = norm

    def forward(
        self,
        query,
        content,
        memory,
        query_mask: Optional[Tensor] = None,
        content_mask: Optional[Tensor] = None,
        content_key_padding_mask: Optional[Tensor] = None,
    ):
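        # Only the (normed) query stream is returned from this stack, so the
        # content stream is skipped at the last layer (update_content=False)
        # to save a full self-attention pass over the content tokens.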
        for i, mod in enumerate(self.layers):
            last = i == len(self.layers) - 1
            query, content = mod(
                query,
                content,
                memory,
                query_mask,
                content_mask,
                content_key_padding_mask,
                update_content=not last,
            )
        query = self.norm(query)
        return query
class TokenEmbedding(nn.Module):

    def __init__(self, charset_size: int, embed_dim: int):
        super().__init__()
        self.embedding = nn.Embedding(charset_size, embed_dim)
        self.embed_dim = embed_dim

    def forward(self, tokens: torch.Tensor):
        return math.sqrt(self.embed_dim) * self.embedding(tokens)
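
# The sqrt(embed_dim) factor is the standard Transformer embedding scaling
# (Vaswani et al., 2017): it keeps the token embeddings on a scale comparable
# to the position queries. With the default embed_dim=384, tokens are scaled
# by sqrt(384) ≈ 19.6.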
class PARSeqDecoder(nn.Module):

    def __init__(self,
                 in_channels,
                 out_channels,
                 max_label_length=25,
                 embed_dim=384,
                 dec_num_heads=12,
                 dec_mlp_ratio=4,
                 dec_depth=1,
                 perm_num=6,
                 perm_forward=True,
                 perm_mirrored=True,
                 decode_ar=True,
                 refine_iters=1,
                 dropout=0.1,
                 **kwargs: Any) -> None:
        super().__init__()
        self.pad_id = out_channels - 1
        self.eos_id = 0
        self.bos_id = out_channels - 2
        self.max_label_length = max_label_length
        self.decode_ar = decode_ar
        self.refine_iters = refine_iters
        decoder_layer = DecoderLayer(embed_dim, dec_num_heads,
                                     embed_dim * dec_mlp_ratio, dropout)
        self.decoder = Decoder(decoder_layer,
                               num_layers=dec_depth,
                               norm=nn.LayerNorm(embed_dim))
        # Perm/attn mask stuff
        self.rng = np.random.default_rng()
        self.max_gen_perms = perm_num // 2 if perm_mirrored else perm_num
        self.perm_forward = perm_forward
        self.perm_mirrored = perm_mirrored
        # We don't predict <bos> nor <pad>
        self.head = nn.Linear(embed_dim, out_channels - 2)
        self.text_embed = TokenEmbedding(out_channels, embed_dim)
        # +1 for <eos>
        self.pos_queries = nn.Parameter(
            torch.Tensor(1, max_label_length + 1, embed_dim))
        self.dropout = nn.Dropout(p=dropout)
        # Encoder has its own init.
        self.apply(self._init_weights)
        nn.init.trunc_normal_(self.pos_queries, std=0.02)
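
    # Token-id layout implied by the assignments above: <eos> = 0, the
    # character classes occupy 1 .. out_channels - 3, <bos> = out_channels - 2
    # and <pad> = out_channels - 1. The head has out_channels - 2 outputs
    # because <bos> and <pad> are never predicted.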
    def _init_weights(self, module: nn.Module):
        """Initialize the weights using the typical initialization schemes
        used in SOTA models."""
        if isinstance(module, nn.Linear):
            nn.init.trunc_normal_(module.weight, std=0.02)
            if module.bias is not None:
                nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            nn.init.trunc_normal_(module.weight, std=0.02)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.Conv2d):
            nn.init.kaiming_normal_(module.weight,
                                    mode='fan_out',
                                    nonlinearity='relu')
            if module.bias is not None:
                nn.init.zeros_(module.bias)
        elif isinstance(module, (nn.LayerNorm, nn.BatchNorm2d, nn.GroupNorm)):
            nn.init.ones_(module.weight)
            nn.init.zeros_(module.bias)
    @torch.jit.ignore
    def no_weight_decay(self):
        param_names = {'text_embed.embedding.weight', 'pos_queries'}
        return param_names
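
    # Illustrative sketch (not part of this module) of how no_weight_decay()
    # is typically consumed when building an optimizer; the hyperparameter
    # values are placeholders:
    #
    #   skip = model.no_weight_decay()
    #   decay = [p for n, p in model.named_parameters() if n not in skip]
    #   no_decay = [p for n, p in model.named_parameters() if n in skip]
    #   optimizer = torch.optim.AdamW(
    #       [{'params': decay, 'weight_decay': 0.05},
    #        {'params': no_decay, 'weight_decay': 0.0}], lr=1e-4)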
    def decode(
        self,
        tgt: torch.Tensor,
        memory: torch.Tensor,
        tgt_mask: Optional[Tensor] = None,
        tgt_padding_mask: Optional[Tensor] = None,
        tgt_query: Optional[Tensor] = None,
        tgt_query_mask: Optional[Tensor] = None,
        pos_query: Optional[Tensor] = None,
    ):
        N, L = tgt.shape
        # <bos> stands for the null context. We only supply position
        # information for characters after <bos>.
        null_ctx = self.text_embed(tgt[:, :1])
        if tgt_query is None:
            tgt_query = pos_query[:, :L]
        tgt_emb = pos_query[:, :L - 1] + self.text_embed(tgt[:, 1:])
        tgt_emb = self.dropout(torch.cat([null_ctx, tgt_emb], dim=1))
        tgt_query = self.dropout(tgt_query)
        return self.decoder(tgt_query, tgt_emb, memory, tgt_query_mask,
                            tgt_mask, tgt_padding_mask)
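
    # Shapes, for reference: tgt is (N, L) token ids; the content stream
    # tgt_emb is (N, L, E), i.e. token embeddings plus position queries (with
    # <bos> carrying no position); the query stream tgt_query is (N, L', E)
    # position queries only. The decoder returns the refined query stream,
    # from which `head` predicts one token per query position.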
    def forward(self, x, data=None, pos_query=None):
        if self.training:
            return self.training_step([x, pos_query, data[0]])
        else:
            return self.forward_test(x, pos_query)
    def forward_test(self,
                     memory: Tensor,
                     pos_query: Optional[Tensor] = None,
                     max_length: Optional[int] = None) -> Tensor:
        # Use memory.device (not get_device(), which returns -1 on CPU).
        _device = memory.device
        testing = max_length is None
        max_length = (self.max_label_length if max_length is None else min(
            max_length, self.max_label_length))
        bs = memory.shape[0]
        # +1 for <eos> at end of sequence.
        num_steps = max_length + 1
        # memory = self.encode(images)
        # Query positions up to `num_steps`
        if pos_query is None:
            pos_queries = self.pos_queries[:, :num_steps].expand(bs, -1, -1)
        else:
            pos_queries = pos_query
        # Special case for the forward permutation. Faster than using
        # `generate_attn_masks()`.
        tgt_mask = query_mask = torch.triu(
            torch.full((num_steps, num_steps), float('-inf'), device=_device),
            1)
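        # For example, with num_steps = 3 this shared AR mask is
        #     [[0., -inf, -inf],
        #      [0.,   0., -inf],
        #      [0.,   0.,   0.]]
        # i.e. the standard causal lookahead mask: position i attends to
        # positions <= i only. Note that tgt_mask and query_mask alias the
        # same tensor here.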
        self.attn_maps = []
        if self.decode_ar:
            tgt_in = torch.full((bs, num_steps),
                                self.pad_id,
                                dtype=torch.long,
                                device=_device)
            tgt_in[:, 0] = self.bos_id
            logits = []
            for i in range(num_steps):
                j = i + 1  # next token index
                # Efficient decoding:
                # Input the context up to the ith token. We use only one
                # query (at position = i) at a time. This works because of
                # the lookahead masking effect of the canonical (forward) AR
                # context: past tokens have no access to future tokens, hence
                # are fixed once computed.
                tgt_out = self.decode(
                    tgt_in[:, :j],
                    memory,
                    tgt_mask[:j, :j],
                    tgt_query=pos_queries[:, i:j],
                    tgt_query_mask=query_mask[i:j, :j],
                    pos_query=pos_queries,
                )
                self.attn_maps.append(self.decoder.layers[-1].attn_map)
                # The next-token probability is at the output's ith position.
                p_i = self.head(tgt_out)
                logits.append(p_i)
                if j < num_steps:
                    # Greedy decode: add the next token index to the input.
                    # squeeze(1) (rather than squeeze()) keeps the batch dim
                    # even when bs == 1.
                    tgt_in[:, j] = p_i.squeeze(1).argmax(-1)
                    # Efficient batch decoding: if all output sequences have
                    # at least one EOS token, end decoding.
                    if testing and (tgt_in == self.eos_id).any(dim=-1).all():
                        break
            logits = torch.cat(logits, dim=1)
        else:
            # No prior context, so input is just <bos>. We query all
            # positions.
            tgt_in = torch.full((bs, 1),
                                self.bos_id,
                                dtype=torch.long,
                                device=_device)
            tgt_out = self.decode(tgt_in,
                                  memory,
                                  tgt_query=pos_queries,
                                  pos_query=pos_queries)
            logits = self.head(tgt_out)
        if self.refine_iters:
            # For iterative refinement, we always use a 'cloze' mask.
            # We can derive it from the AR forward mask by unmasking the
            # token context to the right.
            query_mask[torch.triu(
                torch.ones(num_steps,
                           num_steps,
                           dtype=torch.bool,
                           device=_device), 2)] = 0
            bos = torch.full((bs, 1),
                             self.bos_id,
                             dtype=torch.long,
                             device=_device)
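            # Continuing the num_steps = 3 example, the cloze mask becomes
            #     [[0., -inf,   0.],
            #      [0.,   0., -inf],
            #      [0.,   0.,   0.]]
            # Only the first superdiagonal stays masked: query i now sees
            # every content position except i + 1, which is exactly the token
            # it is asked to re-predict (content position 0 is <bos>).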
            for i in range(self.refine_iters):
                # Prior context is the previous output.
                tgt_in = torch.cat([bos, logits[:, :-1].argmax(-1)], dim=1)
                # Mask tokens beyond the first EOS token.
                tgt_padding_mask = (tgt_in == self.eos_id).int().cumsum(
                    -1) > 0
                tgt_out = self.decode(
                    tgt_in,
                    memory,
                    tgt_mask,
                    tgt_padding_mask,
                    tgt_query=pos_queries,
                    tgt_query_mask=query_mask[:, :tgt_in.shape[1]],
                    pos_query=pos_queries,
                )
                logits = self.head(tgt_out)
        return F.softmax(logits, -1)
    def gen_tgt_perms(self, tgt, _device):
        """Generate shared permutations for the whole batch.

        This works because the same attention mask can be used for the
        shorter sequences thanks to the padding mask.
        """
        # We don't permute the position of BOS; EOS is permuted separately.
        max_num_chars = tgt.shape[1] - 2
        # Special handling for 1-character sequences
        if max_num_chars == 1:
            return torch.arange(3, device=_device).unsqueeze(0)
        perms = [torch.arange(max_num_chars, device=_device)
                 ] if self.perm_forward else []
        # Additional permutations if needed
        max_perms = math.factorial(max_num_chars)
        if self.perm_mirrored:
            max_perms //= 2
        num_gen_perms = min(self.max_gen_perms, max_perms)
        # For 4-char sequences and shorter, we generate all permutations and
        # sample from the pool to avoid collisions.
        # Note that this code path might NEVER get executed since the labels
        # in a mini-batch typically exceed 4 chars.
        if max_num_chars < 5:
            # Pool of permutations to sample from. We only need the first
            # half (if the complementary option is selected).
            # Special handling for max_num_chars == 4, which correctly
            # divides the pool into the flipped halves.
            if max_num_chars == 4 and self.perm_mirrored:
                selector = [0, 3, 4, 6, 9, 10, 12, 16, 17, 18, 19, 21]
            else:
                selector = list(range(max_perms))
            perm_pool = torch.as_tensor(
                list(permutations(range(max_num_chars), max_num_chars)),
                device=_device)[selector]
            # If the forward permutation is always selected, no need to add
            # it to the pool for sampling.
            if self.perm_forward:
                perm_pool = perm_pool[1:]
            perms = torch.stack(perms)
            if len(perm_pool):
                i = self.rng.choice(len(perm_pool),
                                    size=num_gen_perms - len(perms),
                                    replace=False)
                perms = torch.cat([perms, perm_pool[i]])
        else:
            perms.extend([
                torch.randperm(max_num_chars, device=_device)
                for _ in range(num_gen_perms - len(perms))
            ])
            perms = torch.stack(perms)
        if self.perm_mirrored:
            # Add complementary pairs
            comp = perms.flip(-1)
            # Stack in such a way that the pairs are next to each other.
            perms = torch.stack([perms, comp
                                 ]).transpose(0, 1).reshape(-1, max_num_chars)
        # NOTE:
        # The only meaningful way of permuting the EOS position is by moving
        # it one character position at a time. However, since the number of
        # permutations is T! while the number of EOS positions is only T + 1,
        # the number of possible EOS positions will always be much less than
        # the number of permutations (unless a low perm_num is set). Thus, it
        # is simpler to just train EOS using the full and null contexts
        # rather than trying to evenly distribute it across the chosen
        # number of permutations.
        # Add position indices of BOS and EOS
        bos_idx = perms.new_zeros((len(perms), 1))
        eos_idx = perms.new_full((len(perms), 1), max_num_chars + 1)
        perms = torch.cat([bos_idx, perms + 1, eos_idx], dim=1)
        # Special handling for the reverse direction. This does two things:
        # 1. Reverse context for the characters
        # 2. Null context for [EOS] (required for learning to predict [EOS]
        #    in NAR mode)
        if len(perms) > 1:
            perms[1, 1:] = max_num_chars + 1 - torch.arange(
                max_num_chars + 1, device=_device)
        return perms
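
    # Example: with max_num_chars = 3 (positions: <bos> = 0, chars 1..3,
    # <eos> = 4) and the defaults perm_forward=True, perm_mirrored=True, the
    # first two returned permutations are always
    #     [0, 1, 2, 3, 4]   # canonical left-to-right order
    #     [0, 4, 3, 2, 1]   # reverse context for chars, null context for EOS
    # followed by randomly sampled permutation/mirror pairs.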
    def generate_attn_masks(self, perm, _device):
        """Generate attention masks given a sequence permutation (including
        positions for the BOS and EOS tokens).

        :param perm: the permutation sequence. i = 0 is always the BOS
        :return: lookahead attention masks
        """
        sz = perm.shape[0]
        mask = torch.zeros((sz, sz), device=_device)
        for i in range(sz):
            query_idx = perm[i]
            masked_keys = perm[i + 1:]
            mask[query_idx, masked_keys] = float('-inf')
        content_mask = mask[:-1, :-1].clone()
        mask[torch.eye(sz, dtype=torch.bool,
                       device=_device)] = float('-inf')  # mask "self"
        query_mask = mask[1:, :-1]
        return content_mask, query_mask
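
    # Worked example: perm = [0, 2, 1, 3], i.e. generate <bos>, then char 2,
    # then char 1, then <eos>. The resulting masks (0 = visible):
    #   content_mask (rows/cols = content positions 0..2):
    #       [[0., -inf, -inf],    # <bos> sees only itself
    #        [0.,   0.,   0.],    # char 1 sees <bos>, itself, and char 2
    #        [0., -inf,   0.]]    # char 2 sees <bos> and itself
    #   query_mask (rows = predicted positions 1..3, cols = keys 0..2; the
    #   diagonal "self" entries are additionally masked):
    #       [[0., -inf,   0.],    # predict char 1 from {<bos>, char 2}
    #        [0., -inf, -inf],    # predict char 2 from {<bos>}
    #        [0.,   0.,   0.]]    # predict <eos> from the full context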
    def training_step(self, batch):
        memory, pos_query, tgt = batch
        bs = memory.shape[0]
        if pos_query is None:
            pos_query = self.pos_queries.expand(bs, -1, -1)
        # Prepare the target sequences (input and output)
        tgt_perms = self.gen_tgt_perms(tgt, memory.device)
        tgt_in = tgt[:, :-1]
        tgt_out = tgt[:, 1:]
        # The [EOS] token is not depended upon by any other token in any
        # permutation ordering.
        tgt_padding_mask = (tgt_in == self.pad_id) | (tgt_in == self.eos_id)
        loss = 0
        loss_numel = 0
        n = (tgt_out != self.pad_id).sum().item()
        for i, perm in enumerate(tgt_perms):
            tgt_mask, query_mask = self.generate_attn_masks(
                perm, memory.device)
            out = self.decode(
                tgt_in,
                memory,
                tgt_mask,
                tgt_padding_mask,
                tgt_query_mask=query_mask,
                pos_query=pos_query,
            )
            logits = self.head(out)
            if i == 0:
                final_out = logits
            loss += n * F.cross_entropy(logits.flatten(end_dim=1),
                                        tgt_out.flatten(),
                                        ignore_index=self.pad_id)
            loss_numel += n
            # After the second iteration (i.e. done with canonical and
            # reverse orderings), remove the [EOS] tokens for the
            # succeeding perms.
            if i == 1:
                tgt_out = torch.where(tgt_out == self.eos_id, self.pad_id,
                                      tgt_out)
                n = (tgt_out != self.pad_id).sum().item()
        loss /= loss_numel
        # self.log('loss', loss)
        return [loss, final_out]
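

# Minimal smoke-test sketch (illustrative, not part of the original module).
# The values below are assumptions: 36 character classes plus <eos>, <bos>
# and <pad> give out_channels = 39, and `memory` stands in for encoder
# features of shape (batch, num_visual_tokens, embed_dim).
if __name__ == '__main__':
    torch.manual_seed(0)
    decoder = PARSeqDecoder(in_channels=384, out_channels=39)
    decoder.eval()
    memory = torch.randn(2, 197, 384)  # fake ViT features
    with torch.no_grad():
        probs = decoder(memory)
    # (batch, decoded_steps, out_channels - 2) per-position probabilities
    print(probs.shape)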