# rctc_decoder.py (2.3 KB)
  1. import torch
  2. import torch.nn as nn
  3. import torch.nn.functional as F
  4. from torch.nn.init import trunc_normal_
  5. from openrec.modeling.common import Block
  6. class RCTCDecoder(nn.Module):
  7. def __init__(self,
  8. in_channels,
  9. out_channels=6625,
  10. return_feats=False,
  11. **kwargs):
  12. super(RCTCDecoder, self).__init__()
  13. self.char_token = nn.Parameter(
  14. torch.zeros([1, 1, in_channels], dtype=torch.float32),
  15. requires_grad=True,
  16. )
  17. trunc_normal_(self.char_token, mean=0, std=0.02)
  18. self.fc = nn.Linear(
  19. in_channels,
  20. out_channels,
  21. bias=True,
  22. )
  23. self.fc_kv = nn.Linear(
  24. in_channels,
  25. 2 * in_channels,
  26. bias=True,
  27. )
  28. self.w_atten_block = Block(dim=in_channels,
  29. num_heads=in_channels // 32,
  30. mlp_ratio=4.0,
  31. qkv_bias=False)
  32. self.out_channels = out_channels
  33. self.return_feats = return_feats
  34. def forward(self, x, data=None):
  35. B, C, H, W = x.shape
  36. x = self.w_atten_block(x.permute(0, 2, 3,
  37. 1).reshape(-1, W, C)).reshape(
  38. B, H, W, C).permute(0, 3, 1, 2)
  39. # B, D, 8, 32
  40. x_kv = self.fc_kv(x.flatten(2).transpose(1, 2)).reshape(
  41. B, H * W, 2, C).permute(2, 0, 3, 1) # 2, b, c, hw
  42. x_k, x_v = x_kv.unbind(0) # b, c, hw
  43. char_token = self.char_token.tile([B, 1, 1])
  44. attn_ctc2d = char_token @ x_k # b, 1, hw
  45. attn_ctc2d = attn_ctc2d.reshape([-1, 1, H, W])
  46. attn_ctc2d = F.softmax(attn_ctc2d, 2) # b, 1, h, w
  47. attn_ctc2d = attn_ctc2d.permute(0, 3, 1, 2) # b, w, 1, h
  48. x_v = x_v.reshape(B, C, H, W)
  49. # B, W, H, C
  50. feats = attn_ctc2d @ x_v.permute(0, 3, 2, 1) # b, w, 1, c
  51. feats = feats.squeeze(2) # b, w, c
  52. predicts = self.fc(feats)
  53. if self.return_feats:
  54. result = (feats, predicts)
  55. else:
  56. result = predicts
  57. if not self.training:
  58. predicts = F.softmax(predicts, dim=2)
  59. result = predicts
  60. return result