import math
import torch
import torch.nn as nn
import torch.nn.functional as F


class RobustScannerDecoder(nn.Module):

    def __init__(self,
                 out_channels,  # 90 + unknown + start + padding
                 in_channels,
                 enc_outchannles=128,
                 hybrid_dec_rnn_layers=2,
                 hybrid_dec_dropout=0,
                 position_dec_rnn_layers=2,
                 max_len=25,
                 mask=True,
                 encode_value=False,
                 **kwargs):
        super(RobustScannerDecoder, self).__init__()
        start_idx = out_channels - 2
        padding_idx = out_channels - 1
        end_idx = 0

        # encoder module
        self.encoder = ChannelReductionEncoder(in_channels=in_channels,
                                               out_channels=enc_outchannles)
        self.max_text_length = max_len + 1
        self.mask = mask

        # decoder module
        self.decoder = Decoder(
            num_classes=out_channels,
            dim_input=in_channels,
            dim_model=enc_outchannles,
            hybrid_decoder_rnn_layers=hybrid_dec_rnn_layers,
            hybrid_decoder_dropout=hybrid_dec_dropout,
            position_decoder_rnn_layers=position_dec_rnn_layers,
            max_len=max_len + 1,
            start_idx=start_idx,
            mask=mask,
            padding_idx=padding_idx,
            end_idx=end_idx,
            encode_value=encode_value)

    def forward(self, inputs, data=None):
        """
        data: [label, length, ..., valid_ratio]
        """
        out_enc = self.encoder(inputs)
        bs = out_enc.shape[0]
        valid_ratios = None
        word_positions = torch.arange(
            0, self.max_text_length,
            device=inputs.device).unsqueeze(0).tile([bs, 1])

        if self.mask:
            valid_ratios = data[-1]

        if self.training:
            max_len = data[1].max()
            label = data[0][:, :1 + max_len]  # label
            final_out = self.decoder(inputs, out_enc, label, valid_ratios,
                                     word_positions[:, :1 + max_len])
        else:
            final_out = self.decoder(inputs,
                                     out_enc,
                                     label=None,
                                     valid_ratios=valid_ratios,
                                     word_positions=word_positions,
                                     train_mode=False)
        return final_out


class BaseDecoder(nn.Module):

    def __init__(self, **kwargs):
        super().__init__()

    def forward_train(self, feat, out_enc, targets, img_metas):
        raise NotImplementedError

    def forward_test(self, feat, out_enc, img_metas):
        raise NotImplementedError

    def forward(self,
                feat,
                out_enc,
                label=None,
                valid_ratios=None,
                word_positions=None,
                train_mode=True):
        self.train_mode = train_mode

        if train_mode:
            return self.forward_train(feat, out_enc, label, valid_ratios,
                                      word_positions)
        return self.forward_test(feat, out_enc, valid_ratios, word_positions)


class ChannelReductionEncoder(nn.Module):
    """Change the channel number with a 1x1 convolutional layer.

    Args:
        in_channels (int): Number of input channels.
        out_channels (int): Number of output channels.
    """

    def __init__(self, in_channels, out_channels, **kwargs):
        super(ChannelReductionEncoder, self).__init__()
        self.layer = nn.Conv2d(in_channels,
                               out_channels,
                               kernel_size=1,
                               stride=1,
                               padding=0)
        # Xavier-normal initialization of the 1x1 convolution weight.
        nn.init.xavier_normal_(self.layer.weight, gain=1.0)

    def forward(self, feat):
        """
        Args:
            feat (Tensor): Image features with the shape of
                :math:`(N, C_{in}, H, W)`.

        Returns:
            Tensor: A tensor of shape :math:`(N, C_{out}, H, W)`.
        """
        return self.layer(feat)
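

def _example_channel_reduction():
    """Hedged usage sketch (not part of the original module): reduce a
    512-channel backbone feature map to 128 channels with the 1x1 convolution
    encoder above. All shapes are illustrative."""
    encoder = ChannelReductionEncoder(in_channels=512, out_channels=128)
    feat = torch.randn(2, 512, 8, 32)  # (N, C_in, H, W)
    out = encoder(feat)                # (N, C_out, H, W) == (2, 128, 8, 32)
    return out.shape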


def masked_fill(x, mask, value):
    # `dtype` (and `device`) must be passed as keyword arguments to torch.full.
    y = torch.full(x.shape, value, dtype=x.dtype, device=x.device)
    return torch.where(mask, y, x)


class DotProductAttentionLayer(nn.Module):

    def __init__(self, dim_model=None):
        super().__init__()
        self.scale = dim_model**-0.5 if dim_model is not None else 1.

    def forward(self, query, key, value, mask=None):
        query = query.permute(0, 2, 1)
        logits = query @ key * self.scale

        if mask is not None:
            n, seq_len = mask.size()
            mask = mask.view(n, 1, seq_len)
            logits = logits.masked_fill(mask, float('-inf'))

        weights = F.softmax(logits, dim=2)
        value = value.transpose(1, 2)
        glimpse = weights @ value
        glimpse = glimpse.permute(0, 2, 1).contiguous()
        return glimpse
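

def _example_dot_product_attention():
    """Hedged usage sketch (not part of the original module): query, key and
    value follow the (N, C, L) layout used throughout this file; `mask` marks
    padded key positions with True. All shapes are illustrative."""
    attn = DotProductAttentionLayer(dim_model=128)
    n, c, len_q, len_kv = 2, 128, 5, 64
    query = torch.randn(n, c, len_q)
    key = torch.randn(n, c, len_kv)
    value = torch.randn(n, c, len_kv)
    mask = torch.zeros(n, len_kv, dtype=torch.bool)
    mask[:, 48:] = True                      # e.g. last 16 key positions are padding
    glimpse = attn(query, key, value, mask)  # (N, C, L_q) == (2, 128, 5)
    return glimpse.shape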


class SequenceAttentionDecoder(BaseDecoder):
    """Sequence attention decoder for RobustScanner.

    RobustScanner: `RobustScanner: Dynamically Enhancing Positional Clues for
    Robust Text Recognition <https://arxiv.org/abs/2007.07542>`_

    Args:
        num_classes (int): Number of output classes :math:`C`.
        rnn_layers (int): Number of RNN layers.
        dim_input (int): Dimension :math:`D_i` of input vector ``feat``.
        dim_model (int): Dimension :math:`D_m` of the model. Should also be the
            same as encoder output vector ``out_enc``.
        max_seq_len (int): Maximum output sequence length :math:`T`.
        start_idx (int): The index of `<SOS>`.
        mask (bool): Whether to mask input features according to
            ``img_meta['valid_ratio']``.
        padding_idx (int): The index of `<PAD>`.
        dropout (float): Dropout rate.
        return_feature (bool): Return feature or logits as the result.
        encode_value (bool): Whether to use the output of encoder ``out_enc``
            as `value` of attention layer. If False, the original feature
            ``feat`` will be used.

    Warning:
        This decoder will not predict the final class which is assumed to be
        `<PAD>`. Therefore, its output size is always :math:`C - 1`. `<PAD>`
        is also ignored by loss as specified in
        :obj:`mmocr.models.textrecog.recognizer.EncodeDecodeRecognizer`.
    """

    def __init__(self,
                 num_classes=None,
                 rnn_layers=2,
                 dim_input=512,
                 dim_model=128,
                 max_seq_len=40,
                 start_idx=0,
                 mask=True,
                 padding_idx=None,
                 dropout=0,
                 return_feature=False,
                 encode_value=False):
        super().__init__()
        self.num_classes = num_classes
        self.dim_input = dim_input
        self.dim_model = dim_model
        self.return_feature = return_feature
        self.encode_value = encode_value
        self.max_seq_len = max_seq_len
        self.start_idx = start_idx
        self.mask = mask

        self.embedding = nn.Embedding(self.num_classes,
                                      self.dim_model,
                                      padding_idx=padding_idx)
        self.sequence_layer = nn.LSTM(input_size=dim_model,
                                      hidden_size=dim_model,
                                      num_layers=rnn_layers,
                                      batch_first=True,
                                      dropout=dropout)
        self.attention_layer = DotProductAttentionLayer()

        self.prediction = None
        if not self.return_feature:
            pred_num_classes = num_classes - 1
            self.prediction = nn.Linear(
                dim_model if encode_value else dim_input, pred_num_classes)

    def forward_train(self, feat, out_enc, targets, valid_ratios):
        """
        Args:
            feat (Tensor): Tensor of shape :math:`(N, D_i, H, W)`.
            out_enc (Tensor): Encoder output of shape
                :math:`(N, D_m, H, W)`.
            targets (Tensor): A tensor of shape :math:`(N, T)`. Each element
                is the index of a character.
            valid_ratios (Tensor): Valid length ratio of img.

        Returns:
            Tensor: A raw logit tensor of shape :math:`(N, T, C-1)` if
            ``return_feature=False``. Otherwise it would be the hidden feature
            before the prediction projection layer, whose shape is
            :math:`(N, T, D_m)`.
        """
        tgt_embedding = self.embedding(targets)

        n, c_enc, h, w = out_enc.shape
        assert c_enc == self.dim_model
        _, c_feat, _, _ = feat.shape
        assert c_feat == self.dim_input
        _, len_q, c_q = tgt_embedding.shape
        assert c_q == self.dim_model
        assert len_q <= self.max_seq_len

        query, _ = self.sequence_layer(tgt_embedding)
        query = query.permute(0, 2, 1).contiguous()
        key = out_enc.view(n, c_enc, h * w)
        if self.encode_value:
            value = key
        else:
            value = feat.view(n, c_feat, h * w)

        mask = None
        if valid_ratios is not None:
            mask = query.new_zeros((n, h, w))
            for i, valid_ratio in enumerate(valid_ratios):
                valid_width = min(w, math.ceil(w * valid_ratio))
                mask[i, :, valid_width:] = 1
            mask = mask.bool()
            mask = mask.view(n, h * w)

        attn_out = self.attention_layer(query, key, value, mask)
        attn_out = attn_out.permute(0, 2, 1).contiguous()

        if self.return_feature:
            return attn_out

        out = self.prediction(attn_out)
        return out

    def forward_test(self, feat, out_enc, valid_ratios):
        """
        Args:
            feat (Tensor): Tensor of shape :math:`(N, D_i, H, W)`.
            out_enc (Tensor): Encoder output of shape
                :math:`(N, D_m, H, W)`.
            valid_ratios (Tensor): Valid length ratio of img.

        Returns:
            Tensor: The output logit sequence tensor of shape
            :math:`(N, T, C-1)`.
        """
        batch_size = feat.shape[0]
        decode_sequence = (torch.ones((batch_size, self.max_seq_len),
                                      dtype=torch.int64,
                                      device=feat.device) * self.start_idx)

        outputs = []
        for i in range(self.max_seq_len):
            step_out = self.forward_test_step(feat, out_enc, decode_sequence,
                                              i, valid_ratios)
            outputs.append(step_out)
            max_idx = torch.argmax(step_out, dim=1, keepdim=False)
            if i < self.max_seq_len - 1:
                decode_sequence[:, i + 1] = max_idx

        outputs = torch.stack(outputs, 1)
        return outputs

    def forward_test_step(self, feat, out_enc, decode_sequence, current_step,
                          valid_ratios):
        """
        Args:
            feat (Tensor): Tensor of shape :math:`(N, D_i, H, W)`.
            out_enc (Tensor): Encoder output of shape
                :math:`(N, D_m, H, W)`.
            decode_sequence (Tensor): Shape :math:`(N, T)`. The tensor that
                stores history decoding result.
            current_step (int): Current decoding step.
            valid_ratios (Tensor): Valid length ratio of img.

        Returns:
            Tensor: Shape :math:`(N, C-1)`. The logit tensor of predicted
            tokens at current time step.
        """
        embed = self.embedding(decode_sequence)

        n, c_enc, h, w = out_enc.shape
        assert c_enc == self.dim_model
        _, c_feat, _, _ = feat.shape
        assert c_feat == self.dim_input
        _, _, c_q = embed.shape
        assert c_q == self.dim_model

        query, _ = self.sequence_layer(embed)
        query = query.transpose(1, 2)
        key = torch.reshape(out_enc, (n, c_enc, h * w))
        if self.encode_value:
            value = key
        else:
            value = torch.reshape(feat, (n, c_feat, h * w))

        mask = None
        if valid_ratios is not None:
            mask = query.new_zeros((n, h, w))
            for i, valid_ratio in enumerate(valid_ratios):
                valid_width = min(w, math.ceil(w * valid_ratio))
                mask[i, :, valid_width:] = 1
            mask = mask.bool()
            mask = mask.view(n, h * w)

        # [n, c, l]
        attn_out = self.attention_layer(query, key, value, mask)
        out = attn_out[:, :, current_step]

        if self.return_feature:
            return out

        out = self.prediction(out)
        out = F.softmax(out, dim=-1)
        return out
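

def _example_sequence_attention_decoder():
    """Hedged usage sketch (not part of the original module): a teacher-forced
    training pass with toy shapes. The charset size, start/padding indices and
    sequence length are illustrative only."""
    decoder = SequenceAttentionDecoder(num_classes=93,
                                       dim_input=512,
                                       dim_model=128,
                                       max_seq_len=26,
                                       start_idx=91,
                                       padding_idx=92)
    feat = torch.randn(2, 512, 8, 32)      # (N, D_i, H, W)
    out_enc = torch.randn(2, 128, 8, 32)   # (N, D_m, H, W)
    targets = torch.randint(0, 91, (2, 26))
    logits = decoder.forward_train(feat, out_enc, targets, valid_ratios=None)
    return logits.shape                    # (N, T, C - 1) == (2, 26, 92)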


class PositionAwareLayer(nn.Module):

    def __init__(self, dim_model, rnn_layers=2):
        super().__init__()
        self.dim_model = dim_model
        self.rnn = nn.LSTM(input_size=dim_model,
                           hidden_size=dim_model,
                           num_layers=rnn_layers,
                           batch_first=True)
        self.mixer = nn.Sequential(
            nn.Conv2d(dim_model, dim_model, kernel_size=3, stride=1,
                      padding=1), nn.ReLU(True),
            nn.Conv2d(dim_model, dim_model, kernel_size=3, stride=1,
                      padding=1))

    def forward(self, img_feature):
        n, c, h, w = img_feature.shape
        rnn_input = img_feature.permute(0, 2, 3, 1).contiguous()
        rnn_input = rnn_input.view(n * h, w, c)
        rnn_output, _ = self.rnn(rnn_input)
        rnn_output = rnn_output.view(n, h, w, c)
        rnn_output = rnn_output.permute(0, 3, 1, 2).contiguous()
        out = self.mixer(rnn_output)
        return out
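

def _example_position_aware_layer():
    """Hedged usage sketch (not part of the original module): the layer keeps
    the (N, C, H, W) shape while mixing information along the width dimension
    with a row-wise LSTM followed by two 3x3 convolutions. Shapes are
    illustrative."""
    layer = PositionAwareLayer(dim_model=128, rnn_layers=2)
    feature_map = torch.randn(2, 128, 8, 32)
    out = layer(feature_map)  # same shape as the input: (2, 128, 8, 32)
    return out.shape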


class PositionAttentionDecoder(BaseDecoder):
    """Position attention decoder for RobustScanner.

    RobustScanner: `RobustScanner: Dynamically Enhancing Positional Clues for
    Robust Text Recognition <https://arxiv.org/abs/2007.07542>`_

    Args:
        num_classes (int): Number of output classes :math:`C`.
        rnn_layers (int): Number of RNN layers.
        dim_input (int): Dimension :math:`D_i` of input vector ``feat``.
        dim_model (int): Dimension :math:`D_m` of the model. Should also be the
            same as encoder output vector ``out_enc``.
        max_seq_len (int): Maximum output sequence length :math:`T`.
        mask (bool): Whether to mask input features according to
            ``img_meta['valid_ratio']``.
        return_feature (bool): Return feature or logits as the result.
        encode_value (bool): Whether to use the output of encoder ``out_enc``
            as `value` of attention layer. If False, the original feature
            ``feat`` will be used.

    Warning:
        This decoder will not predict the final class which is assumed to be
        `<PAD>`. Therefore, its output size is always :math:`C - 1`. `<PAD>`
        is also ignored by loss.
    """

    def __init__(self,
                 num_classes=None,
                 rnn_layers=2,
                 dim_input=512,
                 dim_model=128,
                 max_seq_len=40,
                 mask=True,
                 return_feature=False,
                 encode_value=False):
        super().__init__()
        self.num_classes = num_classes
        self.dim_input = dim_input
        self.dim_model = dim_model
        self.max_seq_len = max_seq_len
        self.return_feature = return_feature
        self.encode_value = encode_value
        self.mask = mask

        self.embedding = nn.Embedding(self.max_seq_len + 1, self.dim_model)
        self.position_aware_module = PositionAwareLayer(
            self.dim_model, rnn_layers)
        self.attention_layer = DotProductAttentionLayer()

        self.prediction = None
        if not self.return_feature:
            pred_num_classes = num_classes - 1
            self.prediction = nn.Linear(
                dim_model if encode_value else dim_input, pred_num_classes)

    def _get_position_index(self, length, batch_size):
        position_index_list = []
        for i in range(batch_size):
            # torch.arange (not the deprecated torch.range) with a proper
            # torch dtype; yields positions 0 .. length - 1.
            position_index = torch.arange(0, length, dtype=torch.int64)
            position_index_list.append(position_index)
        batch_position_index = torch.stack(position_index_list, dim=0)
        return batch_position_index

    def forward_train(self, feat, out_enc, targets, valid_ratios,
                      position_index):
        """
        Args:
            feat (Tensor): Tensor of shape :math:`(N, D_i, H, W)`.
            out_enc (Tensor): Encoder output of shape
                :math:`(N, D_m, H, W)`.
            targets (Tensor): A tensor of shape :math:`(N, T)`. Each element
                is the index of a character.
            valid_ratios (Tensor): Valid length ratio of img.
            position_index (Tensor): The position of each word.

        Returns:
            Tensor: A raw logit tensor of shape :math:`(N, T, C-1)` if
            ``return_feature=False``. Otherwise it will be the hidden feature
            before the prediction projection layer, whose shape is
            :math:`(N, T, D_m)`.
        """
        n, c_enc, h, w = out_enc.shape
        assert c_enc == self.dim_model
        _, c_feat, _, _ = feat.shape
        assert c_feat == self.dim_input
        _, len_q = targets.shape
        assert len_q <= self.max_seq_len

        position_out_enc = self.position_aware_module(out_enc)

        query = self.embedding(position_index)
        query = query.permute(0, 2, 1).contiguous()
        key = position_out_enc.view(n, c_enc, h * w)
        if self.encode_value:
            value = out_enc.view(n, c_enc, h * w)
        else:
            value = feat.view(n, c_feat, h * w)

        mask = None
        if valid_ratios is not None:
            mask = query.new_zeros((n, h, w))
            for i, valid_ratio in enumerate(valid_ratios):
                valid_width = min(w, math.ceil(w * valid_ratio))
                mask[i, :, valid_width:] = 1
            mask = mask.bool()
            mask = mask.view(n, h * w)

        attn_out = self.attention_layer(query, key, value, mask)
        attn_out = attn_out.permute(0, 2, 1).contiguous()

        if self.return_feature:
            return attn_out

        return self.prediction(attn_out)

    def forward_test(self, feat, out_enc, valid_ratios, position_index):
        """
        Args:
            feat (Tensor): Tensor of shape :math:`(N, D_i, H, W)`.
            out_enc (Tensor): Encoder output of shape
                :math:`(N, D_m, H, W)`.
            valid_ratios (Tensor): Valid length ratio of img.
            position_index (Tensor): The position of each word.

        Returns:
            Tensor: A raw logit tensor of shape :math:`(N, T, C-1)` if
            ``return_feature=False``. Otherwise it would be the hidden feature
            before the prediction projection layer, whose shape is
            :math:`(N, T, D_m)`.
        """
        n, c_enc, h, w = out_enc.shape
        assert c_enc == self.dim_model
        _, c_feat, _, _ = feat.shape
        assert c_feat == self.dim_input

        position_out_enc = self.position_aware_module(out_enc)

        query = self.embedding(position_index)
        query = query.permute(0, 2, 1).contiguous()
        key = position_out_enc.view(n, c_enc, h * w)
        if self.encode_value:
            value = torch.reshape(out_enc, (n, c_enc, h * w))
        else:
            value = torch.reshape(feat, (n, c_feat, h * w))

        mask = None
        if valid_ratios is not None:
            mask = query.new_zeros((n, h, w))
            for i, valid_ratio in enumerate(valid_ratios):
                valid_width = min(w, math.ceil(w * valid_ratio))
                mask[i, :, valid_width:] = 1
            mask = mask.bool()
            mask = mask.view(n, h * w)

        attn_out = self.attention_layer(query, key, value, mask)
        attn_out = attn_out.transpose(1, 2)  # [n, len_q, dim_v]

        if self.return_feature:
            return attn_out

        return self.prediction(attn_out)
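

def _example_position_attention_decoder():
    """Hedged usage sketch (not part of the original module): position queries
    do not depend on previously decoded characters, so a test-time pass only
    needs the position indices. Shapes and the charset size are illustrative."""
    decoder = PositionAttentionDecoder(num_classes=93,
                                       dim_input=512,
                                       dim_model=128,
                                       max_seq_len=26)
    feat = torch.randn(2, 512, 8, 32)     # (N, D_i, H, W)
    out_enc = torch.randn(2, 128, 8, 32)  # (N, D_m, H, W)
    positions = torch.arange(26).unsqueeze(0).tile((2, 1))
    logits = decoder.forward_test(feat, out_enc, None, positions)
    return logits.shape                   # (N, T, C - 1) == (2, 26, 92)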


class RobustScannerFusionLayer(nn.Module):

    def __init__(self, dim_model, dim=-1):
        super(RobustScannerFusionLayer, self).__init__()
        self.dim_model = dim_model
        self.dim = dim
        self.linear_layer = nn.Linear(dim_model * 2, dim_model * 2)

    def forward(self, x0, x1):
        assert x0.shape == x1.shape
        fusion_input = torch.concat((x0, x1), self.dim)
        output = self.linear_layer(fusion_input)
        output = F.glu(output, self.dim)
        return output
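

def _example_fusion_layer():
    """Hedged usage sketch (not part of the original module): GLU halves the
    doubled channel dimension, so the fused output has the same size as each
    input glimpse. Shapes are illustrative."""
    fusion = RobustScannerFusionLayer(dim_model=512)
    hybrid_glimpse = torch.randn(2, 25, 512)
    position_glimpse = torch.randn(2, 25, 512)
    fused = fusion(hybrid_glimpse, position_glimpse)  # (2, 25, 512)
    return fused.shape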


class Decoder(BaseDecoder):
    """Decoder for RobustScanner.

    RobustScanner: `RobustScanner: Dynamically Enhancing Positional Clues for
    Robust Text Recognition <https://arxiv.org/abs/2007.07542>`_

    Args:
        num_classes (int): Number of output classes :math:`C`.
        dim_input (int): Dimension :math:`D_i` of input vector ``feat``.
        dim_model (int): Dimension :math:`D_m` of the model. Should also be the
            same as encoder output vector ``out_enc``.
        max_len (int): Maximum output sequence length :math:`T`.
        start_idx (int): The index of `<SOS>`.
        mask (bool): Whether to mask input features according to
            ``img_meta['valid_ratio']``.
        padding_idx (int): The index of `<PAD>`.
        encode_value (bool): Whether to use the output of encoder ``out_enc``
            as `value` of attention layer. If False, the original feature
            ``feat`` will be used.

    Warning:
        The hybrid and position sub-decoders are used with
        ``return_feature=True`` here, so only the fused prediction layer of
        this decoder produces logits, and its output size is :math:`C`.
    """

    def __init__(self,
                 num_classes=None,
                 dim_input=512,
                 dim_model=128,
                 hybrid_decoder_rnn_layers=2,
                 hybrid_decoder_dropout=0,
                 position_decoder_rnn_layers=2,
                 max_len=40,
                 start_idx=0,
                 mask=True,
                 padding_idx=None,
                 end_idx=0,
                 encode_value=False):
        super().__init__()
        self.num_classes = num_classes
        self.dim_input = dim_input
        self.dim_model = dim_model
        self.max_seq_len = max_len
        self.encode_value = encode_value
        self.start_idx = start_idx
        self.padding_idx = padding_idx
        self.end_idx = end_idx
        self.mask = mask

        # init hybrid decoder
        self.hybrid_decoder = SequenceAttentionDecoder(
            num_classes=num_classes,
            rnn_layers=hybrid_decoder_rnn_layers,
            dim_input=dim_input,
            dim_model=dim_model,
            max_seq_len=max_len,
            start_idx=start_idx,
            mask=mask,
            padding_idx=padding_idx,
            dropout=hybrid_decoder_dropout,
            encode_value=encode_value,
            return_feature=True)

        # init position decoder
        self.position_decoder = PositionAttentionDecoder(
            num_classes=num_classes,
            rnn_layers=position_decoder_rnn_layers,
            dim_input=dim_input,
            dim_model=dim_model,
            max_seq_len=max_len,
            mask=mask,
            encode_value=encode_value,
            return_feature=True)

        self.fusion_module = RobustScannerFusionLayer(
            self.dim_model if encode_value else dim_input)

        pred_num_classes = num_classes
        self.prediction = nn.Linear(dim_model if encode_value else dim_input,
                                    pred_num_classes)

    def forward_train(self, feat, out_enc, target, valid_ratios,
                      word_positions):
        """
        Args:
            feat (Tensor): Tensor of shape :math:`(N, D_i, H, W)`.
            out_enc (Tensor): Encoder output of shape
                :math:`(N, D_m, H, W)`.
            target (Tensor): A tensor of shape :math:`(N, T)`. Each element
                is the index of a character.
            valid_ratios (Tensor): Valid length ratio of img.
            word_positions (Tensor): The position of each word.

        Returns:
            Tensor: A raw logit tensor of shape :math:`(N, T, C)`.
        """
        hybrid_glimpse = self.hybrid_decoder.forward_train(
            feat, out_enc, target, valid_ratios)
        position_glimpse = self.position_decoder.forward_train(
            feat, out_enc, target, valid_ratios, word_positions)

        fusion_out = self.fusion_module(hybrid_glimpse, position_glimpse)

        out = self.prediction(fusion_out)
        return out

    def forward_test(self, feat, out_enc, valid_ratios, word_positions):
        """
        Args:
            feat (Tensor): Tensor of shape :math:`(N, D_i, H, W)`.
            out_enc (Tensor): Encoder output of shape
                :math:`(N, D_m, H, W)`.
            valid_ratios (Tensor): Valid length ratio of img.
            word_positions (Tensor): The position of each word.

        Returns:
            Tensor: The output sequence tensor of shape :math:`(N, T, C)`
            after the softmax over classes.
        """
        seq_len = self.max_seq_len
        batch_size = feat.shape[0]

        decode_sequence = (torch.ones(
            (batch_size, seq_len), dtype=torch.int64, device=feat.device) *
                           self.start_idx)

        position_glimpse = self.position_decoder.forward_test(
            feat, out_enc, valid_ratios, word_positions)

        outputs = []
        for i in range(seq_len):
            hybrid_glimpse_step = self.hybrid_decoder.forward_test_step(
                feat, out_enc, decode_sequence, i, valid_ratios)

            fusion_out = self.fusion_module(hybrid_glimpse_step,
                                            position_glimpse[:, i, :])

            char_out = self.prediction(fusion_out)
            char_out = F.softmax(char_out, -1)
            outputs.append(char_out)
            max_idx = torch.argmax(char_out, dim=1, keepdim=False)
            if i < seq_len - 1:
                decode_sequence[:, i + 1] = max_idx
                if (decode_sequence == self.end_idx).any(dim=-1).all():
                    break

        outputs = torch.stack(outputs, 1)
        return outputs
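

if __name__ == '__main__':
    # Hedged smoke test (not part of the original module): run a greedy
    # inference pass of the full head with illustrative shapes. out_channels=93
    # assumes 90 characters + unknown + start + padding, mirroring the comment
    # in RobustScannerDecoder.__init__; masking is disabled so no `data` is
    # needed at test time.
    model = RobustScannerDecoder(out_channels=93, in_channels=512, mask=False)
    model.eval()
    backbone_feat = torch.randn(2, 512, 8, 32)  # (N, D_i, H, W)
    with torch.no_grad():
        preds = model(backbone_feat)
    print(preds.shape)  # (N, T, 93) with T <= max_len + 1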