vap_nod API

`maai.models.vap_nod`

`VapGPT_nod`

Bases: Module

Voice Activity Projection with Nodding prediction (VAP-Nod) model.

This model predicts both backchannels and nodding behaviors (e.g., short nod, long nod) during a conversation.

Source code in src/maai/models/vap_nod.py

class VapGPT_nod(nn.Module):
    """Voice Activity Projection with Nodding prediction (VAP-Nod) model.

    This model predicts both backchannels and nodding behaviors (e.g., short nod,
    long nod) during a conversation.

    Each speaker's embedded audio is passed through the same single-channel
    ``GPT`` stack; both results are then fed to a ``GPTStereo`` stack whose
    output drives the backchannel and nod heads read by ``forward``.
    """
    def __init__(self, conf: Optional[VapConfig] = None):
        """Initialize the VapGPT_nod model.

        Args:
            conf (Optional[VapConfig]): Configuration object.
                If None, default VapConfig is used.
        """
        super().__init__()
        if conf is None:
            conf = VapConfig()
        self.conf = conf
        self.sample_rate = conf.sample_rate
        self.frame_hz = conf.frame_hz

        # Not read anywhere in this class; kept because external code may use it.
        self.temp_elapse_time = []

        # Single channel: one stack, applied to each speaker in forward().
        self.self_attention = GPT(
            dim=conf.dim,
            dff_k=3,
            num_layers=conf.channel_layers,
            num_heads=conf.num_heads,
            dropout=conf.dropout,
            context_limit=conf.context_limit,
        )

        # Cross channel: consumes both single-channel outputs.
        self.cross_attention = GPTStereo(
            dim=conf.dim,
            dff_k=3,
            num_layers=conf.cross_layers,
            num_heads=conf.num_heads,
            dropout=conf.dropout,
            context_limit=conf.context_limit,
        )

        self.objective = ObjectiveVAP(bin_times=conf.bin_times, frame_hz=conf.frame_hz)

        # Outputs
        # Voice activity objective -> x1, x2 -> logits ->  BCE
        # NOTE(review): va_classifier, vap_head and the lid/lang layers below are
        # not read by forward() in this class -- presumably kept so that
        # checkpoints load with matching parameter names; confirm before removing.
        self.va_classifier = nn.Linear(conf.dim, 1)

        if self.conf.lid_classify == 1:
            self.lid_classifier = nn.Linear(conf.dim, conf.lid_classify_num_class)

        elif self.conf.lid_classify == 2:
            self.lid_classifier_middle = nn.Linear(conf.dim*2, conf.lid_classify_num_class)

        if self.conf.lang_cond == 1:
            self.lang_condition = nn.Linear(conf.lid_classify_num_class, conf.dim)

        self.vap_head = nn.Linear(conf.dim, self.objective.n_classes)

        # For Nodding
        # gt_head emits 4 logits; forward() reports indices 1-3 as short / long /
        # long_p nods (index 0 is presumably "no nod" -- TODO confirm).
        self.gt_head = nn.Linear(conf.dim, 4)
        self.bc_head = nn.Linear(conf.dim, 1)

    def load_encoder(self, cpc_model):
        """Load and build the audio encoders for both speakers.

        Both encoders are put in eval mode. If the encoder exposes an
        ``output_dim`` that differs from ``conf.dim``, a linear projection
        ``decrease_dimension`` is created to map encoder features to ``conf.dim``.

        Args:
            cpc_model: Pre-trained CPC model to be used as feature extractor.
        """
        self.encoder1 = build_audio_encoder(self.conf, cpc_model=cpc_model)
        self.encoder1 = self.encoder1.eval()
        self.encoder2 = build_audio_encoder(self.conf, cpc_model=cpc_model)
        self.encoder2 = self.encoder2.eval()

        # Encoders without an `output_dim` attribute are treated as already
        # producing conf.dim features.
        encoder_dim = getattr(self.encoder1, "output_dim", self.conf.dim)
        if encoder_dim != self.conf.dim:
            self.decrease_dimension = nn.Linear(encoder_dim, self.conf.dim)

        if self.conf.freeze_encoder == 1:
            print('freeze encoder')
            self.encoder1.freeze()
            self.encoder2.freeze()

    @property
    def horizon_time(self):
        """Get the horizon time for the projection in seconds.

        Returns:
            float: Horizon time for the objective.
        """
        return self.objective.horizon_time

    def encode_audio(self, audio1: torch.Tensor, audio2: torch.Tensor) -> Tuple[Tensor, Tensor]:
        """Encode the raw audio inputs into feature representations.

        Note: Channel swap is applied for temporal consistency, i.e.
        ``encoder1`` encodes ``audio2`` and ``encoder2`` encodes ``audio1``.

        Args:
            audio1 (torch.Tensor): Audio waveform for speaker 1 (User).
            audio2 (torch.Tensor): Audio waveform for speaker 2 (System).

        Returns:
            Tuple[Tensor, Tensor]: ``(x1, x2)`` where ``x1`` comes from
            ``encoder1(audio2)`` and ``x2`` from ``encoder2(audio1)``, each
            projected by ``decrease_dimension`` when that layer exists.
        """

        # Channel swap for temporal consistency: the inputs are deliberately
        # crossed, so x1 carries audio2 and x2 carries audio1.
        x1 = self.encoder1(audio2)
        x2 = self.encoder2(audio1)

        # The projection only exists when load_encoder() found a dimension mismatch.
        if hasattr(self, "decrease_dimension"):
            x1 = self.decrease_dimension(x1)
            x2 = self.decrease_dimension(x2)

        return x1, x2

    def vad_loss(self, vad_output, vad):
        """Compute the Voice Activity Detection (VAD) loss.

        Args:
            vad_output: Predicted VAD logits.
            vad: Ground truth VAD labels.

        Returns:
            Tensor: Binary cross-entropy loss between predictions and targets.
        """
        return F.binary_cross_entropy_with_logits(vad_output, vad)

    def forward(
        self,
        x1: Tensor,
        x2: Tensor,
        cache: Optional[dict] = None,
    ) -> Tuple[dict, dict]:
        """
        Forward pass for the VapGPT_nod model.

        Only the last frame of the first batch item is reported; the returned
        probabilities are plain Python floats.

        Args:
            x1 (Tensor): Input audio embedded tensor for speaker 1.
            x2 (Tensor): Input audio embedded tensor for speaker 2.
            cache (dict, optional): Cache of past keys/values, as returned by a
                previous call. Missing entries are passed on as None.

        Returns:
            Tuple[dict, dict]: Model outputs (keys ``p_bc``, ``p_nod_short``,
            ``p_nod_long``, ``p_nod_long_p``) and the updated cache.
        """

        if cache is None:
            cache = {}

        o1 = self.self_attention(x1, past_kv=cache.get("ar1"))
        o2 = self.self_attention(x2, past_kv=cache.get("ar2"))
        out = self.cross_attention(
            o1["x"],
            o2["x"],
            past_kv1=cache.get("cross1"),
            past_kv2=cache.get("cross2"),
            past_kv1_c=cache.get("cross1_c"),
            past_kv2_c=cache.get("cross2_c"),
        )

        new_cache = {
            "ar1": (o1["past_k"], o1["past_v"]),
            "ar2": (o2["past_k"], o2["past_v"]),
            "cross1": (out["past_k1"], out["past_v1"]),
            "cross2": (out["past_k2"], out["past_v2"]),
            "cross1_c": (out["past_k1_c"], out["past_v1_c"]),
            "cross2_c": (out["past_k2_c"], out["past_v2_c"]),
        }

        # Slice out the one frame that is reported *before* the activation and
        # the host transfer, so only a handful of values are copied to the CPU
        # instead of the full head outputs.
        # Assumes head outputs are (batch, frames, channels) -- TODO confirm.
        bc_logit = self.bc_head(out["x"])[0, -1, 0]
        nod_logits = self.gt_head(out["x"])[0, -1]

        p_bc = bc_logit.sigmoid().to("cpu").tolist()
        nod_ = nod_logits.softmax(dim=-1).to("cpu").tolist()
        p_nod_short = nod_[1]
        p_nod_long = nod_[2]
        p_nod_long_p = nod_[3]

        ret = {
            "p_bc": p_bc,
            "p_nod_short": p_nod_short,
            "p_nod_long": p_nod_long,
            "p_nod_long_p": p_nod_long_p,
        }

        return ret, new_cache

`horizon_time` `property`

Get the horizon time for the projection in seconds.

Returns:

Type	Description
`float`	Horizon time for the objective.

`__init__(conf=None)`

Initialize the VapGPT_nod model.

Parameters:

Name	Type	Description	Default
`conf`	`Optional[VapConfig]`	Configuration object. If None, default VapConfig is used.	`None`

Source code in src/maai/models/vap_nod.py

def __init__(self, conf: Optional[VapConfig] = None):
    """Build the VAP-Nod network from a configuration.

    Creates, in order: the single-channel ``GPT`` stack, the cross-channel
    ``GPTStereo`` stack, the ``ObjectiveVAP`` objective, and the linear
    output heads.

    Args:
        conf (Optional[VapConfig]): Configuration object.
            A default ``VapConfig()`` is created when None is given.
    """
    super().__init__()
    conf = VapConfig() if conf is None else conf

    self.conf = conf
    self.sample_rate = conf.sample_rate
    self.frame_hz = conf.frame_hz

    self.temp_elapse_time = []

    # Settings common to both transformer stacks; only the depth differs.
    shared_kwargs = dict(
        dim=conf.dim,
        dff_k=3,
        num_heads=conf.num_heads,
        dropout=conf.dropout,
        context_limit=conf.context_limit,
    )

    # Single channel
    self.self_attention = GPT(num_layers=conf.channel_layers, **shared_kwargs)

    # Cross channel
    self.cross_attention = GPTStereo(num_layers=conf.cross_layers, **shared_kwargs)

    self.objective = ObjectiveVAP(bin_times=conf.bin_times, frame_hz=conf.frame_hz)

    # Output heads. Creation order is kept stable because it determines the
    # order in which parameters are initialised and registered.
    # Voice activity objective -> x1, x2 -> logits -> BCE
    self.va_classifier = nn.Linear(conf.dim, 1)

    # Optional language-ID classifier; mode 2 takes a 2*dim wide input.
    lid_mode = conf.lid_classify
    num_lid_classes = conf.lid_classify_num_class
    if lid_mode == 1:
        self.lid_classifier = nn.Linear(conf.dim, num_lid_classes)
    elif lid_mode == 2:
        self.lid_classifier_middle = nn.Linear(conf.dim * 2, num_lid_classes)

    # Optional language conditioning: class vector -> model dimension.
    if conf.lang_cond == 1:
        self.lang_condition = nn.Linear(num_lid_classes, conf.dim)

    self.vap_head = nn.Linear(conf.dim, self.objective.n_classes)

    # Nodding heads: 4 nod logits and 1 backchannel logit.
    self.gt_head = nn.Linear(conf.dim, 4)
    self.bc_head = nn.Linear(conf.dim, 1)

`encode_audio(audio1, audio2)`

Encode the raw audio inputs into feature representations.

Note: Channel swap is applied for temporal consistency: `encoder1` encodes `audio2` and `encoder2` encodes `audio1`.

Parameters:

Name	Type	Description	Default
`audio1`	`Tensor`	Audio waveform for speaker 1 (User).	required
`audio2`	`Tensor`	Audio waveform for speaker 2 (System).	required

Returns:

Type	Description
`Tuple[Tensor, Tensor]`	Encoded features for the two speakers.

Source code in src/maai/models/vap_nod.py

def encode_audio(self, audio1: torch.Tensor, audio2: torch.Tensor) -> Tuple[Tensor, Tensor]:
    """Turn the two raw waveforms into encoder feature tensors.

    Note: the channels are swapped for temporal consistency, so ``encoder1``
    receives ``audio2`` and ``encoder2`` receives ``audio1``.

    Args:
        audio1 (torch.Tensor): Audio waveform for speaker 1 (User).
        audio2 (torch.Tensor): Audio waveform for speaker 2 (System).

    Returns:
        Tuple[Tensor, Tensor]: Encoded features for the two speakers, passed
        through ``decrease_dimension`` when that layer is present.
    """
    # Channel swap for temporal consistency (inputs are crossed on purpose).
    feats_first = self.encoder1(audio2)
    feats_second = self.encoder2(audio1)

    # Without a projection layer the encoder outputs are returned untouched.
    if not hasattr(self, "decrease_dimension"):
        return feats_first, feats_second

    return (
        self.decrease_dimension(feats_first),
        self.decrease_dimension(feats_second),
    )

`forward(x1, x2, cache=None)`

Forward pass for the VapGPT_nod model.

Parameters:

Name	Type	Description	Default
`x1`	`Tensor`	Input audio embedded tensor for speaker 1.	required
`x2`	`Tensor`	Input audio embedded tensor for speaker 2.	required
`cache`	`dict`	Cache of past keys/values.	`None`

Returns:

Type	Description
`Tuple[dict, dict]`	Model outputs and updated cache.

Source code in src/maai/models/vap_nod.py

def forward(
    self,
    x1: Tensor,
    x2: Tensor,
    cache: Optional[dict] = None,
) -> Tuple[dict, dict]:
    """
    Forward pass for the VapGPT_nod model.

    Only the last frame of the first batch item is reported; the returned
    probabilities are plain Python floats.

    Args:
        x1 (Tensor): Input audio embedded tensor for speaker 1.
        x2 (Tensor): Input audio embedded tensor for speaker 2.
        cache (dict, optional): Cache of past keys/values, as returned by a
            previous call. Missing entries are passed on as None.

    Returns:
        Tuple[dict, dict]: Model outputs (keys ``p_bc``, ``p_nod_short``,
        ``p_nod_long``, ``p_nod_long_p``) and the updated cache.
    """

    if cache is None:
        cache = {}

    # The same single-channel stack processes each speaker with its own cache.
    o1 = self.self_attention(x1, past_kv=cache.get("ar1"))
    o2 = self.self_attention(x2, past_kv=cache.get("ar2"))
    out = self.cross_attention(
        o1["x"],
        o2["x"],
        past_kv1=cache.get("cross1"),
        past_kv2=cache.get("cross2"),
        past_kv1_c=cache.get("cross1_c"),
        past_kv2_c=cache.get("cross2_c"),
    )

    new_cache = {
        "ar1": (o1["past_k"], o1["past_v"]),
        "ar2": (o2["past_k"], o2["past_v"]),
        "cross1": (out["past_k1"], out["past_v1"]),
        "cross2": (out["past_k2"], out["past_v2"]),
        "cross1_c": (out["past_k1_c"], out["past_v1_c"]),
        "cross2_c": (out["past_k2_c"], out["past_v2_c"]),
    }

    # Slice out the one frame that is reported *before* the activation and the
    # host transfer, so only a handful of values are copied to the CPU instead
    # of the full head outputs.
    # Assumes head outputs are (batch, frames, channels) -- TODO confirm.
    bc_logit = self.bc_head(out["x"])[0, -1, 0]
    nod_logits = self.gt_head(out["x"])[0, -1]

    p_bc = bc_logit.sigmoid().to("cpu").tolist()
    nod_ = nod_logits.softmax(dim=-1).to("cpu").tolist()
    p_nod_short = nod_[1]
    p_nod_long = nod_[2]
    p_nod_long_p = nod_[3]

    ret = {
        "p_bc": p_bc,
        "p_nod_short": p_nod_short,
        "p_nod_long": p_nod_long,
        "p_nod_long_p": p_nod_long_p,
    }

    return ret, new_cache

`load_encoder(cpc_model)`

Load and build the audio encoders for both speakers.

Parameters:

Name	Type	Description	Default
`cpc_model`		Pre-trained CPC model to be used as feature extractor.	required

Source code in src/maai/models/vap_nod.py

def load_encoder(self, cpc_model):
    """Create one audio encoder per speaker and put both in eval mode.

    When the encoder reports an ``output_dim`` different from ``conf.dim``,
    a linear layer ``decrease_dimension`` is added to map encoder features
    to ``conf.dim``. With ``conf.freeze_encoder == 1`` both encoders are
    frozen via their ``freeze()`` method.

    Args:
        cpc_model: Pre-trained CPC model to be used as feature extractor.
    """
    model_dim = self.conf.dim

    self.encoder1 = build_audio_encoder(self.conf, cpc_model=cpc_model).eval()
    self.encoder2 = build_audio_encoder(self.conf, cpc_model=cpc_model).eval()

    # An encoder without `output_dim` is taken to already emit model_dim features.
    out_dim = getattr(self.encoder1, "output_dim", model_dim)
    if out_dim != model_dim:
        self.decrease_dimension = nn.Linear(out_dim, model_dim)

    if self.conf.freeze_encoder == 1:
        print('freeze encoder')
        for encoder in (self.encoder1, self.encoder2):
            encoder.freeze()

`vad_loss(vad_output, vad)`

Compute the Voice Activity Detection (VAD) loss.

Parameters:

Name	Type	Description	Default
`vad_output`		Predicted VAD logits.	required
`vad`		Ground truth VAD labels.	required

Returns:

Type	Description
`Tensor`	Binary cross-entropy loss between predictions and targets.

Source code in src/maai/models/vap_nod.py

def vad_loss(self, vad_output, vad):
    """Return the binary cross-entropy between VAD logits and VAD labels.

    Args:
        vad_output: Predicted VAD logits (raw scores, no sigmoid applied).
        vad: Ground truth VAD labels.

    Returns:
        Tensor: Binary cross-entropy loss between predictions and targets.
    """
    # The *_with_logits variant applies the sigmoid internally.
    loss = F.binary_cross_entropy_with_logits(input=vad_output, target=vad)
    return loss

vap_nod API

maai.models.vap_nod

VapGPT_nod

horizon_time property

__init__(conf=None)

encode_audio(audio1, audio2)

forward(x1, x2, cache=None)

load_encoder(cpc_model)

vad_loss(vad_output, vad)

`maai.models.vap_nod`

`VapGPT_nod`

`horizon_time` `property`

`__init__(conf=None)`

`encode_audio(audio1, audio2)`

`forward(x1, x2, cache=None)`

`load_encoder(cpc_model)`

`vad_loss(vad_output, vad)`