mirror of
https://github.com/classilla/tenfourfox.git
synced 2024-06-14 15:30:07 +00:00
311 lines
9.6 KiB
C
311 lines
9.6 KiB
C
/* -*- c-basic-offset: 4; indent-tabs-mode: nil -*- */
|
|
/* ====================================================================
|
|
* Copyright (c) 2010 Carnegie Mellon University. All rights
|
|
* reserved.
|
|
*
|
|
* Redistribution and use in source and binary forms, with or without
|
|
* modification, are permitted provided that the following conditions
|
|
* are met:
|
|
*
|
|
* 1. Redistributions of source code must retain the above copyright
|
|
* notice, this list of conditions and the following disclaimer.
|
|
*
|
|
* 2. Redistributions in binary form must reproduce the above copyright
|
|
* notice, this list of conditions and the following disclaimer in
|
|
* the documentation and/or other materials provided with the
|
|
* distribution.
|
|
*
|
|
* This work was supported in part by funding from the Defense Advanced
|
|
* Research Projects Agency and the National Science Foundation of the
|
|
* United States of America, and the CMU Sphinx Speech Consortium.
|
|
*
|
|
* THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND
|
|
* ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
|
|
* THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
|
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
|
|
* NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
|
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
|
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
|
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
|
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
|
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
|
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
*
|
|
* ====================================================================
|
|
*
|
|
*/
|
|
|
|
/**
|
|
* @file state_align_search.c State (and phone and word) alignment search.
|
|
*/
|
|
|
|
#include "state_align_search.h"
|
|
|
|
static int
|
|
state_align_search_start(ps_search_t *search)
|
|
{
|
|
state_align_search_t *sas = (state_align_search_t *)search;
|
|
|
|
/* Activate the initial state. */
|
|
hmm_enter(sas->hmms, 0, 0, 0);
|
|
|
|
return 0;
|
|
}
|
|
|
|
static void
|
|
renormalize_hmms(state_align_search_t *sas, int frame_idx, int32 norm)
|
|
{
|
|
int i;
|
|
for (i = 0; i < sas->n_phones; ++i)
|
|
hmm_normalize(sas->hmms + i, norm);
|
|
}
|
|
|
|
static int32
|
|
evaluate_hmms(state_align_search_t *sas, int16 const *senscr, int frame_idx)
|
|
{
|
|
int32 bs = WORST_SCORE;
|
|
int i;
|
|
|
|
hmm_context_set_senscore(sas->hmmctx, senscr);
|
|
|
|
for (i = 0; i < sas->n_phones; ++i) {
|
|
hmm_t *hmm = sas->hmms + i;
|
|
int32 score;
|
|
|
|
if (hmm_frame(hmm) < frame_idx)
|
|
continue;
|
|
score = hmm_vit_eval(hmm);
|
|
if (score BETTER_THAN bs) {
|
|
bs = score;
|
|
}
|
|
}
|
|
return bs;
|
|
}
|
|
|
|
static void
|
|
prune_hmms(state_align_search_t *sas, int frame_idx)
|
|
{
|
|
int nf = frame_idx + 1;
|
|
int i;
|
|
|
|
/* Check all phones to see if they remain active in the next frame. */
|
|
for (i = 0; i < sas->n_phones; ++i) {
|
|
hmm_t *hmm = sas->hmms + i;
|
|
if (hmm_frame(hmm) < frame_idx)
|
|
continue;
|
|
hmm_frame(hmm) = nf;
|
|
}
|
|
}
|
|
|
|
static void
|
|
phone_transition(state_align_search_t *sas, int frame_idx)
|
|
{
|
|
int nf = frame_idx + 1;
|
|
int i;
|
|
|
|
for (i = 0; i < sas->n_phones - 1; ++i) {
|
|
hmm_t *hmm, *nhmm;
|
|
int32 newphone_score;
|
|
|
|
hmm = sas->hmms + i;
|
|
if (hmm_frame(hmm) != nf)
|
|
continue;
|
|
|
|
newphone_score = hmm_out_score(hmm);
|
|
/* Transition into next phone using the usual Viterbi rule. */
|
|
nhmm = hmm + 1;
|
|
if (hmm_frame(nhmm) < frame_idx
|
|
|| newphone_score BETTER_THAN hmm_in_score(nhmm)) {
|
|
hmm_enter(nhmm, newphone_score, hmm_out_history(hmm), nf);
|
|
}
|
|
}
|
|
}
|
|
|
|
#define TOKEN_STEP 20
|
|
static void
|
|
extend_tokenstack(state_align_search_t *sas, int frame_idx)
|
|
{
|
|
if (frame_idx >= sas->n_fr_alloc) {
|
|
sas->n_fr_alloc = frame_idx + TOKEN_STEP + 1;
|
|
sas->tokens = ckd_realloc(sas->tokens,
|
|
sas->n_emit_state * sas->n_fr_alloc
|
|
* sizeof(*sas->tokens));
|
|
}
|
|
memset(sas->tokens + frame_idx * sas->n_emit_state, 0xff,
|
|
sas->n_emit_state * sizeof(*sas->tokens));
|
|
}
|
|
|
|
static void
|
|
record_transitions(state_align_search_t *sas, int frame_idx)
|
|
{
|
|
uint16 *tokens;
|
|
int i;
|
|
|
|
/* Push another frame of tokens on the stack. */
|
|
extend_tokenstack(sas, frame_idx);
|
|
tokens = sas->tokens + frame_idx * sas->n_emit_state;
|
|
|
|
/* Scan all active HMMs */
|
|
for (i = 0; i < sas->n_phones; ++i) {
|
|
hmm_t *hmm = sas->hmms + i;
|
|
int j;
|
|
|
|
if (hmm_frame(hmm) < frame_idx)
|
|
continue;
|
|
for (j = 0; j < sas->hmmctx->n_emit_state; ++j) {
|
|
int state_idx = i * sas->hmmctx->n_emit_state + j;
|
|
/* Record their backpointers on the token stack. */
|
|
tokens[state_idx] = hmm_history(hmm, j);
|
|
/* Update backpointer fields with state index. */
|
|
hmm_history(hmm, j) = state_idx;
|
|
}
|
|
}
|
|
}
|
|
|
|
static int
|
|
state_align_search_step(ps_search_t *search, int frame_idx)
|
|
{
|
|
state_align_search_t *sas = (state_align_search_t *)search;
|
|
acmod_t *acmod = ps_search_acmod(search);
|
|
int16 const *senscr;
|
|
int i;
|
|
|
|
/* Calculate senone scores. */
|
|
for (i = 0; i < sas->n_phones; ++i)
|
|
acmod_activate_hmm(acmod, sas->hmms + i);
|
|
senscr = acmod_score(acmod, &frame_idx);
|
|
|
|
/* Renormalize here if needed. */
|
|
/* FIXME: Make sure to (unit-)test this!!! */
|
|
if ((sas->best_score - 0x300000) WORSE_THAN WORST_SCORE) {
|
|
E_INFO("Renormalizing Scores at frame %d, best score %d\n",
|
|
frame_idx, sas->best_score);
|
|
renormalize_hmms(sas, frame_idx, sas->best_score);
|
|
}
|
|
|
|
/* Viterbi step. */
|
|
sas->best_score = evaluate_hmms(sas, senscr, frame_idx);
|
|
prune_hmms(sas, frame_idx);
|
|
|
|
/* Transition out of non-emitting states. */
|
|
phone_transition(sas, frame_idx);
|
|
|
|
/* Generate new tokens from best path results. */
|
|
record_transitions(sas, frame_idx);
|
|
|
|
/* Update frame counter */
|
|
sas->frame = frame_idx;
|
|
|
|
return 0;
|
|
}
|
|
|
|
static int
|
|
state_align_search_finish(ps_search_t *search)
|
|
{
|
|
state_align_search_t *sas = (state_align_search_t *)search;
|
|
hmm_t *final_phone = sas->hmms + sas->n_phones - 1;
|
|
ps_alignment_iter_t *itor;
|
|
ps_alignment_entry_t *ent;
|
|
int next_state, next_start, state, frame;
|
|
|
|
/* Best state exiting the last frame. */
|
|
next_state = state = hmm_out_history(final_phone);
|
|
if (state == 0xffff) {
|
|
E_ERROR("Failed to reach final state in alignment\n");
|
|
return -1;
|
|
}
|
|
itor = ps_alignment_states(sas->al);
|
|
next_start = sas->frame + 1;
|
|
for (frame = sas->frame - 1; frame >= 0; --frame) {
|
|
state = sas->tokens[frame * sas->n_emit_state + state];
|
|
/* State boundary, update alignment entry for next state. */
|
|
if (state != next_state) {
|
|
itor = ps_alignment_iter_goto(itor, next_state);
|
|
assert(itor != NULL);
|
|
ent = ps_alignment_iter_get(itor);
|
|
ent->start = frame + 1;
|
|
ent->duration = next_start - ent->start;
|
|
E_DEBUG(1,("state %d start %d end %d\n", next_state,
|
|
ent->start, next_start));
|
|
next_state = state;
|
|
next_start = frame + 1;
|
|
}
|
|
}
|
|
/* Update alignment entry for initial state. */
|
|
itor = ps_alignment_iter_goto(itor, 0);
|
|
assert(itor != NULL);
|
|
ent = ps_alignment_iter_get(itor);
|
|
ent->start = 0;
|
|
ent->duration = next_start;
|
|
E_DEBUG(1,("state %d start %d end %d\n", 0,
|
|
ent->start, next_start));
|
|
ps_alignment_iter_free(itor);
|
|
ps_alignment_propagate(sas->al);
|
|
|
|
return 0;
|
|
}
|
|
|
|
static int
|
|
state_align_search_reinit(ps_search_t *search, dict_t *dict, dict2pid_t *d2p)
|
|
{
|
|
/* This does nothing. */
|
|
return 0;
|
|
}
|
|
|
|
static void
|
|
state_align_search_free(ps_search_t *search)
|
|
{
|
|
state_align_search_t *sas = (state_align_search_t *)search;
|
|
ps_search_deinit(search);
|
|
ckd_free(sas->hmms);
|
|
ckd_free(sas->tokens);
|
|
hmm_context_free(sas->hmmctx);
|
|
ckd_free(sas);
|
|
}
|
|
|
|
static ps_searchfuncs_t state_align_search_funcs = {
|
|
/* name: */ "state_align",
|
|
/* start: */ state_align_search_start,
|
|
/* step: */ state_align_search_step,
|
|
/* finish: */ state_align_search_finish,
|
|
/* reinit: */ state_align_search_reinit,
|
|
/* free: */ state_align_search_free,
|
|
/* lattice: */ NULL,
|
|
/* hyp: */ NULL,
|
|
/* prob: */ NULL,
|
|
/* seg_iter: */ NULL,
|
|
};
|
|
|
|
ps_search_t *
|
|
state_align_search_init(cmd_ln_t *config,
|
|
acmod_t *acmod,
|
|
ps_alignment_t *al)
|
|
{
|
|
state_align_search_t *sas;
|
|
ps_alignment_iter_t *itor;
|
|
hmm_t *hmm;
|
|
|
|
sas = ckd_calloc(1, sizeof(*sas));
|
|
ps_search_init(ps_search_base(sas), &state_align_search_funcs,
|
|
config, acmod, al->d2p->dict, al->d2p);
|
|
sas->hmmctx = hmm_context_init(bin_mdef_n_emit_state(acmod->mdef),
|
|
acmod->tmat->tp, NULL, acmod->mdef->sseq);
|
|
if (sas->hmmctx == NULL) {
|
|
ckd_free(sas);
|
|
return NULL;
|
|
}
|
|
sas->al = al;
|
|
|
|
/* Generate HMM vector from phone level of alignment. */
|
|
sas->n_phones = ps_alignment_n_phones(al);
|
|
sas->n_emit_state = ps_alignment_n_states(al);
|
|
sas->hmms = ckd_calloc(sas->n_phones, sizeof(*sas->hmms));
|
|
for (hmm = sas->hmms, itor = ps_alignment_phones(al); itor;
|
|
++hmm, itor = ps_alignment_iter_next(itor)) {
|
|
ps_alignment_entry_t *ent = ps_alignment_iter_get(itor);
|
|
hmm_init(sas->hmmctx, hmm, FALSE,
|
|
ent->id.pid.ssid, ent->id.pid.tmatid);
|
|
}
|
|
return ps_search_base(sas);
|
|
}
|