This is merely a historical archive of years 2008-2021, before the migration to mailman3.
A maintained and still updated list archive can be found at https://lists.osmocom.org/hyperkitty/list/OpenBSC@lists.osmocom.org/.
Sylvain Munaut 246tnt at gmail.com
Hi Thomas,
I've just only seen this. I'll try to test and comment on them ASAP.
Some quick things I saw from a quick glance though:
- Some things that should be 'static' are not and they pollute the
global namespace
- Anything that is in the global namespace needs the osmo_ (or even
osmo_conv_ here) prefix
- Did you check (and test in make test) that it works for all codes
that match
"if ((code->N <= 4) && ((code->K == 5) || (code->K == 7)))" ?
- The viterbi_gen file seems fairly easy to generate, maybe in the
future it'd be worth autogenerating it
Also, to avoid exporting those internal symbols, I'm wondering if
#including the viterbi_gen.c from viterbi.c instead of compiling
separately wouldn't be better. But that's just a random idea ...
> A beneficial change would be to modify the API to allow persistent
> decoder objects instead of performing an allocation and tear-down on
> every decoding call. From a performance standpoint, the current
> implementation is unnecessarily restricted by memory access and page
> faults for that reason.
You can't change the existing API without breaking existing stuff, but you can
add a new all-in-one function that allows to give a persistent decoder object.
like osmo_conv_decode_persist or whatever you want to call it.
Cheers,
Sylvain
On Tue, Apr 29, 2014 at 6:12 AM, Thomas Tsou <tom at tsou.cc> wrote:
> Add a separate, faster convolution decoding implementation for rates
> up to N=4 and constraint lengths of K=5 and K=7, which covers the
> most GSM code uses. The decoding algorithm exploits the symmetric
> structure of the Viterbi add-compare-select (ACS) operation - commonly
> known as the ACS butterfly. This shift-register optimization can be
> found in the well-known text by Dave Forney.
>
> Forney, G.D., "The Viterbi Algorithm," Proc. of the IEEE, March 1973.
>
> Implementation is non-architecture specific and improves performance on
> x86 as well as ARM processors. Existing API is unchanged with optimized
> code being called internally for supported codes.
>
> Signed-off-by: Thomas Tsou <tom at tsou.cc>
> ---
> src/Makefile.am | 3 +-
> src/conv.c | 9 +
> src/viterbi.c | 603 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
> src/viterbi_gen.c | 182 ++++++++++++++++
> 4 files changed, 796 insertions(+), 1 deletion(-)
> create mode 100644 src/viterbi.c
> create mode 100644 src/viterbi_gen.c
>
> diff --git a/src/Makefile.am b/src/Makefile.am
> index e68c29a..262a4e6 100644
> --- a/src/Makefile.am
> +++ b/src/Makefile.am
> @@ -13,7 +13,8 @@ libosmocore_la_SOURCES = timer.c select.c signal.c msgb.c bits.c \
> logging.c logging_syslog.c rate_ctr.c \
> gsmtap_util.c crc16.c panic.c backtrace.c \
> conv.c application.c rbtree.c strrb.c \
> - loggingrb.c crc8gen.c crc16gen.c crc32gen.c crc64gen.c
> + loggingrb.c crc8gen.c crc16gen.c crc32gen.c crc64gen.c \
> + viterbi.c viterbi_gen.c
>
> BUILT_SOURCES = crc8gen.c crc16gen.c crc32gen.c crc64gen.c
>
> diff --git a/src/conv.c b/src/conv.c
> index f13deef..79b3a7c 100644
> --- a/src/conv.c
> +++ b/src/conv.c
> @@ -238,6 +238,11 @@ osmo_conv_encode(const struct osmo_conv_code *code,
>
> #define MAX_AE 0x00ffffff
>
> +/* Forward declaration for accelerated decoding with certain codes */
> +int
> +osmo_conv_decode_acc(const struct osmo_conv_code *code,
> + const sbit_t *input, ubit_t *output);
> +
> void
> osmo_conv_decode_init(struct osmo_conv_decoder *decoder,
> const struct osmo_conv_code *code, int len, int start_state)
> @@ -606,6 +611,10 @@ osmo_conv_decode(const struct osmo_conv_code *code,
> struct osmo_conv_decoder decoder;
> int rv, l;
>
> + /* Use accelerated implementation for supported codes */
> + if ((code->N <= 4) && ((code->K == 5) || (code->K == 7)))
> + return osmo_conv_decode_acc(code, input, output);
> +
> osmo_conv_decode_init(&decoder, code, 0, 0);
>
> if (code->term == CONV_TERM_TAIL_BITING) {
> diff --git a/src/viterbi.c b/src/viterbi.c
> new file mode 100644
> index 0000000..db8d5c8
> --- /dev/null
> +++ b/src/viterbi.c
> @@ -0,0 +1,603 @@
> +/*
> + * Viterbi decoder
> + * Copyright (C) 2013, 2014 Thomas Tsou <tom at tsou.cc>
> + *
> + * This library is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * This library is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with this library; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> + */
> +
> +#include "config.h"
> +
> +#include <stdlib.h>
> +#include <malloc.h>
> +#include <string.h>
> +#include <errno.h>
> +#include <osmocom/core/conv.h>
> +
> +/* Forward Metric Units */
> +void gen_metrics_k5_n2(const int8_t *seq, const int16_t *out,
> + int16_t *sums, int16_t *paths, int norm);
> +void gen_metrics_k5_n3(const int8_t *seq, const int16_t *out,
> + int16_t *sums, int16_t *paths, int norm);
> +void gen_metrics_k5_n4(const int8_t *seq, const int16_t *out,
> + int16_t *sums, int16_t *paths, int norm);
> +void gen_metrics_k7_n2(const int8_t *seq, const int16_t *out,
> + int16_t *sums, int16_t *paths, int norm);
> +void gen_metrics_k7_n3(const int8_t *seq, const int16_t *out,
> + int16_t *sums, int16_t *paths, int norm);
> +void gen_metrics_k7_n4(const int8_t *seq, const int16_t *out,
> + int16_t *sums, int16_t *paths, int norm);
> +/* Trellis State
> + * state - Internal lshift register value
> + * prev - Register values of previous 0 and 1 states
> + */
> +struct vstate {
> + unsigned state;
> + unsigned prev[2];
> +};
> +
> +/* Trellis Object
> + * num_states - Number of states in the trellis
> + * sums - Accumulated path metrics
> + * outputs - Trellis output values
> + * vals - Input value that led to each state
> + */
> +struct vtrellis {
> + int num_states;
> + int16_t *sums;
> + int16_t *outputs;
> + uint8_t *vals;
> +};
> +
> +/* Viterbi Decoder
> + * n - Code order
> + * k - Constraint length
> + * len - Horizontal length of trellis
> + * recursive - Set to '1' if the code is recursive
> + * intrvl - Normalization interval
> + * trellis - Trellis object
> + * punc - Puncturing sequence
> + * paths - Trellis paths
> + */
> +struct vdecoder {
> + int n;
> + int k;
> + int len;
> + int recursive;
> + int intrvl;
> + struct vtrellis *trellis;
> + int *punc;
> + int16_t **paths;
> +
> + void (*metric_func)(const int8_t *, const int16_t *,
> + int16_t *, int16_t *, int);
> +};
> +
> +/* Aligned Memory Allocator
> + * SSE requires 16-byte memory alignment. We store relevant trellis values
> + * (accumulated sums, outputs, and path decisions) as 16 bit signed integers
> + * so the allocated memory is casted as such.
> + */
> +#define SSE_ALIGN 16
> +
> +static int16_t *vdec_malloc(size_t n)
> +{
> +#ifdef HAVE_SSE3
> + return (int16_t *) memalign(SSE_ALIGN, sizeof(int16_t) * n);
> +#else
> + return (int16_t *) malloc(sizeof(int16_t) * n);
> +#endif
> +}
> +
> +/* Accessor calls */
> +inline int conv_code_recursive(const struct osmo_conv_code *code)
> +{
> + return code->next_term_output ? 1 : 0;
> +}
> +
> +/* Left shift and mask for finding the previous state */
> +static unsigned vstate_lshift(unsigned reg, int k, int val)
> +{
> + unsigned mask;
> +
> + if (k == 5)
> + mask = 0x0e;
> + else if (k == 7)
> + mask = 0x3e;
> + else
> + mask = 0;
> +
> + return ((reg << 1) & mask) | val;
> +}
> +
> +/* Bit endian manipulators */
> +inline unsigned bitswap2(unsigned v)
> +{
> + return ((v & 0x02) >> 1) | ((v & 0x01) << 1);
> +}
> +
> +inline unsigned bitswap3(unsigned v)
> +{
> + return ((v & 0x04) >> 2) | ((v & 0x02) >> 0) |
> + ((v & 0x01) << 2);
> +}
> +
> +inline unsigned bitswap4(unsigned v)
> +{
> + return ((v & 0x08) >> 3) | ((v & 0x04) >> 1) |
> + ((v & 0x02) << 1) | ((v & 0x01) << 3);
> +}
> +
> +inline unsigned bitswap5(unsigned v)
> +{
> + return ((v & 0x10) >> 4) | ((v & 0x08) >> 2) | ((v & 0x04) >> 0) |
> + ((v & 0x02) << 2) | ((v & 0x01) << 4);
> +}
> +
> +inline unsigned bitswap6(unsigned v)
> +{
> + return ((v & 0x20) >> 5) | ((v & 0x10) >> 3) | ((v & 0x08) >> 1) |
> + ((v & 0x04) << 1) | ((v & 0x02) << 3) | ((v & 0x01) << 5);
> +}
> +
> +static unsigned bitswap(unsigned v, unsigned n)
> +{
> + switch (n) {
> + case 1:
> + return v;
> + case 2:
> + return bitswap2(v);
> + case 3:
> + return bitswap3(v);
> + case 4:
> + return bitswap4(v);
> + case 5:
> + return bitswap5(v);
> + case 6:
> + return bitswap6(v);
> + default:
> + break;
> + }
> +
> + return 0;
> +}
> +
> +/* Generate non-recursive state output from generator state table
> + * Note that the shift register moves right (i.e. the most recent bit is
> + * shifted into the register at k-1 bit of the register), which is typical
> + * textbook representation. The API transition table expects the most recent
> + * bit in the low order bit, or left shift. A bitswap operation is required
> + * to accommodate the difference.
> + */
> +static unsigned gen_output(struct vstate *state, int val,
> + const struct osmo_conv_code *code)
> +{
> + unsigned out, prev;
> +
> + prev = bitswap(state->prev[0], code->K - 1);
> + out = code->next_output[prev][val];
> + out = bitswap(out, code->N);
> +
> + return out;
> +}
> +
> +#define BIT2NRZ(REG,N) (((REG >> N) & 0x01) * 2 - 1) * -1
> +
> +/* Populate non-recursive trellis state
> + * For a given state defined by the k-1 length shift register, find the
> + * value of the input bit that drove the trellis to that state. Also
> + * generate the N outputs of the generator polynomial at that state.
> + */
> +static int gen_state_info(uint8_t *val, unsigned reg,
> + int16_t *output, const struct osmo_conv_code *code)
> +{
> + int i;
> + unsigned out;
> + struct vstate state;
> +
> + /* Previous '0' state */
> + state.state = reg;
> + state.prev[0] = vstate_lshift(reg, code->K, 0);
> + state.prev[1] = vstate_lshift(reg, code->K, 1);
> +
> + *val = (reg >> (code->K - 2)) & 0x01;
> +
> + /* Transition output */
> + out = gen_output(&state, *val, code);
> +
> + /* Unpack to NRZ */
> + for (i = 0; i < code->N; i++)
> + output[i] = BIT2NRZ(out, i);
> +
> + return 0;
> +}
> +
> +/* Generate recursive state output from generator state table */
> +static unsigned gen_recursive_output(struct vstate *state,
> + uint8_t *val, unsigned reg,
> + const struct osmo_conv_code *code, int pos)
> +{
> + int val0, val1;
> + unsigned out, prev;
> +
> + /* Previous '0' state */
> + prev = vstate_lshift(reg, code->K, 0);
> + prev = bitswap(prev, code->K - 1);
> +
> + /* Input value */
> + val0 = (reg >> (code->K - 2)) & 0x01;
> + val1 = (code->next_term_output[prev] >> pos) & 0x01;
> + *val = val0 == val1 ? 0 : 1;
> +
> + /* Wrapper for osmocom state access */
> + prev = bitswap(state->prev[0], code->K - 1);
> +
> + /* Compute the transition output */
> + out = code->next_output[prev][*val];
> + out = bitswap(out, code->N);
> +
> + return out;
> +}
> +
> +#define NUM_STATES(K) (K == 7 ? 64 : 16)
> +
> +/* Populate recursive trellis state
> + * The bit position of the systematic bit is not explicitly marked by the
> + * API, so it must be extracted from the generator table. Otherwise,
> + * populate the trellis similar to the non-recursive version.
> + * Non-systematic recursive codes are not supported.
> + */
> +static int gen_recursive_state_info(uint8_t *val,
> + unsigned reg,
> + int16_t *output,
> + const struct osmo_conv_code *code)
> +{
> + int i, j, pos = -1;
> + int ns = NUM_STATES(code->K);
> + unsigned out;
> + struct vstate state;
> +
> + /* Previous '0' and '1' states */
> + state.state = reg;
> + state.prev[0] = vstate_lshift(reg, code->K, 0);
> + state.prev[1] = vstate_lshift(reg, code->K, 1);
> +
> + /* Find recursive bit location */
> + for (i = 0; i < code->N; i++) {
> + for (j = 0; j < ns; j++) {
> + if ((code->next_output[j][0] >> i) & 0x01)
> + break;
> + }
> + if (j == ns) {
> + pos = i;
> + break;
> + }
> + }
> +
> + /* Non-systematic recursive code not supported */
> + if (pos < 0)
> + return -EPROTO;
> +
> + /* Transition output */
> + out = gen_recursive_output(&state, val, reg, code, pos);
> +
> + /* Unpack to NRZ */
> + for (i = 0; i < code->N; i++)
> + output[i] = BIT2NRZ(out, i);
> +
> + return 0;
> +}
> +
> +/* Release the trellis */
> +static void free_trellis(struct vtrellis *trellis)
> +{
> + if (!trellis)
> + return;
> +
> + free(trellis->vals);
> + free(trellis->outputs);
> + free(trellis->sums);
> + free(trellis);
> +}
> +
> +/* Allocate and initialize the trellis object
> + * Initialization consists of generating the outputs and output value of a
> + * given state. Due to trellis symmetry and anti-symmetry, only one of the
> + * transition paths is utilized by the butterfly operation in the forward
> + * recursion, so only one set of N outputs is required per state variable.
> + */
> +static struct vtrellis *generate_trellis(const struct osmo_conv_code *code)
> +{
> + int i, rc = -1;
> + struct vtrellis *trellis;
> + int16_t *outputs;
> +
> + int ns = NUM_STATES(code->K);
> + int recursive = conv_code_recursive(code);
> + int olen = (code->N == 2) ? 2 : 4;
> +
> + trellis = (struct vtrellis *) calloc(1, sizeof(struct vtrellis));
> + trellis->num_states = ns;
> + trellis->sums = vdec_malloc(ns);
> + trellis->outputs = vdec_malloc(ns * olen);
> + trellis->vals = (uint8_t *) malloc(ns * sizeof(uint8_t));
> +
> + if (!trellis->sums || !trellis->outputs)
> + goto fail;
> +
> + /* Populate the trellis state objects */
> + for (i = 0; i < ns; i++) {
> + outputs = &trellis->outputs[olen * i];
> +
> + if (recursive)
> + rc = gen_recursive_state_info(&trellis->vals[i],
> + i, outputs, code);
> + else
> + rc = gen_state_info(&trellis->vals[i],
> + i, outputs, code);
> + }
> +
> + if (rc < 0)
> + goto fail;
> +
> + return trellis;
> +fail:
> + free_trellis(trellis);
> + return NULL;
> +}
> +
> +/* Reset decoder
> + * Set accumulated path metrics to zero. For termination other than
> + * tail-biting, initialize the zero state as the encoder starting state.
> + * Initialize with the maximum accumulated sum at length equal to the
> + * constraint length.
> + */
> +static void reset_decoder(struct vdecoder *dec, int term)
> +{
> + int ns = dec->trellis->num_states;
> +
> + memset(dec->trellis->sums, 0, sizeof(int16_t) * ns);
> +
> + if (term != CONV_TERM_TAIL_BITING)
> + dec->trellis->sums[0] = INT8_MAX * dec->n * dec->k;
> +}
> +
> +static void _traceback(struct vdecoder *dec,
> + unsigned state, uint8_t *out, int len)
> +{
> + int i;
> + unsigned path;
> +
> + for (i = len - 1; i >= 0; i--) {
> + path = dec->paths[i][state] + 1;
> + out[i] = dec->trellis->vals[state];
> + state = vstate_lshift(state, dec->k, path);
> + }
> +}
> +
> +static void _traceback_rec(struct vdecoder *dec,
> + unsigned state, uint8_t *out, int len)
> +{
> + int i;
> + unsigned path;
> +
> + for (i = len - 1; i >= 0; i--) {
> + path = dec->paths[i][state] + 1;
> + out[i] = path ^ dec->trellis->vals[state];
> + state = vstate_lshift(state, dec->k, path);
> + }
> +}
> +
> +/* Traceback and generate decoded output
> + * Find the largest accumulated path metric at the final state except for
> + * the zero terminated case, where we assume the final state is always zero.
> + */
> +static int traceback(struct vdecoder *dec, uint8_t *out, int term, int len)
> +{
> + int i, sum, max = -1;
> + unsigned path, state = 0;
> +
> + if (term != CONV_TERM_FLUSH) {
> + for (i = 0; i < dec->trellis->num_states; i++) {
> + sum = dec->trellis->sums[i];
> + if (sum > max) {
> + max = sum;
> + state = i;
> + }
> + }
> +
> + if (max < 0)
> + return -EPROTO;
> + }
> +
> + for (i = dec->len - 1; i >= len; i--) {
> + path = dec->paths[i][state] + 1;
> + state = vstate_lshift(state, dec->k, path);
> + }
> +
> + if (dec->recursive)
> + _traceback_rec(dec, state, out, len);
> + else
> + _traceback(dec, state, out, len);
> +
> + return 0;
> +}
> +
> +/* Release decoder object */
> +static void free_vdec(struct vdecoder *dec)
> +{
> + if (!dec)
> + return;
> +
> + free(dec->paths[0]);
> + free(dec->paths);
> + free_trellis(dec->trellis);
> + free(dec);
> +}
> +
> +/* Allocate decoder object
> + * Subtract the constraint length K on the normalization interval to
> + * accommodate the initialization path metric at state zero.
> + */
> +static struct vdecoder *alloc_vdec(const struct osmo_conv_code *code)
> +{
> + int i, ns;
> + struct vdecoder *dec;
> +
> + ns = NUM_STATES(code->K);
> +
> + dec = (struct vdecoder *) calloc(1, sizeof(struct vdecoder));
> + dec->n = code->N;
> + dec->k = code->K;
> + dec->recursive = conv_code_recursive(code);
> + dec->intrvl = INT16_MAX / (dec->n * INT8_MAX) - dec->k;
> +
> + if (dec->k == 5) {
> + switch (dec->n) {
> + case 2:
> + dec->metric_func = gen_metrics_k5_n2;
> + break;
> + case 3:
> + dec->metric_func = gen_metrics_k5_n3;
> + break;
> + case 4:
> + dec->metric_func = gen_metrics_k5_n4;
> + break;
> + default:
> + goto fail;
> + }
> + } else if (dec->k == 7) {
> + switch (dec->n) {
> + case 2:
> + dec->metric_func = gen_metrics_k7_n2;
> + break;
> + case 3:
> + dec->metric_func = gen_metrics_k7_n3;
> + break;
> + case 4:
> + dec->metric_func = gen_metrics_k7_n4;
> + break;
> + default:
> + goto fail;
> + }
> + } else {
> + goto fail;
> + }
> +
> + if (code->term == CONV_TERM_FLUSH)
> + dec->len = code->len + code->K - 1;
> + else
> + dec->len = code->len;
> +
> + dec->trellis = generate_trellis(code);
> + if (!dec->trellis)
> + goto fail;
> +
> + dec->paths = (int16_t **) malloc(sizeof(int16_t *) * dec->len);
> + dec->paths[0] = vdec_malloc(ns * dec->len);
> + for (i = 1; i < dec->len; i++)
> + dec->paths[i] = &dec->paths[0][i * ns];
> +
> + return dec;
> +fail:
> + free_vdec(dec);
> + return NULL;
> +}
> +
> +/* Depuncture sequence with negative-value-terminated puncturing matrix */
> +static int depuncture(const int8_t *in, const int *punc, int8_t *out, int len)
> +{
> + int i, n = 0, m = 0;
> +
> + for (i = 0; i < len; i++) {
> + if (i == punc[n]) {
> + out[i] = 0;
> + n++;
> + continue;
> + }
> +
> + out[i] = in[m++];
> + }
> +
> + return 0;
> +}
> +
> +/* Forward trellis recursion
> + * Generate branch metrics and path metrics with a combined function. Only
> + * accumulated path metric sums and path selections are stored. Normalize on
> + * the interval specified by the decoder.
> + */
> +static void _conv_decode(struct vdecoder *dec, const int8_t *seq, int _cnt)
> +{
> + int i, len = dec->len;
> + struct vtrellis *trellis = dec->trellis;
> +
> + for (i = 0; i < len; i++) {
> + dec->metric_func(&seq[dec->n * i],
> + trellis->outputs,
> + trellis->sums,
> + dec->paths[i],
> + !(i % dec->intrvl));
> + }
> +}
> +
> +/* Convolutional decode with a decoder object
> + * Initial puncturing run if necessary followed by the forward recursion.
> + * For tail-biting perform a second pass before running the backward
> + * traceback operation.
> + */
> +static int conv_decode(struct vdecoder *dec, const int8_t *seq,
> + const int *punc, uint8_t *out, int len, int term)
> +{
> + int cnt = 0;
> + int8_t depunc[dec->len * dec->n];
> +
> + reset_decoder(dec, term);
> +
> + if (punc) {
> + depuncture(seq, punc, depunc, dec->len * dec->n);
> + seq = depunc;
> + }
> +
> + /* Propagate through the trellis with interval normalization */
> + _conv_decode(dec, seq, cnt);
> +
> + if (term == CONV_TERM_TAIL_BITING)
> + _conv_decode(dec, seq, cnt);
> +
> + return traceback(dec, out, term, len);
> +}
> +
> +/* All-in-one viterbi decoding */
> +int osmo_conv_decode_acc(const struct osmo_conv_code *code,
> + const sbit_t *input, ubit_t *output)
> +{
> + int rc;
> + struct vdecoder *vdec;
> +
> + if ((code->N < 2) || (code->N > 4) || (code->len < 1) ||
> + ((code->K != 5) && (code->K != 7)))
> + return -EINVAL;
> +
> + vdec = alloc_vdec(code);
> + if (!vdec)
> + return -EFAULT;
> +
> + rc = conv_decode(vdec, input, code->puncture,
> + output, code->len, code->term);
> +
> + free_vdec(vdec);
> +
> + return rc;
> +}
> diff --git a/src/viterbi_gen.c b/src/viterbi_gen.c
> new file mode 100644
> index 0000000..894d5ae
> --- /dev/null
> +++ b/src/viterbi_gen.c
> @@ -0,0 +1,182 @@
> +/*
> + * Viterbi decoder
> + * Copyright (C) 2013, 2014 Thomas Tsou <tom at tsou.cc>
> + *
> + * This library is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * This library is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with this library; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> + */
> +
> +#include <stdint.h>
> +#include <string.h>
> +
> +/* Add-Compare-Select (ACS-Butterfly)
> + * Compute 4 accumulated path metrics and 4 path selections. Note that path
> + * selections are stored as -1 and 0 rather than 0 and 1. This is to match
> + * the output format of the SSE packed compare instruction 'pmaxuw'.
> + */
> +void acs_butterfly(int state, int num_states,
> + int16_t metric, int16_t *sum,
> + int16_t *new_sum, int16_t *path)
> +{
> + int state0, state1;
> + int sum0, sum1, sum2, sum3;
> +
> + state0 = *(sum + (2 * state + 0));
> + state1 = *(sum + (2 * state + 1));
> +
> + sum0 = state0 + metric;
> + sum1 = state1 - metric;
> + sum2 = state0 - metric;
> + sum3 = state1 + metric;
> +
> + if (sum0 > sum1) {
> + *new_sum = sum0;
> + *path = -1;
> + } else {
> + *new_sum = sum1;
> + *path = 0;
> + }
> +
> + if (sum2 > sum3) {
> + *(new_sum + num_states / 2) = sum2;
> + *(path + num_states / 2) = -1;
> + } else {
> + *(new_sum + num_states / 2) = sum3;
> + *(path + num_states / 2) = 0;
> + }
> +}
> +
> +/* Branch metrics unit N=2 */
> +void _gen_branch_metrics_n2(int num_states, const int8_t *seq,
> + const int16_t *out, int16_t *metrics)
> +{
> + int i;
> +
> + for (i = 0; i < num_states / 2; i++) {
> + metrics[i] = seq[0] * out[2 * i + 0] +
> + seq[1] * out[2 * i + 1];
> + }
> +}
> +
> +/* Branch metrics unit N=3 */
> +void _gen_branch_metrics_n3(int num_states, const int8_t *seq,
> + const int16_t *out, int16_t *metrics)
> +{
> + int i;
> +
> + for (i = 0; i < num_states / 2; i++)
> + metrics[i] = seq[0] * out[4 * i + 0] +
> + seq[1] * out[4 * i + 1] +
> + seq[2] * out[4 * i + 2];
> +}
> +
> +/* Branch metrics unit N=4 */
> +void _gen_branch_metrics_n4(int num_states, const int8_t *seq,
> + const int16_t *out, int16_t *metrics)
> +{
> + int i;
> +
> + for (i = 0; i < num_states / 2; i++)
> + metrics[i] = seq[0] * out[4 * i + 0] +
> + seq[1] * out[4 * i + 1] +
> + seq[2] * out[4 * i + 2] +
> + seq[3] * out[4 * i + 3];
> +}
> +
> +/* Path metric unit */
> +void _gen_path_metrics(int num_states, int16_t *sums,
> + int16_t *metrics, int16_t *paths, int norm)
> +{
> + int i;
> + int16_t min;
> + int16_t new_sums[num_states];
> +
> + for (i = 0; i < num_states / 2; i++) {
> + acs_butterfly(i, num_states, metrics[i],
> + sums, &new_sums[i], &paths[i]);
> + }
> +
> + if (norm) {
> + min = new_sums[0];
> + for (i = 1; i < num_states; i++) {
> + if (new_sums[i] < min)
> + min = new_sums[i];
> + }
> +
> + for (i = 0; i < num_states; i++)
> + new_sums[i] -= min;
> + }
> +
> + memcpy(sums, new_sums, num_states * sizeof(int16_t));
> +}
> +
> +/* 16-state branch-path metrics units (K=5) */
> +void gen_metrics_k5_n2(const int8_t *seq, const int16_t *out,
> + int16_t *sums, int16_t *paths, int norm)
> +{
> + int16_t metrics[8];
> +
> + _gen_branch_metrics_n2(16, seq, out, metrics);
> + _gen_path_metrics(16, sums, metrics, paths, norm);
> +}
> +
> +void gen_metrics_k5_n3(const int8_t *seq, const int16_t *out,
> + int16_t *sums, int16_t *paths, int norm)
> +{
> + int16_t metrics[8];
> +
> + _gen_branch_metrics_n3(16, seq, out, metrics);
> + _gen_path_metrics(16, sums, metrics, paths, norm);
> +
> +}
> +
> +void gen_metrics_k5_n4(const int8_t *seq, const int16_t *out,
> + int16_t *sums, int16_t *paths, int norm)
> +{
> + int16_t metrics[8];
> +
> + _gen_branch_metrics_n4(16, seq, out, metrics);
> + _gen_path_metrics(16, sums, metrics, paths, norm);
> +
> +}
> +
> +/* 64-state branch-path metrics units (K=7) */
> +void gen_metrics_k7_n2(const int8_t *seq, const int16_t *out,
> + int16_t *sums, int16_t *paths, int norm)
> +{
> + int16_t metrics[32];
> +
> + _gen_branch_metrics_n2(64, seq, out, metrics);
> + _gen_path_metrics(64, sums, metrics, paths, norm);
> +
> +}
> +
> +void gen_metrics_k7_n3(const int8_t *seq, const int16_t *out,
> + int16_t *sums, int16_t *paths, int norm)
> +{
> + int16_t metrics[32];
> +
> + _gen_branch_metrics_n3(64, seq, out, metrics);
> + _gen_path_metrics(64, sums, metrics, paths, norm);
> +
> +}
> +
> +void gen_metrics_k7_n4(const int8_t *seq, const int16_t *out,
> + int16_t *sums, int16_t *paths, int norm)
> +{
> + int16_t metrics[32];
> +
> + _gen_branch_metrics_n4(64, seq, out, metrics);
> + _gen_path_metrics(64, sums, metrics, paths, norm);
> +}
> --
> 1.9.0
>