diff options
Diffstat (limited to 'lib/ffmpeg/libavcodec/aacpsy.c')
-rw-r--r-- | lib/ffmpeg/libavcodec/aacpsy.c | 389 |
1 files changed, 344 insertions, 45 deletions
diff --git a/lib/ffmpeg/libavcodec/aacpsy.c b/lib/ffmpeg/libavcodec/aacpsy.c index 466b0e9a1a..a987be0abb 100644 --- a/lib/ffmpeg/libavcodec/aacpsy.c +++ b/lib/ffmpeg/libavcodec/aacpsy.c @@ -39,11 +39,19 @@ * constants for 3GPP AAC psychoacoustic model * @{ */ -#define PSY_3GPP_SPREAD_LOW 1.5f // spreading factor for ascending threshold spreading (15 dB/Bark) -#define PSY_3GPP_SPREAD_HI 3.0f // spreading factor for descending threshold spreading (30 dB/Bark) +#define PSY_3GPP_SPREAD_HI 1.5f // spreading factor for ascending threshold spreading (15 dB/Bark) +#define PSY_3GPP_SPREAD_LOW 3.0f // spreading factor for descending threshold spreading (30 dB/Bark) #define PSY_3GPP_RPEMIN 0.01f #define PSY_3GPP_RPELEV 2.0f + +/* LAME psy model constants */ +#define PSY_LAME_FIR_LEN 21 ///< LAME psy model FIR order +#define AAC_BLOCK_SIZE_LONG 1024 ///< long block size +#define AAC_BLOCK_SIZE_SHORT 128 ///< short block size +#define AAC_NUM_BLOCKS_SHORT 8 ///< number of blocks in a short sequence +#define PSY_LAME_NUM_SUBBLOCKS 3 ///< Number of sub-blocks in each short block + /** * @} */ @@ -51,44 +59,156 @@ /** * information for single band used by 3GPP TS26.403-inspired psychoacoustic model */ -typedef struct Psy3gppBand{ +typedef struct AacPsyBand{ float energy; ///< band energy float ffac; ///< form factor float thr; ///< energy threshold float min_snr; ///< minimal SNR float thr_quiet; ///< threshold in quiet -}Psy3gppBand; +}AacPsyBand; /** * single/pair channel context for psychoacoustic model */ -typedef struct Psy3gppChannel{ - Psy3gppBand band[128]; ///< bands information - Psy3gppBand prev_band[128]; ///< bands information from the previous frame +typedef struct AacPsyChannel{ + AacPsyBand band[128]; ///< bands information + AacPsyBand prev_band[128]; ///< bands information from the previous frame float win_energy; ///< sliding average of channel energy float iir_state[2]; ///< hi-pass IIR filter state uint8_t next_grouping; ///< stored grouping scheme for the 
next frame (in case of 8 short window sequence) enum WindowSequence next_window_seq; ///< window sequence to be used in the next frame -}Psy3gppChannel; + /* LAME psy model specific members */ + float attack_threshold; ///< attack threshold for this channel + float prev_energy_subshort[AAC_NUM_BLOCKS_SHORT * PSY_LAME_NUM_SUBBLOCKS]; + int prev_attack; ///< attack value for the last short block in the previous sequence +}AacPsyChannel; /** * psychoacoustic model frame type-dependent coefficients */ -typedef struct Psy3gppCoeffs{ +typedef struct AacPsyCoeffs{ float ath [64]; ///< absolute threshold of hearing per bands float barks [64]; ///< Bark value for each spectral band in long frame float spread_low[64]; ///< spreading factor for low-to-high threshold spreading in long frame float spread_hi [64]; ///< spreading factor for high-to-low threshold spreading in long frame -}Psy3gppCoeffs; +}AacPsyCoeffs; /** * 3GPP TS26.403-inspired psychoacoustic model specific data */ -typedef struct Psy3gppContext{ - Psy3gppCoeffs psy_coef[2]; - Psy3gppChannel *ch; -}Psy3gppContext; +typedef struct AacPsyContext{ + AacPsyCoeffs psy_coef[2]; + AacPsyChannel *ch; +}AacPsyContext; + +/** + * LAME psy model preset struct + */ +typedef struct { + int quality; ///< Quality to map the rest of the values to. + /* This is overloaded to be both kbps per channel in ABR mode, and + * requested quality in constant quality mode. + */ + float st_lrm; ///< short threshold for L, R, and M channels +} PsyLamePreset; + +/** + * LAME psy model preset table for ABR + */ +static const PsyLamePreset psy_abr_map[] = { +/* TODO: Tuning. These were taken from LAME. 
*/ +/* kbps/ch st_lrm */ + { 8, 6.60}, + { 16, 6.60}, + { 24, 6.60}, + { 32, 6.60}, + { 40, 6.60}, + { 48, 6.60}, + { 56, 6.60}, + { 64, 6.40}, + { 80, 6.00}, + { 96, 5.60}, + {112, 5.20}, + {128, 5.20}, + {160, 5.20} +}; + +/** +* LAME psy model preset table for constant quality +*/ +static const PsyLamePreset psy_vbr_map[] = { +/* vbr_q st_lrm */ + { 0, 4.20}, + { 1, 4.20}, + { 2, 4.20}, + { 3, 4.20}, + { 4, 4.20}, + { 5, 4.20}, + { 6, 4.20}, + { 7, 4.20}, + { 8, 4.20}, + { 9, 4.20}, + {10, 4.20} +}; + +/** + * LAME psy model FIR coefficient table + */ +static const float psy_fir_coeffs[] = { + -8.65163e-18 * 2, -0.00851586 * 2, -6.74764e-18 * 2, 0.0209036 * 2, + -3.36639e-17 * 2, -0.0438162 * 2, -1.54175e-17 * 2, 0.0931738 * 2, + -5.52212e-17 * 2, -0.313819 * 2 +}; + +/** + * calculates the attack threshold for ABR from the above table for the LAME psy model + */ +static float lame_calc_attack_threshold(int bitrate) +{ + /* Assume max bitrate to start with */ + int lower_range = 12, upper_range = 12; + int lower_range_kbps = psy_abr_map[12].quality; + int upper_range_kbps = psy_abr_map[12].quality; + int i; + + /* Determine which bitrates the value specified falls between. + * If the loop ends without breaking, our above assumption of 320kbps was correct. 
+ */ + for (i = 1; i < 13; i++) { + if (FFMAX(bitrate, psy_abr_map[i].quality) != bitrate) { + upper_range = i; + upper_range_kbps = psy_abr_map[i ].quality; + lower_range = i - 1; + lower_range_kbps = psy_abr_map[i - 1].quality; + break; /* Upper range found */ + } + } + + /* Determine which range the value specified is closer to */ + if ((upper_range_kbps - bitrate) > (bitrate - lower_range_kbps)) + return psy_abr_map[lower_range].st_lrm; + return psy_abr_map[upper_range].st_lrm; +} + +/** + * LAME psy model specific initialization + */ +static void lame_window_init(AacPsyContext *ctx, AVCodecContext *avctx) { + int i, j; + + for (i = 0; i < avctx->channels; i++) { + AacPsyChannel *pch = &ctx->ch[i]; + + if (avctx->flags & CODEC_FLAG_QSCALE) + pch->attack_threshold = psy_vbr_map[avctx->global_quality / FF_QP2LAMBDA].st_lrm; + else + pch->attack_threshold = lame_calc_attack_threshold(avctx->bit_rate / avctx->channels / 1000); + + for (j = 0; j < AAC_NUM_BLOCKS_SHORT * PSY_LAME_NUM_SUBBLOCKS; j++) + pch->prev_energy_subshort[j] = 10.0f; + } +} /** * Calculate Bark value for given line. @@ -113,25 +233,25 @@ static av_cold float ath(float f, float add) } static av_cold int psy_3gpp_init(FFPsyContext *ctx) { - Psy3gppContext *pctx; - float barks[1024]; + AacPsyContext *pctx; + float bark; int i, j, g, start; float prev, minscale, minath; - ctx->model_priv_data = av_mallocz(sizeof(Psy3gppContext)); - pctx = (Psy3gppContext*) ctx->model_priv_data; + ctx->model_priv_data = av_mallocz(sizeof(AacPsyContext)); + pctx = (AacPsyContext*) ctx->model_priv_data; - for (i = 0; i < 1024; i++) - barks[i] = calc_bark(i * ctx->avctx->sample_rate / 2048.0); minath = ath(3410, ATH_ADD); for (j = 0; j < 2; j++) { - Psy3gppCoeffs *coeffs = &pctx->psy_coef[j]; + AacPsyCoeffs *coeffs = &pctx->psy_coef[j]; + float line_to_frequency = ctx->avctx->sample_rate / (j ? 
256.f : 2048.0f); i = 0; prev = 0.0; for (g = 0; g < ctx->num_bands[j]; g++) { i += ctx->bands[j][g]; - coeffs->barks[g] = (barks[i - 1] + prev) / 2.0; - prev = barks[i - 1]; + bark = calc_bark((i-1) * line_to_frequency); + coeffs->barks[g] = (bark + prev) / 2.0; + prev = bark; } for (g = 0; g < ctx->num_bands[j] - 1; g++) { coeffs->spread_low[g] = pow(10.0, -(coeffs->barks[g+1] - coeffs->barks[g]) * PSY_3GPP_SPREAD_LOW); @@ -139,15 +259,18 @@ static av_cold int psy_3gpp_init(FFPsyContext *ctx) { } start = 0; for (g = 0; g < ctx->num_bands[j]; g++) { - minscale = ath(ctx->avctx->sample_rate * start / 1024.0, ATH_ADD); + minscale = ath(start * line_to_frequency, ATH_ADD); for (i = 1; i < ctx->bands[j][g]; i++) - minscale = FFMIN(minscale, ath(ctx->avctx->sample_rate * (start + i) / 1024.0 / 2.0, ATH_ADD)); + minscale = FFMIN(minscale, ath((start + i) * line_to_frequency, ATH_ADD)); coeffs->ath[g] = minscale - minath; start += ctx->bands[j][g]; } } - pctx->ch = av_mallocz(sizeof(Psy3gppChannel) * ctx->avctx->channels); + pctx->ch = av_mallocz(sizeof(AacPsyChannel) * ctx->avctx->channels); + + lame_window_init(pctx, ctx->avctx); + return 0; } @@ -182,8 +305,8 @@ static FFPsyWindowInfo psy_3gpp_window(FFPsyContext *ctx, int i, j; int br = ctx->avctx->bit_rate / ctx->avctx->channels; int attack_ratio = br <= 16000 ? 
18 : 10; - Psy3gppContext *pctx = (Psy3gppContext*) ctx->model_priv_data; - Psy3gppChannel *pch = &pctx->ch[channel]; + AacPsyContext *pctx = (AacPsyContext*) ctx->model_priv_data; + AacPsyChannel *pch = &pctx->ch[channel]; uint8_t grouping = 0; int next_type = pch->next_window_seq; FFPsyWindowInfo wi; @@ -264,24 +387,23 @@ static FFPsyWindowInfo psy_3gpp_window(FFPsyContext *ctx, * Calculate band thresholds as suggested in 3GPP TS26.403 */ static void psy_3gpp_analyze(FFPsyContext *ctx, int channel, - const float *coefs, FFPsyWindowInfo *wi) + const float *coefs, const FFPsyWindowInfo *wi) { - Psy3gppContext *pctx = (Psy3gppContext*) ctx->model_priv_data; - Psy3gppChannel *pch = &pctx->ch[channel]; + AacPsyContext *pctx = (AacPsyContext*) ctx->model_priv_data; + AacPsyChannel *pch = &pctx->ch[channel]; int start = 0; int i, w, g; const int num_bands = ctx->num_bands[wi->num_windows == 8]; const uint8_t* band_sizes = ctx->bands[wi->num_windows == 8]; - Psy3gppCoeffs *coeffs = &pctx->psy_coef[wi->num_windows == 8]; + AacPsyCoeffs *coeffs = &pctx->psy_coef[wi->num_windows == 8]; //calculate energies, initial thresholds and related values - 5.4.2 "Threshold Calculation" for (w = 0; w < wi->num_windows*16; w += 16) { for (g = 0; g < num_bands; g++) { - Psy3gppBand *band = &pch->band[w+g]; + AacPsyBand *band = &pch->band[w+g]; band->energy = 0.0f; for (i = 0; i < band_sizes[g]; i++) band->energy += coefs[start+i] * coefs[start+i]; - band->energy *= 1.0f / (512*512); band->thr = band->energy * 0.001258925f; start += band_sizes[g]; @@ -290,18 +412,16 @@ static void psy_3gpp_analyze(FFPsyContext *ctx, int channel, } //modify thresholds - spread, threshold in quiet - 5.4.3 "Spreaded Energy Calculation" for (w = 0; w < wi->num_windows*16; w += 16) { - Psy3gppBand *band = &pch->band[w]; + AacPsyBand *band = &pch->band[w]; for (g = 1; g < num_bands; g++) - band[g].thr = FFMAX(band[g].thr, band[g-1].thr * coeffs->spread_low[g-1]); + band[g].thr = FFMAX(band[g].thr, 
band[g-1].thr * coeffs->spread_hi [g]); for (g = num_bands - 2; g >= 0; g--) - band[g].thr = FFMAX(band[g].thr, band[g+1].thr * coeffs->spread_hi [g]); + band[g].thr = FFMAX(band[g].thr, band[g+1].thr * coeffs->spread_low[g]); for (g = 0; g < num_bands; g++) { - band[g].thr_quiet = FFMAX(band[g].thr, coeffs->ath[g]); - if (wi->num_windows != 8 && wi->window_type[1] != EIGHT_SHORT_SEQUENCE) - band[g].thr_quiet = FFMAX(PSY_3GPP_RPEMIN*band[g].thr_quiet, - FFMIN(band[g].thr_quiet, - PSY_3GPP_RPELEV*pch->prev_band[w+g].thr_quiet)); - band[g].thr = FFMAX(band[g].thr, band[g].thr_quiet * 0.25); + band[g].thr_quiet = band[g].thr = FFMAX(band[g].thr, coeffs->ath[g]); + if (!(wi->window_type[0] == LONG_STOP_SEQUENCE || (wi->window_type[1] == LONG_START_SEQUENCE && !w))) + band[g].thr = FFMAX(PSY_3GPP_RPEMIN*band[g].thr, FFMIN(band[g].thr, + PSY_3GPP_RPELEV*pch->prev_band[w+g].thr_quiet)); ctx->psy_bands[channel*PSY_MAX_BANDS+w+g].threshold = band[g].thr; } @@ -311,17 +431,196 @@ static void psy_3gpp_analyze(FFPsyContext *ctx, int channel, static av_cold void psy_3gpp_end(FFPsyContext *apc) { - Psy3gppContext *pctx = (Psy3gppContext*) apc->model_priv_data; + AacPsyContext *pctx = (AacPsyContext*) apc->model_priv_data; av_freep(&pctx->ch); av_freep(&apc->model_priv_data); } +static void lame_apply_block_type(AacPsyChannel *ctx, FFPsyWindowInfo *wi, int uselongblock) +{ + int blocktype = ONLY_LONG_SEQUENCE; + if (uselongblock) { + if (ctx->next_window_seq == EIGHT_SHORT_SEQUENCE) + blocktype = LONG_STOP_SEQUENCE; + } else { + blocktype = EIGHT_SHORT_SEQUENCE; + if (ctx->next_window_seq == ONLY_LONG_SEQUENCE) + ctx->next_window_seq = LONG_START_SEQUENCE; + if (ctx->next_window_seq == LONG_STOP_SEQUENCE) + ctx->next_window_seq = EIGHT_SHORT_SEQUENCE; + } + + wi->window_type[0] = ctx->next_window_seq; + ctx->next_window_seq = blocktype; +} + +static FFPsyWindowInfo psy_lame_window(FFPsyContext *ctx, + const int16_t *audio, const int16_t *la, + int channel, int prev_type) +{ + 
AacPsyContext *pctx = (AacPsyContext*) ctx->model_priv_data; + AacPsyChannel *pch = &pctx->ch[channel]; + int grouping = 0; + int uselongblock = 1; + int attacks[AAC_NUM_BLOCKS_SHORT + 1] = { 0 }; + int i; + FFPsyWindowInfo wi; + + memset(&wi, 0, sizeof(wi)); + if (la) { + float hpfsmpl[AAC_BLOCK_SIZE_LONG]; + float const *pf = hpfsmpl; + float attack_intensity[(AAC_NUM_BLOCKS_SHORT + 1) * PSY_LAME_NUM_SUBBLOCKS]; + float energy_subshort[(AAC_NUM_BLOCKS_SHORT + 1) * PSY_LAME_NUM_SUBBLOCKS]; + float energy_short[AAC_NUM_BLOCKS_SHORT + 1] = { 0 }; + int chans = ctx->avctx->channels; + const int16_t *firbuf = la + (AAC_BLOCK_SIZE_SHORT/4 - PSY_LAME_FIR_LEN) * chans; + int j, att_sum = 0; + + /* LAME comment: apply high pass filter of fs/4 */ + for (i = 0; i < AAC_BLOCK_SIZE_LONG; i++) { + float sum1, sum2; + sum1 = firbuf[(i + ((PSY_LAME_FIR_LEN - 1) / 2)) * chans]; + sum2 = 0.0; + for (j = 0; j < ((PSY_LAME_FIR_LEN - 1) / 2) - 1; j += 2) { + sum1 += psy_fir_coeffs[j] * (firbuf[(i + j) * chans] + firbuf[(i + PSY_LAME_FIR_LEN - j) * chans]); + sum2 += psy_fir_coeffs[j + 1] * (firbuf[(i + j + 1) * chans] + firbuf[(i + PSY_LAME_FIR_LEN - j - 1) * chans]); + } + hpfsmpl[i] = sum1 + sum2; + } + + /* Calculate the energies of each sub-shortblock */ + for (i = 0; i < PSY_LAME_NUM_SUBBLOCKS; i++) { + energy_subshort[i] = pch->prev_energy_subshort[i + ((AAC_NUM_BLOCKS_SHORT - 1) * PSY_LAME_NUM_SUBBLOCKS)]; + assert(pch->prev_energy_subshort[i + ((AAC_NUM_BLOCKS_SHORT - 2) * PSY_LAME_NUM_SUBBLOCKS + 1)] > 0); + attack_intensity[i] = energy_subshort[i] / pch->prev_energy_subshort[i + ((AAC_NUM_BLOCKS_SHORT - 2) * PSY_LAME_NUM_SUBBLOCKS + 1)]; + energy_short[0] += energy_subshort[i]; + } + + for (i = 0; i < AAC_NUM_BLOCKS_SHORT * PSY_LAME_NUM_SUBBLOCKS; i++) { + float const *const pfe = pf + AAC_BLOCK_SIZE_LONG / (AAC_NUM_BLOCKS_SHORT * PSY_LAME_NUM_SUBBLOCKS); + float p = 1.0f; + for (; pf < pfe; pf++) + if (p < fabsf(*pf)) + p = fabsf(*pf); + pch->prev_energy_subshort[i] = 
energy_subshort[i + PSY_LAME_NUM_SUBBLOCKS] = p; + energy_short[1 + i / PSY_LAME_NUM_SUBBLOCKS] += p; + /* FIXME: The indexes below are [i + 3 - 2] in the LAME source. + * Obviously the 3 and 2 have some significance, or this would be just [i + 1] + * (which is what we use here). What the 3 stands for is ambiguous, as it is both + * number of short blocks, and the number of sub-short blocks. + * It seems that LAME is comparing each sub-block to sub-block + 1 in the + * previous block. + */ + if (p > energy_subshort[i + 1]) + p = p / energy_subshort[i + 1]; + else if (energy_subshort[i + 1] > p * 10.0f) + p = energy_subshort[i + 1] / (p * 10.0f); + else + p = 0.0; + attack_intensity[i + PSY_LAME_NUM_SUBBLOCKS] = p; + } + + /* compare energy between sub-short blocks */ + for (i = 0; i < (AAC_NUM_BLOCKS_SHORT + 1) * PSY_LAME_NUM_SUBBLOCKS; i++) + if (!attacks[i / PSY_LAME_NUM_SUBBLOCKS]) + if (attack_intensity[i] > pch->attack_threshold) + attacks[i / PSY_LAME_NUM_SUBBLOCKS] = (i % PSY_LAME_NUM_SUBBLOCKS) + 1; + + /* should have energy change between short blocks, in order to avoid periodic signals */ + /* Good samples to show the effect are Trumpet test songs */ + /* GB: tuned (1) to avoid too many short blocks for test sample TRUMPET */ + /* RH: tuned (2) to let enough short blocks through for test sample FSOL and SNAPS */ + for (i = 1; i < AAC_NUM_BLOCKS_SHORT + 1; i++) { + float const u = energy_short[i - 1]; + float const v = energy_short[i]; + float const m = FFMAX(u, v); + if (m < 40000) { /* (2) */ + if (u < 1.7f * v && v < 1.7f * u) { /* (1) */ + if (i == 1 && attacks[0] < attacks[i]) + attacks[0] = 0; + attacks[i] = 0; + } + } + att_sum += attacks[i]; + } + + if (attacks[0] <= pch->prev_attack) + attacks[0] = 0; + + att_sum += attacks[0]; + /* 3 below indicates the previous attack happened in the last sub-block of the previous sequence */ + if (pch->prev_attack == 3 || att_sum) { + uselongblock = 0; + + if (attacks[1] && attacks[0]) + attacks[1] = 0; + if 
(attacks[2] && attacks[1]) + attacks[2] = 0; + if (attacks[3] && attacks[2]) + attacks[3] = 0; + if (attacks[4] && attacks[3]) + attacks[4] = 0; + if (attacks[5] && attacks[4]) + attacks[5] = 0; + if (attacks[6] && attacks[5]) + attacks[6] = 0; + if (attacks[7] && attacks[6]) + attacks[7] = 0; + if (attacks[8] && attacks[7]) + attacks[8] = 0; + } + } else { + /* We have no lookahead info, so just use same type as the previous sequence. */ + uselongblock = !(prev_type == EIGHT_SHORT_SEQUENCE); + } + + lame_apply_block_type(pch, &wi, uselongblock); + + wi.window_type[1] = prev_type; + if (wi.window_type[0] != EIGHT_SHORT_SEQUENCE) { + wi.num_windows = 1; + wi.grouping[0] = 1; + if (wi.window_type[0] == LONG_START_SEQUENCE) + wi.window_shape = 0; + else + wi.window_shape = 1; + } else { + int lastgrp = 0; + + wi.num_windows = 8; + wi.window_shape = 0; + for (i = 0; i < 8; i++) { + if (!((pch->next_grouping >> i) & 1)) + lastgrp = i; + wi.grouping[lastgrp]++; + } + } + + /* Determine grouping, based on the location of the first attack, and save for + * the next frame. + * FIXME: Move this to analysis. + * TODO: Tune groupings depending on attack location + * TODO: Handle more than one attack in a group + */ + for (i = 0; i < 9; i++) { + if (attacks[i]) { + grouping = i; + break; + } + } + pch->next_grouping = window_grouping[grouping]; + + pch->prev_attack = attacks[8]; + + return wi; +} const FFPsyModel ff_aac_psy_model = { .name = "3GPP TS 26.403-inspired model", .init = psy_3gpp_init, - .window = psy_3gpp_window, + .window = psy_lame_window, .analyze = psy_3gpp_analyze, .end = psy_3gpp_end, }; |