c++ audio ffmpeg libav transcoding

c++ - FFMPEG audio transcoding using libav* libraries




I found where the problem was, and it is now resolved.

When the output file was opened in Audacity, it was clear that unwanted silences had been inserted into the audio signal. The problem was the "number of samples per frame" supplied to the encoder.

Different codecs expect different frame sizes for encoding, and the aac encoder expects a frame size of 1024. This can be seen by inspecting enc_ctx->frame_size after avcodec_open2() has run.
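For example, the expected frame size can be logged right after the encoder is opened. This is just a minimal sketch; `enc_ctx` and `enc_codec` refer to the encoder context and codec in the question's code below:

```cpp
/* After avcodec_open2() the encoder reports how many samples per
 * channel it expects in each input frame (1024 for the AAC encoder). */
int ret = avcodec_open2(enc_ctx, enc_codec, NULL);
if (ret >= 0) {
    av_log(NULL, AV_LOG_INFO, "Encoder expects %d samples per frame\n",
           enc_ctx->frame_size);
}
```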

The filtergraph needs to supply the encoder with frames of 1024 samples per channel. So, in my code, pFrameFiltered must carry exactly 1024 samples per channel. If it carries fewer than 1024, the encoder pads the frame with zeros up to 1024 samples and then encodes it.

This can be solved either by keeping our own FIFO queue or by using the filter that ffmpeg's audio filters already provide. We need an asetnsamples=n=1024:p=0 filter, as explained here. So the required change was

`string filter_description = "aresample=22050,aformat=sample_fmts=s16:channel_layouts=mono,asetnsamples=n=1024:p=0";`

Simply play with the value of n in the filter to get a better feel for it. Check the enc_ctx->frame_size field set by avcodec_open2() and set n accordingly.
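For completeness, the hand-rolled FIFO alternative mentioned above can be built with libavutil's AVAudioFifo. This is only a sketch with error checks omitted; `enc_ctx` and `pFrameFiltered` refer to the variables in the question's code below, and the buffering would sit inside the loop that pulls frames from the buffer sink:

```cpp
#include <libavutil/audio_fifo.h>

/* Accumulate filtered samples and hand the encoder
 * exactly frame_size samples at a time. */
AVAudioFifo *fifo = av_audio_fifo_alloc(enc_ctx->sample_fmt,
                                        enc_ctx->channels,
                                        enc_ctx->frame_size);

/* For every frame pulled from the buffer sink: */
av_audio_fifo_write(fifo, (void **) pFrameFiltered->data,
                    pFrameFiltered->nb_samples);

/* Drain the FIFO in encoder-sized chunks. */
while (av_audio_fifo_size(fifo) >= enc_ctx->frame_size) {
    AVFrame *enc_frame = av_frame_alloc();
    enc_frame->nb_samples     = enc_ctx->frame_size;
    enc_frame->format         = enc_ctx->sample_fmt;
    enc_frame->channel_layout = enc_ctx->channel_layout;
    av_frame_get_buffer(enc_frame, 0);
    av_audio_fifo_read(fifo, (void **) enc_frame->data, enc_ctx->frame_size);
    /* ... encode enc_frame with avcodec_encode_audio2() as in the code below ... */
    av_frame_free(&enc_frame);
}
```

The asetnsamples filter does essentially the same buffering inside the filtergraph, which is why it is the simpler fix here.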

I am writing an audio transcoding application using the ffmpeg libraries. Here is my code

```cpp
/*
 * File: main.cpp
 * Author: vinod
 * Compile with "g++ -std=c++11 -o audiotranscode main.cpp -lavformat -lavcodec -lavutil -lavfilter"
 */

#if !defined PRId64 || PRI_MACROS_BROKEN
#undef PRId64
#define PRId64 "lld"
#endif

#define __STDC_FORMAT_MACROS

#ifdef __cplusplus
extern "C" {
#endif
#include <stdio.h>
#include <stdlib.h>
#include <sys/types.h>
#include <stdint.h>
#include <libavutil/imgutils.h>
#include <libavutil/samplefmt.h>
#include <libavutil/frame.h>
#include <libavutil/timestamp.h>
#include <libavformat/avformat.h>
#include <libavfilter/avfilter.h>
#include <libavfilter/buffersrc.h>
#include <libavfilter/buffersink.h>
#include <libswscale/swscale.h>
#include <libavutil/opt.h>
#ifdef __cplusplus
}
#endif

#include <iostream>
using namespace std;

int select_stream, got_frame, got_packet;
AVFormatContext *in_fmt_ctx = NULL, *out_fmt_ctx = NULL;
AVCodec *dec_codec = NULL, *enc_codec = NULL;
AVStream *audio_st = NULL;
AVCodecContext *enc_ctx = NULL, *dec_ctx = NULL;
AVFrame *pFrame = NULL, *pFrameFiltered = NULL;

AVFilterGraph *filter_graph = NULL;
AVFilterContext *buffersrc_ctx = NULL;
AVFilterContext *buffersink_ctx = NULL;
AVPacket packet;

string inFileName = "/home/vinod/vinod/Media/univac.webm";
string outFileName = "audio_extracted.m4a";
int target_bit_rate = 128000, sample_rate = 22050, channels = 1;
AVSampleFormat sample_fmt = AV_SAMPLE_FMT_S16;
string filter_description = "aresample=22050,aformat=sample_fmts=s16:channel_layouts=mono";

int log_averror(int errcode)
{
    char *errbuf = (char *) calloc(AV_ERROR_MAX_STRING_SIZE, sizeof(char));
    av_strerror(errcode, errbuf, AV_ERROR_MAX_STRING_SIZE);
    std::cout << "Error - " << errbuf << std::endl;
    delete [] errbuf;
    return -1;
}

/**
 * Initialize conversion filter */
int initialize_audio_filter()
{
    char args[512];
    int ret;
    AVFilter *buffersrc = avfilter_get_by_name("abuffer");
    AVFilter *buffersink = avfilter_get_by_name("abuffersink");
    AVFilterInOut *outputs = avfilter_inout_alloc();
    AVFilterInOut *inputs = avfilter_inout_alloc();
    filter_graph = avfilter_graph_alloc();
    const enum AVSampleFormat out_sample_fmts[] = {sample_fmt, AV_SAMPLE_FMT_NONE};
    const int64_t out_channel_layouts[] = {av_get_default_channel_layout(out_fmt_ctx->streams[0]->codec->channels), -1};
    const int out_sample_rates[] = {out_fmt_ctx->streams[0]->codec->sample_rate, -1};

    if (!dec_ctx->channel_layout)
        dec_ctx->channel_layout = av_get_default_channel_layout(dec_ctx->channels);

    snprintf(args, sizeof(args),
             "time_base=%d/%d:sample_rate=%d:sample_fmt=%s:channel_layout=0x%" PRIx64,
             in_fmt_ctx->streams[select_stream]->time_base.num,
             in_fmt_ctx->streams[select_stream]->time_base.den,
             dec_ctx->sample_rate,
             av_get_sample_fmt_name(dec_ctx->sample_fmt),
             dec_ctx->channel_layout);

    ret = avfilter_graph_create_filter(&buffersrc_ctx, buffersrc, "in", args, NULL, filter_graph);

    if (ret < 0) {
        av_log(NULL, AV_LOG_ERROR, "Cannot create buffer source\n");
        return -1;
    }

    ret = avfilter_graph_create_filter(&buffersink_ctx, buffersink, "out", NULL, NULL, filter_graph);

    if (ret < 0) {
        av_log(NULL, AV_LOG_ERROR, "Cannot create buffer sink\n");
        return ret;
    }

    ret = av_opt_set_int_list(buffersink_ctx, "sample_fmts", out_sample_fmts, -1, AV_OPT_SEARCH_CHILDREN);

    if (ret < 0) {
        av_log(NULL, AV_LOG_ERROR, "Cannot set output sample format\n");
        return ret;
    }

    ret = av_opt_set_int_list(buffersink_ctx, "channel_layouts", out_channel_layouts, -1, AV_OPT_SEARCH_CHILDREN);

    if (ret < 0) {
        av_log(NULL, AV_LOG_ERROR, "Cannot set output channel layout\n");
        return ret;
    }

    ret = av_opt_set_int_list(buffersink_ctx, "sample_rates", out_sample_rates, -1, AV_OPT_SEARCH_CHILDREN);

    if (ret < 0) {
        av_log(NULL, AV_LOG_ERROR, "Cannot set output sample rate\n");
        return ret;
    }

    /* Endpoints for the filter graph. */
    outputs->name = av_strdup("in");
    outputs->filter_ctx = buffersrc_ctx;
    outputs->pad_idx = 0;
    outputs->next = NULL;

    /* Endpoints for the filter graph. */
    inputs->name = av_strdup("out");
    inputs->filter_ctx = buffersink_ctx;
    inputs->pad_idx = 0;
    inputs->next = NULL;

    string filter_desc = filter_description;

    if ((ret = avfilter_graph_parse_ptr(filter_graph, filter_desc.c_str(), &inputs, &outputs, NULL)) < 0) {
        log_averror(ret);
        exit(1);
    }

    if ((ret = avfilter_graph_config(filter_graph, NULL)) < 0) {
        log_averror(ret);
        exit(1);
    }

    /* Print summary of the sink buffer
     * Note: args buffer is reused to store channel layout string */
    AVFilterLink *outlink = buffersink_ctx->inputs[0];
    av_get_channel_layout_string(args, sizeof(args), -1, outlink->channel_layout);
    av_log(NULL, AV_LOG_INFO, "Output: srate:%dHz fmt:%s chlayout:%s\n",
           (int) outlink->sample_rate,
           (char *) av_x_if_null(av_get_sample_fmt_name((AVSampleFormat) outlink->format), "?"),
           args);

    return 0;
}

/*
 *
 */
int main(int argc, char **argv)
{
    int ret;
    cout << "Hello World" << endl;
    printf("abcd");
    avcodec_register_all();
    av_register_all();
    avfilter_register_all();

    /* open input file, and allocate format context */
    if (avformat_open_input(&in_fmt_ctx, inFileName.c_str(), NULL, NULL) < 0) {
        std::cout << "error opening input file - " << inFileName << std::endl;
        return -1;
    }

    /* retrieve stream information */
    if (avformat_find_stream_info(in_fmt_ctx, NULL) < 0) {
        std::cerr << "Could not find stream information in the input file " << inFileName << std::endl;
    }

    /* Dump format details */
    printf("\n ---------------------------------------------------------------------- \n");
    av_dump_format(in_fmt_ctx, 0, inFileName.c_str(), 0);
    printf("\n ---------------------------------------------------------------------- \n");

    /* Choose a audio stream */
    select_stream = av_find_best_stream(in_fmt_ctx, AVMEDIA_TYPE_AUDIO, -1, -1, &dec_codec, 0);

    if (select_stream == AVERROR_STREAM_NOT_FOUND) {
        std::cerr << "No audio stream found" << std::endl;
        return -1;
    }

    if (select_stream == AVERROR_DECODER_NOT_FOUND) {
        std::cerr << "No suitable decoder found" << std::endl;
        return -1;
    }

    dec_ctx = in_fmt_ctx->streams[select_stream]->codec;
    av_opt_set_int(dec_ctx, "refcounted_frames", 1, 0);

    /* init the audio decoder */
    if ((ret = avcodec_open2(dec_ctx, dec_codec, NULL)) < 0) {
        av_log(NULL, AV_LOG_ERROR, "Cannot open audio decoder\n");
        return ret;
    }

    /* allocate output context */
    ret = avformat_alloc_output_context2(&out_fmt_ctx, NULL, NULL, outFileName.c_str());

    if (ret < 0) {
        std::cerr << "Could not create output context for the file " << outFileName << std::endl;
        return -1;
    }

    /* find the encoder */
    enum AVCodecID codec_id = out_fmt_ctx->oformat->audio_codec;
    enc_codec = avcodec_find_encoder(codec_id);

    if (!(enc_codec)) {
        std::cerr << "Could not find encoder for - " << avcodec_get_name(codec_id) << std::endl;
        return -1;
    }

    /* add a new stream */
    audio_st = avformat_new_stream(out_fmt_ctx, enc_codec);

    if (!audio_st) {
        std::cerr << "Could not add audio stream - " << std::endl;
    }

    /* Initialise audio codec */
    audio_st->id = out_fmt_ctx->nb_streams - 1;
    enc_ctx = audio_st->codec;
    enc_ctx->codec_id = codec_id;
    enc_ctx->codec_type = AVMEDIA_TYPE_AUDIO;
    enc_ctx->bit_rate = target_bit_rate;
    enc_ctx->sample_rate = sample_rate;
    enc_ctx->sample_fmt = sample_fmt;
    enc_ctx->channels = channels;
    enc_ctx->channel_layout = av_get_default_channel_layout(enc_ctx->channels);

    /* Some formats want stream headers to be separate. */
    if (out_fmt_ctx->oformat->flags & AVFMT_GLOBALHEADER) {
        enc_ctx->flags |= CODEC_FLAG_GLOBAL_HEADER;
    }

    ret = avcodec_open2(out_fmt_ctx->streams[0]->codec, enc_codec, NULL);

    if (ret < 0) {
        std::cerr << "Could not create codec context for the file " << outFileName << std::endl;
        return -1;
    }

    /* Initialize filter */
    initialize_audio_filter();

    if (!(out_fmt_ctx->oformat->flags & AVFMT_NOFILE)) {
        int ret = avio_open(&out_fmt_ctx->pb, outFileName.c_str(), AVIO_FLAG_WRITE);

        if (ret < 0) {
            log_averror(ret);
            return -1;
        }
    }

    /* Write header */
    if (avformat_write_header(out_fmt_ctx, NULL) < 0) {
        if (ret < 0) {
            log_averror(ret);
            return -1;
        }
    }

    /* Allocate frame */
    pFrame = av_frame_alloc();

    if (!pFrame) {
        std::cerr << "Could not allocate frame\n";
        return -1;
    }

    pFrameFiltered = av_frame_alloc();

    if (!pFrameFiltered) {
        std::cerr << "Could not allocate frame\n";
        return -1;
    }

    av_init_packet(&packet);
    packet.data = NULL;
    packet.size = 0;

    /* Read packet from the stream */
    while (av_read_frame(in_fmt_ctx, &packet) >= 0) {
        if (packet.stream_index == select_stream) {
            avcodec_get_frame_defaults(pFrame);
            ret = avcodec_decode_audio4(dec_ctx, pFrame, &got_frame, &packet);

            if (ret < 0) {
                log_averror(ret);
                return ret;
            }

            printf("Decoded packet pts : %ld ", packet.pts);
            printf("Frame Best Effort pts : %ld \n", pFrame->best_effort_timestamp);

            /* Set frame pts */
            pFrame->pts = av_frame_get_best_effort_timestamp(pFrame);

            if (got_frame) {
                /* push the decoded frame into the filtergraph */
                ret = av_buffersrc_add_frame_flags(buffersrc_ctx, pFrame, AV_BUFFERSRC_FLAG_KEEP_REF);

                if (ret < 0) {
                    log_averror(ret);
                    return ret;
                }

                /* pull filtered frames from the filtergraph */
                while (1) {
                    ret = av_buffersink_get_frame(buffersink_ctx, pFrameFiltered);

                    if ((ret == AVERROR(EAGAIN)) || (ret == AVERROR_EOF)) {
                        break;
                    }

                    if (ret < 0) {
                        printf("Error while getting filtered frames from filtergraph\n");
                        log_averror(ret);
                        return -1;
                    }

                    /* Initialize the packets */
                    AVPacket encodedPacket = {0};
                    av_init_packet(&encodedPacket);

                    ret = avcodec_encode_audio2(out_fmt_ctx->streams[0]->codec, &encodedPacket, pFrameFiltered, &got_packet);

                    if (!ret && got_packet && encodedPacket.size) {
                        /* Set correct pts and dts */
                        if (encodedPacket.pts != AV_NOPTS_VALUE) {
                            encodedPacket.pts = av_rescale_q(encodedPacket.pts, buffersink_ctx->inputs[0]->time_base,
                                                             out_fmt_ctx->streams[0]->time_base);
                        }

                        if (encodedPacket.dts != AV_NOPTS_VALUE) {
                            encodedPacket.dts = av_rescale_q(encodedPacket.dts, buffersink_ctx->inputs[0]->time_base,
                                                             out_fmt_ctx->streams[0]->time_base);
                        }

                        printf("Encoded packet pts %ld\n", encodedPacket.pts);

                        /* Write the compressed frame to the media file. */
                        ret = av_interleaved_write_frame(out_fmt_ctx, &encodedPacket);

                        if (ret < 0) {
                            log_averror(ret);
                            return -1;
                        }
                    } else if (ret < 0) {
                        log_averror(ret);
                        return -1;
                    }

                    av_frame_unref(pFrameFiltered);
                }

                av_frame_unref(pFrame);
            }
        }
    }

    /* Flush delayed frames from encoder*/
    got_packet = 1;

    while (got_packet) {
        AVPacket encodedPacket = {0};
        av_init_packet(&encodedPacket);

        ret = avcodec_encode_audio2(out_fmt_ctx->streams[0]->codec, &encodedPacket, NULL, &got_packet);

        if (!ret && got_packet && encodedPacket.size) {
            /* Set correct pts and dts */
            if (encodedPacket.pts != AV_NOPTS_VALUE) {
                encodedPacket.pts = av_rescale_q(encodedPacket.pts, buffersink_ctx->inputs[0]->time_base,
                                                 out_fmt_ctx->streams[0]->time_base);
            }

            if (encodedPacket.dts != AV_NOPTS_VALUE) {
                encodedPacket.dts = av_rescale_q(encodedPacket.dts, buffersink_ctx->inputs[0]->time_base,
                                                 out_fmt_ctx->streams[0]->time_base);
            }

            printf("Encoded packet pts %ld\n", encodedPacket.pts);

            /* Write the compressed frame to the media file. */
            ret = av_interleaved_write_frame(out_fmt_ctx, &encodedPacket);

            if (ret < 0) {
                log_averror(ret);
                return -1;
            }
        } else if (ret < 0) {
            log_averror(ret);
            return -1;
        }
    }

    /* Write Trailer */
    av_write_trailer(out_fmt_ctx);
    avfilter_graph_free(&filter_graph);

    if (dec_ctx)
        avcodec_close(dec_ctx);

    avformat_close_input(&in_fmt_ctx);
    av_frame_free(&pFrame);
    av_frame_free(&pFrameFiltered);

    if (!(out_fmt_ctx->oformat->flags & AVFMT_NOFILE))
        avio_close(out_fmt_ctx->pb);

    avcodec_close(out_fmt_ctx->streams[0]->codec);
    avformat_free_context(out_fmt_ctx);
    return 0;
}
```

The audio file after transcoding has the same duration as the input, but it is completely noisy. Can someone tell me what I am doing wrong here?