本节的主要任务是对 MP4 文件进行解复用:把其中的视频流和音频流分离出来,分别写入两个文件,即 H.264 视频裸流(Annex-B 格式)和 AAC 音频裸流(带 ADTS 头)。

其实用到的内容前面几节也有,我还是大概写一下重要内容的注释吧。

主函数部分:

/*
 * Drain every packet currently available from the bitstream filter and
 * append its payload to `out`.
 *
 * `packet` is used as scratch space and is left unreferenced on return.
 * Returns 0 on success, -1 if a payload could not be written completely.
 */
static int write_filtered_packets(AVBSFContext* bsf_ctx, AVPacket* packet,
                                  FILE* out) {
    // Any negative result (more input needed, end of stream, or an error)
    // ends this drain pass.
    while (av_bsf_receive_packet(bsf_ctx, packet) >= 0) {
        size_t wanted = (size_t)packet->size;
        size_t written = fwrite(packet->data, 1, wanted, out);

        av_packet_unref(packet);
        if (written != wanted) {
            return -1;
        }
    }
    return 0;
}

/*
 * Split an MP4 file into two elementary streams.
 *
 * Usage: <program> <input.mp4> <output.h264> <output.aac>
 *
 *   - Video packets are passed through the "h264_mp4toannexb" bitstream
 *     filter and written to the H.264 output file.
 *   - Audio packets are written to the AAC output file, each preceded by a
 *     7-byte header produced by adts_header().
 *
 * A video stream is required because the bitstream filter is configured from
 * its codec parameters. A missing audio stream only produces a warning and an
 * empty AAC file.
 *
 * Returns 0 on success and -1 on any failure. Every resource acquired so far
 * is released on all exit paths.
 */
int main(int argc, char* argv[]) {
    enum { ADTS_HEADER_SIZE = 7 };

    int exit_code = -1;
    int ret;
    FILE* h264_file = NULL;
    FILE* aac_file = NULL;
    const char* mp4_filename;
    const char* h264_filename;
    const char* aac_filename;
    AVFormatContext* ifmt_ctx = NULL;  // demuxer context, allocated by avformat_open_input
    AVPacket* packet = NULL;
    const AVBitStreamFilter* bsfilter;  // bitstream filter descriptor
    AVBSFContext* bsf_ctx = NULL;       // bitstream filter instance
    int video_index = -1;
    int audio_index = -1;
    int while_cnt = 0;

    if (argc < 4) {
        fprintf(stderr, "usage: %s <input.mp4> <output.h264> <output.aac>\n",
                (argc > 0 && argv[0]) ? argv[0] : "demux");
        return -1;
    }

    mp4_filename = argv[1];
    h264_filename = argv[2];
    aac_filename = argv[3];

    // Open the input first so that a bad input path does not create or
    // truncate the output files. With a NULL context pointer,
    // avformat_open_input allocates the context itself.
    ret = avformat_open_input(&ifmt_ctx, mp4_filename, NULL, NULL);
    if (ret < 0) {
        printf("open input failed\n");
        goto cleanup;
    }

    h264_file = fopen(h264_filename, "wb");
    if (!h264_file) {
        fprintf(stderr, "cannot open %s for writing\n", h264_filename);
        goto cleanup;
    }

    aac_file = fopen(aac_filename, "wb");
    if (!aac_file) {
        fprintf(stderr, "cannot open %s for writing\n", aac_filename);
        goto cleanup;
    }

    // Locate the streams. The video index is used below to index
    // ifmt_ctx->streams, so a negative value must be rejected here.
    video_index =
        av_find_best_stream(ifmt_ctx, AVMEDIA_TYPE_VIDEO, -1, -1, NULL, 0);
    if (video_index < 0) {
        fprintf(stderr, "no video stream found in %s\n", mp4_filename);
        goto cleanup;
    }

    audio_index =
        av_find_best_stream(ifmt_ctx, AVMEDIA_TYPE_AUDIO, -1, -1, NULL, 0);
    if (audio_index < 0) {
        // Packet stream indices are never negative, so the audio branch in
        // the loop below is simply never taken.
        fprintf(stderr, "warning: no audio stream found in %s\n", mp4_filename);
    }

    bsfilter = av_bsf_get_by_name("h264_mp4toannexb");
    if (!bsfilter) {
        fprintf(stderr, "bitstream filter h264_mp4toannexb not available\n");
        goto cleanup;
    }

    ret = av_bsf_alloc(bsfilter, &bsf_ctx);
    if (ret < 0) {
        fprintf(stderr, "failed to allocate bitstream filter context\n");
        goto cleanup;
    }

    // The input parameters must be copied in before av_bsf_init, which
    // configures the filter from them.
    ret = avcodec_parameters_copy(bsf_ctx->par_in,
                                  ifmt_ctx->streams[video_index]->codecpar);
    if (ret < 0) {
        fprintf(stderr, "failed to copy video codec parameters\n");
        goto cleanup;
    }

    ret = av_bsf_init(bsf_ctx);
    if (ret < 0) {
        fprintf(stderr, "failed to initialise bitstream filter\n");
        goto cleanup;
    }

    // av_packet_alloc returns a packet with its fields already set to
    // defaults, so no separate initialisation call is needed.
    packet = av_packet_alloc();
    if (!packet) {
        fprintf(stderr, "failed to allocate packet\n");
        goto cleanup;
    }

    for (;;) {
        while_cnt++;
        if (while_cnt % 10000 == 0) printf("while run %d times\n", while_cnt);

        // Read one compressed packet; any failure is treated as end of input.
        ret = av_read_frame(ifmt_ctx, packet);
        if (ret < 0) {
            av_packet_unref(packet);
            break;
        }

        if (packet->stream_index == video_index) {
            ret = av_bsf_send_packet(bsf_ctx, packet);
            if (ret < 0) {
                // The filter did not take the packet, so it is still ours to
                // release before the next read reuses it.
                av_packet_unref(packet);
                continue;
            }

            if (write_filtered_packets(bsf_ctx, packet, h264_file) < 0) {
                fprintf(stderr, "short write to %s\n", h264_filename);
                goto cleanup;
            }
        } else if (packet->stream_index == audio_index) {
            char adts_header_buf[ADTS_HEADER_SIZE] = {0};
            size_t payload_size = (size_t)packet->size;
            size_t written;

            // Build the per-frame ADTS header from the stream parameters.
            ret = adts_header(
                adts_header_buf, packet->size,
                ifmt_ctx->streams[audio_index]->codecpar->profile,
                ifmt_ctx->streams[audio_index]->codecpar->sample_rate,
                ifmt_ctx->streams[audio_index]->codecpar->channels);
            if (ret != 0) {
                fprintf(stderr, "cannot build ADTS header for audio packet\n");
                av_packet_unref(packet);
                goto cleanup;
            }

            written = fwrite(adts_header_buf, 1, ADTS_HEADER_SIZE, aac_file);
            if (written == ADTS_HEADER_SIZE) {
                written = fwrite(packet->data, 1, payload_size, aac_file);
            } else {
                written = payload_size + 1;  // force the failure path below
            }
            av_packet_unref(packet);

            if (written != payload_size) {
                fprintf(stderr, "short write to %s\n", aac_filename);
                goto cleanup;
            }
        } else {
            av_packet_unref(packet);
        }
    }

    // Signal end of stream to the filter (NULL packet) and write out
    // anything it still holds.
    if (av_bsf_send_packet(bsf_ctx, NULL) >= 0 &&
        write_filtered_packets(bsf_ctx, packet, h264_file) < 0) {
        fprintf(stderr, "short write to %s\n", h264_filename);
        goto cleanup;
    }

    exit_code = 0;

cleanup:
    // fclose flushes buffered data, so a failure here means lost output.
    if (h264_file && fclose(h264_file) != 0) {
        fprintf(stderr, "error closing %s\n", h264_filename);
        exit_code = -1;
    }
    if (aac_file && fclose(aac_file) != 0) {
        fprintf(stderr, "error closing %s\n", aac_filename);
        exit_code = -1;
    }

    // These FFmpeg release functions accept a pointer to NULL.
    av_packet_free(&packet);
    av_bsf_free(&bsf_ctx);
    avformat_close_input(&ifmt_ctx);

    if (exit_code == 0) {
        printf("demux finished\n");
    }

    return exit_code;
}

adts_header 的实现(为每个 AAC 帧生成 7 字节的 ADTS 头):

image.png

image.png

/*
 * Sampling rates in Hz, indexed by the 4-bit sampling-frequency index that is
 * stored in the ADTS header.
 */
const int sampling_frequencies[] = {
    96000,  // 0x0
    88200,  // 0x1
    64000,  // 0x2
    48000,  // 0x3
    44100,  // 0x4
    32000,  // 0x5
    24000,  // 0x6
    22050,  // 0x7
    16000,  // 0x8
    12000,  // 0x9
    11025,  // 0xa
    8000,   // 0xb
    7350    // 0xc
    // 0xd and 0xe are reserved; 0xf means "frequency written explicitly",
    // which the fixed-size header built below has no field for.
};

/*
 * Fill `adts_header_buf` with a 7-byte ADTS header (no CRC) for one AAC frame.
 *
 * adts_header_buf  destination, at least 7 bytes
 * data_length      size in bytes of the raw AAC frame that follows the header
 * profile          2-bit ADTS profile (audio object type minus 1, e.g. 1 for
 *                  AAC-LC); values outside 0..3 fall back to AAC-LC
 * samplerate       sample rate in Hz; must be listed in sampling_frequencies
 * channels         channel count: 1..6 map to the same channel configuration,
 *                  8 maps to configuration 7
 *
 * Returns 0 on success. Returns -1 without touching the buffer when the
 * buffer is NULL, the frame does not fit the 13-bit length field, the sample
 * rate is not in the table, or the channel count has no configuration value.
 */
int adts_header(char* adts_header_buf, const int data_length, const int profile,
                const int samplerate, const int channels) {
    enum { HEADER_LEN = 7, MAX_FRAME_LEN = 0x1FFF, PROFILE_AAC_LC = 1 };

    // Raw bytes are written through unsigned char so values such as 0xFF are
    // stored without relying on the signedness of plain char.
    unsigned char* out = (unsigned char*)adts_header_buf;
    const int table_len =
        (int)(sizeof(sampling_frequencies) / sizeof(sampling_frequencies[0]));
    int freq_index = -1;
    int channel_config;
    int adts_profile;
    int frame_len;

    if (adts_header_buf == NULL) {
        return -1;
    }

    // The frame-length field is 13 bits wide and counts the header as well.
    // Checking before the addition also avoids signed overflow.
    if (data_length < 0 || data_length > MAX_FRAME_LEN - HEADER_LEN) {
        return -1;
    }
    frame_len = data_length + HEADER_LEN;

    for (int i = 0; i < table_len; i++) {
        if (samplerate == sampling_frequencies[i]) {
            freq_index = i;
            break;
        }
    }
    if (freq_index < 0) {
        return -1;  // the header cannot describe this sample rate
    }

    if (channels >= 1 && channels <= 6) {
        channel_config = channels;
    } else if (channels == 8) {
        channel_config = 7;  // configuration 7 is the 8-channel layout
    } else {
        return -1;
    }

    // Only two bits are available. Out-of-range values (including negative
    // "unknown" markers, which must not be shifted) are written as AAC-LC.
    adts_profile = (profile >= 0 && profile <= 3) ? profile : PROFILE_AAC_LC;

    out[0] = 0xFF;  // syncword, high 8 bits
    out[1] = 0xF1;  // syncword low 4 bits, ID = 0, layer = 0, protection_absent = 1
    out[2] = (unsigned char)((adts_profile << 6) |   // profile
                             (freq_index << 2) |     // sampling-frequency index
                             (channel_config >> 2)); // private bit = 0, channel config bit 2
    out[3] = (unsigned char)(((channel_config & 0x03) << 6) |  // channel config bits 1..0
                             (frame_len >> 11));  // four flag bits = 0, frame length bits 12..11
    out[4] = (unsigned char)((frame_len >> 3) & 0xFF);          // frame length bits 10..3
    out[5] = (unsigned char)(((frame_len & 0x07) << 5) | 0x1F); // frame length bits 2..0, fullness high bits
    out[6] = 0xFC;  // buffer fullness low bits (all ones), raw data blocks field = 0

    return 0;
}