| /* |
| * Copyright (c) 2023, Alliance for Open Media. All rights reserved |
| * |
| * This source code is subject to the terms of the BSD 3-Clause Clear |
| * License and the Alliance for Open Media Patent License 1.0. If the BSD |
| * 3-Clause Clear License was not distributed with this source code in the |
| * LICENSE file, you can obtain it at |
| * www.aomedia.org/license/software-license/bsd-3-c-c. If the Alliance for |
| * Open Media Patent License 1.0 was not distributed with this source code |
| * in the PATENTS file, you can obtain it at www.aomedia.org/license/patent. |
| */ |
| #include "iamf/cli/demixing_module.h" |
| |
| #include <algorithm> |
| #include <array> |
| #include <cstdint> |
| #include <iterator> |
| #include <list> |
| #include <optional> |
| #include <utility> |
| #include <vector> |
| |
| #include "absl/container/flat_hash_map.h" |
| #include "absl/status/status.h" |
| #include "absl/status/status_matchers.h" |
| #include "absl/status/statusor.h" |
| #include "absl/types/span.h" |
| #include "gmock/gmock.h" |
| #include "gtest/gtest.h" |
| #include "iamf/cli/audio_element_with_data.h" |
| #include "iamf/cli/audio_frame_decoder.h" |
| #include "iamf/cli/audio_frame_with_data.h" |
| #include "iamf/cli/channel_label.h" |
| #include "iamf/cli/proto/user_metadata.pb.h" |
| #include "iamf/cli/proto_conversion/channel_label_utils.h" |
| #include "iamf/cli/proto_conversion/downmixing_reconstruction_util.h" |
| #include "iamf/cli/tests/cli_test_utils.h" |
| #include "iamf/common/utils/numeric_utils.h" |
| #include "iamf/obu/audio_element.h" |
| #include "iamf/obu/audio_frame.h" |
| #include "iamf/obu/codec_config.h" |
| #include "iamf/obu/demixing_info_parameter_data.h" |
| #include "iamf/obu/obu_header.h" |
| #include "iamf/obu/recon_gain_info_parameter_data.h" |
| #include "iamf/obu/types.h" |
| |
| namespace iamf_tools { |
| namespace { |
| |
| using ::absl_testing::IsOk; |
| using ::absl_testing::IsOkAndHolds; |
| using enum ChannelLabel::Label; |
| using ::testing::DoubleEq; |
| using ::testing::DoubleNear; |
| using ::testing::IsEmpty; |
| using ::testing::Not; |
| using ::testing::Pointwise; |
| |
| constexpr DecodedUleb128 kAudioElementId = 137; |
| constexpr std::array<uint8_t, 12> kReconGainValues = { |
| 255, 0, 125, 200, 150, 255, 255, 255, 255, 255, 255, 255}; |
| const uint32_t kZeroSamplesToTrimAtEnd = 0; |
| const uint32_t kZeroSamplesToTrimAtStart = 0; |
| const int kStartTimestamp = 0; |
| const int kEndTimestamp = 4; |
| const DecodedUleb128 kMonoSubstreamId = 0; |
| const DecodedUleb128 kL2SubstreamId = 1; |
| |
| // TODO(b/305927287): Test computation of linear output gains. Test some cases |
| // of erroneous input. |
| |
| TEST(FindSamplesOrDemixedSamples, FindsMatchingSamples) { |
| const std::vector<InternalSampleType> kSamplesToFind = {1, 2, 3}; |
| const LabelSamplesMap kLabelToSamples = {{kL2, kSamplesToFind}}; |
| |
| const std::vector<InternalSampleType>* found_samples; |
| EXPECT_THAT(DemixingModule::FindSamplesOrDemixedSamples(kL2, kLabelToSamples, |
| &found_samples), |
| IsOk()); |
| EXPECT_THAT(*found_samples, Pointwise(DoubleEq(), kSamplesToFind)); |
| } |
| |
| TEST(FindSamplesOrDemixedSamples, FindsMatchingDemixedSamples) { |
| const std::vector<InternalSampleType> kSamplesToFind = {1, 2, 3}; |
| const LabelSamplesMap kLabelToSamples = {{kDemixedR2, kSamplesToFind}}; |
| |
| const std::vector<InternalSampleType>* found_samples; |
| EXPECT_THAT(DemixingModule::FindSamplesOrDemixedSamples(kR2, kLabelToSamples, |
| &found_samples), |
| IsOk()); |
| EXPECT_THAT(*found_samples, Pointwise(DoubleEq(), kSamplesToFind)); |
| } |
| |
| TEST(FindSamplesOrDemixedSamples, InvalidWhenThereIsNoDemixingLabel) { |
| const std::vector<InternalSampleType> kSamplesToFind = {1, 2, 3}; |
| const LabelSamplesMap kLabelToSamples = {{kDemixedR2, kSamplesToFind}}; |
| |
| const std::vector<InternalSampleType>* found_samples; |
| EXPECT_FALSE(DemixingModule::FindSamplesOrDemixedSamples(kL2, kLabelToSamples, |
| &found_samples) |
| .ok()); |
| } |
| |
| TEST(FindSamplesOrDemixedSamples, RegularSamplesTakePrecedence) { |
| const std::vector<InternalSampleType> kSamplesToFind = {1, 2, 3}; |
| const std::vector<InternalSampleType> kDemixedSamplesToIgnore = {4, 5, 6}; |
| const LabelSamplesMap kLabelToSamples = { |
| {kR2, kSamplesToFind}, {kDemixedR2, kDemixedSamplesToIgnore}}; |
| const std::vector<InternalSampleType>* found_samples; |
| EXPECT_THAT(DemixingModule::FindSamplesOrDemixedSamples(kR2, kLabelToSamples, |
| &found_samples), |
| IsOk()); |
| EXPECT_THAT(*found_samples, Pointwise(DoubleEq(), kSamplesToFind)); |
| } |
| |
| TEST(FindSamplesOrDemixedSamples, ErrorNoMatchingSamples) { |
| const std::vector<InternalSampleType> kSamplesToFind = {1, 2, 3}; |
| const LabelSamplesMap kLabelToSamples = {{kL2, kSamplesToFind}}; |
| |
| const std::vector<InternalSampleType>* found_samples; |
| EXPECT_FALSE(DemixingModule::FindSamplesOrDemixedSamples(kL3, kLabelToSamples, |
| &found_samples) |
| .ok()); |
| } |
| |
| void InitAudioElementWithLabelsAndLayers( |
| const SubstreamIdLabelsMap& substream_id_to_labels, |
| const std::vector<ChannelAudioLayerConfig::LoudspeakerLayout>& |
| loudspeaker_layouts, |
| absl::flat_hash_map<DecodedUleb128, AudioElementWithData>& audio_elements) { |
| auto [iter, unused_inserted] = audio_elements.emplace( |
| kAudioElementId, |
| AudioElementWithData{ |
| .obu = AudioElementObu(ObuHeader(), kAudioElementId, |
| AudioElementObu::kAudioElementChannelBased, |
| /*reserved=*/0, |
| /*codec_config_id=*/0), |
| .substream_id_to_labels = substream_id_to_labels, |
| }); |
| auto& obu = iter->second.obu; |
| ASSERT_THAT( |
| obu.InitializeScalableChannelLayout(loudspeaker_layouts.size(), 0), |
| IsOk()); |
| auto& config = std::get<ScalableChannelLayoutConfig>(obu.config_); |
| for (int i = 0; i < loudspeaker_layouts.size(); ++i) { |
| config.channel_audio_layer_configs[i].loudspeaker_layout = |
| loudspeaker_layouts[i]; |
| } |
| } |
| |
| TEST(CreateForDownMixingAndReconstruction, EmptyConfigMapIsOk) { |
| absl::flat_hash_map<DecodedUleb128, |
| DemixingModule::DownmixingAndReconstructionConfig> |
| id_to_config_map; |
| const auto demixing_module = |
| DemixingModule::CreateForDownMixingAndReconstruction( |
| std::move(id_to_config_map)); |
| EXPECT_THAT(demixing_module, IsOk()); |
| } |
| |
| TEST(CreateForDownMixingAndReconstruction, ValidWithTwoLayerStereo) { |
| DecodedUleb128 id = 137; |
| DemixingModule::DownmixingAndReconstructionConfig config = { |
| .user_labels = {kL2, kR2}, |
| .substream_id_to_labels = {{0, {kMono}}, {1, {kL2}}}, |
| .label_to_output_gain = {}}; |
| absl::flat_hash_map<DecodedUleb128, |
| DemixingModule::DownmixingAndReconstructionConfig> |
| id_to_config_map = {{id, config}}; |
| const auto demixing_module = |
| DemixingModule::CreateForDownMixingAndReconstruction( |
| std::move(id_to_config_map)); |
| EXPECT_THAT(demixing_module, IsOk()); |
| } |
| |
| TEST(InitializeForReconstruction, NeverCreatesDownMixers) { |
| absl::flat_hash_map<DecodedUleb128, AudioElementWithData> audio_elements; |
| InitAudioElementWithLabelsAndLayers({{0, {kMono}}, {1, {kL2}}}, |
| {ChannelAudioLayerConfig::kLayoutMono, |
| ChannelAudioLayerConfig::kLayoutStereo}, |
| audio_elements); |
| const auto demixing_module = |
| DemixingModule::CreateForReconstruction(audio_elements); |
| ASSERT_THAT(demixing_module, IsOk()); |
| |
| const std::list<Demixer>* down_mixers = nullptr; |
| EXPECT_THAT(demixing_module->GetDownMixers(kAudioElementId, down_mixers), |
| IsOk()); |
| EXPECT_TRUE(down_mixers->empty()); |
| } |
| |
| TEST(CreateForReconstruction, CreatesOneDemixerForTwoLayerStereo) { |
| absl::flat_hash_map<DecodedUleb128, AudioElementWithData> audio_elements; |
| InitAudioElementWithLabelsAndLayers({{0, {kMono}}, {1, {kL2}}}, |
| {ChannelAudioLayerConfig::kLayoutMono, |
| ChannelAudioLayerConfig::kLayoutStereo}, |
| audio_elements); |
| const auto demixing_module = |
| DemixingModule::CreateForReconstruction(audio_elements); |
| ASSERT_THAT(demixing_module, IsOk()); |
| |
| const std::list<Demixer>* demixer = nullptr; |
| EXPECT_THAT(demixing_module->GetDemixers(kAudioElementId, demixer), IsOk()); |
| EXPECT_EQ(demixer->size(), 1); |
| } |
| |
| TEST(CreateForReconstruction, FailsForReservedLayout14) { |
| absl::flat_hash_map<DecodedUleb128, AudioElementWithData> audio_elements; |
| InitAudioElementWithLabelsAndLayers( |
| {{0, {kOmitted}}}, {ChannelAudioLayerConfig::kLayoutReserved14}, |
| audio_elements); |
| |
| const auto demixing_module = |
| DemixingModule::CreateForReconstruction(audio_elements); |
| |
| EXPECT_FALSE(demixing_module.ok()); |
| } |
| |
| TEST(CreateForReconstruction, ValidForExpandedLayoutLFE) { |
| absl::flat_hash_map<DecodedUleb128, AudioElementWithData> audio_elements; |
| InitAudioElementWithLabelsAndLayers( |
| {{0, {kLFE}}}, {ChannelAudioLayerConfig::kLayoutExpanded}, |
| audio_elements); |
| std::get<ScalableChannelLayoutConfig>( |
| audio_elements.at(kAudioElementId).obu.config_) |
| .channel_audio_layer_configs[0] |
| .expanded_loudspeaker_layout = |
| ChannelAudioLayerConfig::kExpandedLayoutLFE; |
| |
| const auto demixing_module = |
| DemixingModule::CreateForReconstruction(audio_elements); |
| |
| EXPECT_THAT(demixing_module, IsOk()); |
| } |
| |
| TEST(CreateForReconstruction, CreatesNoDemixersForSingleLayerChannelBased) { |
| absl::flat_hash_map<DecodedUleb128, AudioElementWithData> audio_elements; |
| InitAudioElementWithLabelsAndLayers({{0, {kL2, kR2}}}, |
| {ChannelAudioLayerConfig::kLayoutStereo}, |
| audio_elements); |
| const auto demixing_module = |
| DemixingModule::CreateForReconstruction(audio_elements); |
| ASSERT_THAT(demixing_module, IsOk()); |
| |
| const std::list<Demixer>* demixer = nullptr; |
| EXPECT_THAT(demixing_module->GetDemixers(kAudioElementId, demixer), IsOk()); |
| EXPECT_TRUE(demixer->empty()); |
| } |
| |
| TEST(CreateForReconstruction, CreatesNoDemixersForAmbisonics) { |
| const DecodedUleb128 kCodecConfigId = 0; |
| constexpr std::array<DecodedUleb128, 4> kAmbisonicsSubstreamIds{0, 1, 2, 3}; |
| absl::flat_hash_map<DecodedUleb128, CodecConfigObu> codec_configs; |
| AddLpcmCodecConfigWithIdAndSampleRate(kCodecConfigId, 48000, codec_configs); |
| absl::flat_hash_map<DecodedUleb128, AudioElementWithData> audio_elements; |
| AddAmbisonicsMonoAudioElementWithSubstreamIds(kAudioElementId, kCodecConfigId, |
| kAmbisonicsSubstreamIds, |
| codec_configs, audio_elements); |
| |
| const auto demixing_module = |
| DemixingModule::CreateForReconstruction(audio_elements); |
| ASSERT_THAT(demixing_module, IsOk()); |
| |
| const std::list<Demixer>* demixer = nullptr; |
| EXPECT_THAT(demixing_module->GetDemixers(kAudioElementId, demixer), IsOk()); |
| EXPECT_TRUE(demixer->empty()); |
| } |
| |
| TEST(DemixOriginalAudioSamples, ReturnsErrorAfterCreateForReconstruction) { |
| absl::flat_hash_map<DecodedUleb128, AudioElementWithData> audio_elements; |
| InitAudioElementWithLabelsAndLayers( |
| {{kMonoSubstreamId, {kMono}}, {kL2SubstreamId, {kL2}}}, |
| {ChannelAudioLayerConfig::kLayoutMono, |
| ChannelAudioLayerConfig::kLayoutStereo}, |
| audio_elements); |
| auto demixing_module = |
| DemixingModule::CreateForReconstruction(audio_elements); |
| ASSERT_THAT(demixing_module, IsOk()); |
| |
| EXPECT_THAT(demixing_module->DemixOriginalAudioSamples({}), Not(IsOk())); |
| } |
| |
| TEST(DemixDecodedAudioSamples, OutputContainsOriginalAndDemixedSamples) { |
| absl::flat_hash_map<DecodedUleb128, AudioElementWithData> audio_elements; |
| InitAudioElementWithLabelsAndLayers( |
| {{kMonoSubstreamId, {kMono}}, {kL2SubstreamId, {kL2}}}, |
| {ChannelAudioLayerConfig::kLayoutMono, |
| ChannelAudioLayerConfig::kLayoutStereo}, |
| audio_elements); |
| std::list<DecodedAudioFrame> decoded_audio_frames; |
| decoded_audio_frames.push_back( |
| DecodedAudioFrame{.substream_id = kMonoSubstreamId, |
| .start_timestamp = kStartTimestamp, |
| .end_timestamp = kEndTimestamp, |
| .samples_to_trim_at_end = kZeroSamplesToTrimAtEnd, |
| .samples_to_trim_at_start = kZeroSamplesToTrimAtStart, |
| .decoded_samples = {{0}}, |
| .down_mixing_params = DownMixingParams()}); |
| decoded_audio_frames.push_back( |
| DecodedAudioFrame{.substream_id = kL2SubstreamId, |
| .start_timestamp = kStartTimestamp, |
| .end_timestamp = kEndTimestamp, |
| .samples_to_trim_at_end = kZeroSamplesToTrimAtEnd, |
| .samples_to_trim_at_start = kZeroSamplesToTrimAtStart, |
| .decoded_samples = {{0}}, |
| .down_mixing_params = DownMixingParams()}); |
| auto demixing_module = |
| DemixingModule::CreateForReconstruction(audio_elements); |
| ASSERT_THAT(demixing_module, IsOk()); |
| const auto id_to_labeled_decoded_frame = |
| demixing_module->DemixDecodedAudioSamples(decoded_audio_frames); |
| ASSERT_THAT(id_to_labeled_decoded_frame, IsOk()); |
| ASSERT_TRUE(id_to_labeled_decoded_frame->contains(kAudioElementId)); |
| |
| const auto& labeled_frame = id_to_labeled_decoded_frame->at(kAudioElementId); |
| EXPECT_TRUE(labeled_frame.label_to_samples.contains(kL2)); |
| EXPECT_TRUE(labeled_frame.label_to_samples.contains(kMono)); |
| EXPECT_TRUE(labeled_frame.label_to_samples.contains(kDemixedR2)); |
| } |
| |
| TEST(DemixDecodedAudioSamples, OutputEchoesTimingInformation) { |
| // These values are not very sensible, but as long as they are consistent |
| // between related frames it is OK. |
| const DecodedUleb128 kExpectedStartTimestamp = 99; |
| const DecodedUleb128 kExpectedEndTimestamp = 123; |
| const DecodedUleb128 kExpectedNumSamplesToTrimAtEnd = 999; |
| const DecodedUleb128 kExpectedNumSamplesToTrimAtStart = 9999; |
| const DecodedUleb128 kL2SubstreamId = 1; |
| absl::flat_hash_map<DecodedUleb128, AudioElementWithData> audio_elements; |
| InitAudioElementWithLabelsAndLayers( |
| {{kMonoSubstreamId, {kMono}}, {kL2SubstreamId, {kL2}}}, |
| {ChannelAudioLayerConfig::kLayoutMono, |
| ChannelAudioLayerConfig::kLayoutStereo}, |
| audio_elements); |
| std::list<DecodedAudioFrame> decoded_audio_frames; |
| decoded_audio_frames.push_back(DecodedAudioFrame{ |
| .substream_id = kMonoSubstreamId, |
| .start_timestamp = kExpectedStartTimestamp, |
| .end_timestamp = kExpectedEndTimestamp, |
| .samples_to_trim_at_end = kExpectedNumSamplesToTrimAtEnd, |
| .samples_to_trim_at_start = kExpectedNumSamplesToTrimAtStart, |
| .decoded_samples = {{0}}, |
| .down_mixing_params = DownMixingParams()}); |
| decoded_audio_frames.push_back(DecodedAudioFrame{ |
| .substream_id = kL2SubstreamId, |
| .start_timestamp = kExpectedStartTimestamp, |
| .end_timestamp = kExpectedEndTimestamp, |
| .samples_to_trim_at_end = kExpectedNumSamplesToTrimAtEnd, |
| .samples_to_trim_at_start = kExpectedNumSamplesToTrimAtStart, |
| .decoded_samples = {{0}}, |
| .down_mixing_params = DownMixingParams()}); |
| const auto demixing_module = |
| DemixingModule::CreateForReconstruction(audio_elements); |
| ASSERT_THAT(demixing_module, IsOk()); |
| |
| const auto id_to_labeled_decoded_frame = |
| demixing_module->DemixDecodedAudioSamples(decoded_audio_frames); |
| ASSERT_THAT(id_to_labeled_decoded_frame, IsOk()); |
| ASSERT_TRUE(id_to_labeled_decoded_frame->contains(kAudioElementId)); |
| |
| const auto& labeled_frame = id_to_labeled_decoded_frame->at(kAudioElementId); |
| EXPECT_EQ(labeled_frame.end_timestamp, kExpectedEndTimestamp); |
| EXPECT_EQ(labeled_frame.samples_to_trim_at_end, |
| kExpectedNumSamplesToTrimAtEnd); |
| EXPECT_EQ(labeled_frame.samples_to_trim_at_start, |
| kExpectedNumSamplesToTrimAtStart); |
| } |
| |
| TEST(DemixDecodedAudioSamples, OutputEchoesOriginalLabels) { |
| absl::flat_hash_map<DecodedUleb128, AudioElementWithData> audio_elements; |
| InitAudioElementWithLabelsAndLayers( |
| {{kMonoSubstreamId, {kMono}}, {kL2SubstreamId, {kL2}}}, |
| {ChannelAudioLayerConfig::kLayoutMono, |
| ChannelAudioLayerConfig::kLayoutStereo}, |
| audio_elements); |
| std::list<DecodedAudioFrame> decoded_audio_frames; |
| decoded_audio_frames.push_back( |
| DecodedAudioFrame{.substream_id = kMonoSubstreamId, |
| .start_timestamp = kStartTimestamp, |
| .end_timestamp = kEndTimestamp, |
| .samples_to_trim_at_end = kZeroSamplesToTrimAtEnd, |
| .samples_to_trim_at_start = kZeroSamplesToTrimAtStart, |
| .decoded_samples = {{1}, {2}, {3}}, |
| .down_mixing_params = DownMixingParams()}); |
| decoded_audio_frames.push_back( |
| DecodedAudioFrame{.substream_id = kL2SubstreamId, |
| .start_timestamp = kStartTimestamp, |
| .end_timestamp = kEndTimestamp, |
| .samples_to_trim_at_end = kZeroSamplesToTrimAtEnd, |
| .samples_to_trim_at_start = kZeroSamplesToTrimAtStart, |
| .decoded_samples = {{9}, {10}, {11}}, |
| .down_mixing_params = DownMixingParams()}); |
| const auto demixing_module = |
| DemixingModule::CreateForReconstruction(audio_elements); |
| ASSERT_THAT(demixing_module, IsOk()); |
| |
| IdLabeledFrameMap unused_id_labeled_frame; |
| const auto id_to_labeled_decoded_frame = |
| demixing_module->DemixDecodedAudioSamples(decoded_audio_frames); |
| ASSERT_THAT(id_to_labeled_decoded_frame, IsOk()); |
| ASSERT_TRUE(id_to_labeled_decoded_frame->contains(kAudioElementId)); |
| |
| // Examine the demixed frame. |
| const auto& labeled_frame = id_to_labeled_decoded_frame->at(kAudioElementId); |
| constexpr std::array<int32_t, 3> kExpectedMonoSamples = {1, 2, 3}; |
| constexpr std::array<int32_t, 3> kExpectedL2Samples = {9, 10, 11}; |
| EXPECT_THAT( |
| labeled_frame.label_to_samples.at(kMono), |
| Pointwise(InternalSampleMatchesIntegralSample(), kExpectedMonoSamples)); |
| EXPECT_THAT( |
| labeled_frame.label_to_samples.at(kL2), |
| Pointwise(InternalSampleMatchesIntegralSample(), kExpectedL2Samples)); |
| } |
| |
| TEST(DemixDecodedAudioSamples, OutputHasReconstructedLayers) { |
| absl::flat_hash_map<DecodedUleb128, AudioElementWithData> audio_elements; |
| |
| InitAudioElementWithLabelsAndLayers( |
| {{kMonoSubstreamId, {kMono}}, {kL2SubstreamId, {kL2}}}, |
| {ChannelAudioLayerConfig::kLayoutMono, |
| ChannelAudioLayerConfig::kLayoutStereo}, |
| audio_elements); |
| std::list<DecodedAudioFrame> decoded_audio_frames; |
| decoded_audio_frames.push_back( |
| DecodedAudioFrame{.substream_id = kMonoSubstreamId, |
| .start_timestamp = kStartTimestamp, |
| .end_timestamp = kEndTimestamp, |
| .samples_to_trim_at_end = kZeroSamplesToTrimAtEnd, |
| .samples_to_trim_at_start = kZeroSamplesToTrimAtStart, |
| .decoded_samples = {{750}}, |
| .down_mixing_params = DownMixingParams()}); |
| decoded_audio_frames.push_back( |
| DecodedAudioFrame{.substream_id = kL2SubstreamId, |
| .start_timestamp = kStartTimestamp, |
| .end_timestamp = kEndTimestamp, |
| .samples_to_trim_at_end = kZeroSamplesToTrimAtEnd, |
| .samples_to_trim_at_start = kZeroSamplesToTrimAtStart, |
| .decoded_samples = {{1000}}, |
| .down_mixing_params = DownMixingParams()}); |
| const auto demixing_module = |
| DemixingModule::CreateForReconstruction(audio_elements); |
| ASSERT_THAT(demixing_module, IsOk()); |
| |
| const auto id_to_labeled_decoded_frame = |
| demixing_module->DemixDecodedAudioSamples(decoded_audio_frames); |
| ASSERT_THAT(id_to_labeled_decoded_frame, IsOk()); |
| ASSERT_TRUE(id_to_labeled_decoded_frame->contains(kAudioElementId)); |
| |
| // Examine the demixed frame. |
| const auto& labeled_frame = id_to_labeled_decoded_frame->at(kAudioElementId); |
| // D_R2 = M - (L2 - 6 dB) + 6 dB. |
| EXPECT_THAT(labeled_frame.label_to_samples.at(kDemixedR2), |
| Pointwise(InternalSampleMatchesIntegralSample(), {500})); |
| } |
| |
| TEST(DemixDecodedAudioSamples, OutputContainsReconGainAndLayerInfo) { |
| absl::flat_hash_map<DecodedUleb128, AudioElementWithData> audio_elements; |
| InitAudioElementWithLabelsAndLayers( |
| {{kMonoSubstreamId, {kMono}}, {kL2SubstreamId, {kL2}}}, |
| {ChannelAudioLayerConfig::kLayoutMono, |
| ChannelAudioLayerConfig::kLayoutStereo}, |
| audio_elements); |
| std::list<DecodedAudioFrame> decoded_audio_frames; |
| ReconGainInfoParameterData recon_gain_info_parameter_data; |
| recon_gain_info_parameter_data.recon_gain_elements.push_back(ReconGainElement{ |
| .recon_gain_flag = DecodedUleb128(1), .recon_gain = kReconGainValues}); |
| decoded_audio_frames.push_back(DecodedAudioFrame{ |
| .substream_id = kMonoSubstreamId, |
| .start_timestamp = kStartTimestamp, |
| .end_timestamp = kEndTimestamp, |
| .samples_to_trim_at_end = kZeroSamplesToTrimAtEnd, |
| .samples_to_trim_at_start = kZeroSamplesToTrimAtStart, |
| .decoded_samples = {{0}}, |
| .down_mixing_params = DownMixingParams(), |
| .recon_gain_info_parameter_data = recon_gain_info_parameter_data, |
| .audio_element_with_data = &audio_elements.at(kAudioElementId)}); |
| decoded_audio_frames.push_back(DecodedAudioFrame{ |
| .substream_id = kL2SubstreamId, |
| .start_timestamp = kStartTimestamp, |
| .end_timestamp = kEndTimestamp, |
| .samples_to_trim_at_end = kZeroSamplesToTrimAtEnd, |
| .samples_to_trim_at_start = kZeroSamplesToTrimAtStart, |
| .decoded_samples = {{0}}, |
| .down_mixing_params = DownMixingParams(), |
| .recon_gain_info_parameter_data = recon_gain_info_parameter_data, |
| .audio_element_with_data = &audio_elements.at(kAudioElementId)}); |
| const auto demixing_module = |
| DemixingModule::CreateForReconstruction(audio_elements); |
| ASSERT_THAT(demixing_module, IsOk()); |
| const auto id_to_labeled_decoded_frame = |
| demixing_module->DemixDecodedAudioSamples(decoded_audio_frames); |
| ASSERT_THAT(id_to_labeled_decoded_frame, IsOk()); |
| ASSERT_TRUE(id_to_labeled_decoded_frame->contains(kAudioElementId)); |
| |
| const auto& labeled_frame = id_to_labeled_decoded_frame->at(kAudioElementId); |
| EXPECT_TRUE(labeled_frame.label_to_samples.contains(kL2)); |
| EXPECT_TRUE(labeled_frame.label_to_samples.contains(kMono)); |
| EXPECT_TRUE(labeled_frame.label_to_samples.contains(kDemixedR2)); |
| |
| EXPECT_EQ( |
| labeled_frame.recon_gain_info_parameter_data.recon_gain_elements.size(), |
| 1); |
| const auto& recon_gain_element = |
| labeled_frame.recon_gain_info_parameter_data.recon_gain_elements.at(0); |
| ASSERT_TRUE(recon_gain_element.has_value()); |
| EXPECT_EQ(recon_gain_element->recon_gain_flag, DecodedUleb128(1)); |
| EXPECT_THAT(recon_gain_element->recon_gain, |
| testing::ElementsAreArray(kReconGainValues)); |
| EXPECT_EQ(labeled_frame.loudspeaker_layout_per_layer.size(), 2); |
| EXPECT_THAT(labeled_frame.loudspeaker_layout_per_layer, |
| testing::ElementsAre(ChannelAudioLayerConfig::kLayoutMono, |
| ChannelAudioLayerConfig::kLayoutStereo)); |
| } |
| |
| class DemixingModuleTestBase { |
| public: |
| DemixingModuleTestBase() { |
| audio_frame_metadata_.set_audio_element_id(kAudioElementId); |
| } |
| |
| void CreateDemixingModuleExpectOk() { |
| iamf_tools_cli_proto::UserMetadata user_metadata; |
| *user_metadata.add_audio_frame_metadata() = audio_frame_metadata_; |
| audio_elements_.emplace( |
| kAudioElementId, |
| AudioElementWithData{ |
| .obu = AudioElementObu(ObuHeader(), kAudioElementId, |
| AudioElementObu::kAudioElementChannelBased, |
| /*reserved=*/0, |
| /*codec_config_id=*/0), |
| .substream_id_to_labels = substream_id_to_labels_, |
| }); |
| const absl::StatusOr<absl::flat_hash_map< |
| DecodedUleb128, DemixingModule::DownmixingAndReconstructionConfig>> |
| audio_element_id_to_demixing_metadata = |
| CreateAudioElementIdToDemixingMetadata(user_metadata, |
| audio_elements_); |
| ASSERT_THAT(audio_element_id_to_demixing_metadata.status(), IsOk()); |
| auto demixing_module = DemixingModule::CreateForDownMixingAndReconstruction( |
| std::move(audio_element_id_to_demixing_metadata.value())); |
| ASSERT_THAT(demixing_module, IsOk()); |
| demixing_module_.emplace(*std::move(demixing_module)); |
| } |
| |
| void TestCreateDemixingModule(int expected_number_of_down_mixers) { |
| CreateDemixingModuleExpectOk(); |
| const std::list<Demixer>* down_mixers = nullptr; |
| const std::list<Demixer>* demixers = nullptr; |
| |
| ASSERT_THAT(demixing_module_->GetDownMixers(kAudioElementId, down_mixers), |
| IsOk()); |
| ASSERT_THAT(demixing_module_->GetDemixers(kAudioElementId, demixers), |
| IsOk()); |
| EXPECT_EQ(down_mixers->size(), expected_number_of_down_mixers); |
| EXPECT_EQ(demixers->size(), expected_number_of_down_mixers); |
| } |
| |
| protected: |
| void ConfigureAudioFrameMetadata( |
| absl::Span<const ChannelLabel::Label> labels) { |
| for (const auto& label : labels) { |
| auto proto_label = ChannelLabelUtils::LabelToProto(label); |
| ASSERT_TRUE(proto_label.ok()); |
| audio_frame_metadata_.add_channel_metadatas()->set_channel_label( |
| *proto_label); |
| } |
| } |
| |
| iamf_tools_cli_proto::AudioFrameObuMetadata audio_frame_metadata_; |
| absl::flat_hash_map<DecodedUleb128, AudioElementWithData> audio_elements_; |
| SubstreamIdLabelsMap substream_id_to_labels_; |
| |
| // Held in `std::optional` for delayed construction. |
| std::optional<DemixingModule> demixing_module_; |
| }; |
| |
| class DownMixingModuleTest : public DemixingModuleTestBase, |
| public ::testing::Test { |
| protected: |
| void TestDownMixing(const DownMixingParams& down_mixing_params, |
| int expected_number_of_down_mixers) { |
| TestCreateDemixingModule(expected_number_of_down_mixers); |
| |
| EXPECT_THAT(demixing_module_->DownMixSamplesToSubstreams( |
| kAudioElementId, down_mixing_params, |
| input_label_to_samples_, substream_id_to_substream_data_), |
| IsOk()); |
| |
| for (const auto& [substream_id, substream_data] : |
| substream_id_to_substream_data_) { |
| // Copy the output queue to a vector for comparison. |
| std::vector<std::vector<int32_t>> output_samples; |
| std::copy(substream_data.samples_obu.begin(), |
| substream_data.samples_obu.end(), |
| std::back_inserter(output_samples)); |
| EXPECT_EQ(output_samples, |
| substream_id_to_expected_samples_[substream_id]); |
| } |
| } |
| |
| void ConfigureInputChannel(ChannelLabel::Label label, |
| absl::Span<const int32_t> input_samples) { |
| ConfigureAudioFrameMetadata({label}); |
| |
| auto [iter, inserted] = input_label_to_samples_.emplace( |
| label, std::vector<InternalSampleType>(input_samples.size(), 0)); |
| Int32ToInternalSampleType(input_samples, absl::MakeSpan(iter->second)); |
| // This function should not be called with the same label twice. |
| ASSERT_TRUE(inserted); |
| } |
| |
| void ConfigureOutputChannel( |
| const std::list<ChannelLabel::Label>& requested_output_labels, |
| const std::vector<std::vector<int32_t>>& expected_output_smples) { |
| // The substream ID itself does not matter. Generate a unique one. |
| const uint32_t substream_id = substream_id_to_labels_.size(); |
| |
| substream_id_to_labels_[substream_id] = requested_output_labels; |
| substream_id_to_substream_data_[substream_id] = {.substream_id = |
| substream_id}; |
| |
| substream_id_to_expected_samples_[substream_id] = expected_output_smples; |
| } |
| |
| LabelSamplesMap input_label_to_samples_; |
| |
| absl::flat_hash_map<uint32_t, SubstreamData> substream_id_to_substream_data_; |
| |
| absl::flat_hash_map<uint32_t, std::vector<std::vector<int32_t>>> |
| substream_id_to_expected_samples_; |
| }; |
| |
| TEST_F(DownMixingModuleTest, OneLayerStereoHasNoDownMixers) { |
| ConfigureInputChannel(kL2, {}); |
| ConfigureInputChannel(kR2, {}); |
| |
| ConfigureOutputChannel({kL2, kR2}, {{}}); |
| |
| TestCreateDemixingModule(0); |
| } |
| |
| TEST_F(DownMixingModuleTest, OneLayer7_1_4HasNoDownMixers) { |
| // Initialize arguments for single layer 7.1.4. |
| ConfigureInputChannel(kL7, {}); |
| ConfigureInputChannel(kR7, {}); |
| ConfigureInputChannel(kCentre, {}); |
| ConfigureInputChannel(kLFE, {}); |
| ConfigureInputChannel(kLss7, {}); |
| ConfigureInputChannel(kRss7, {}); |
| ConfigureInputChannel(kLrs7, {}); |
| ConfigureInputChannel(kRrs7, {}); |
| ConfigureInputChannel(kLtf4, {}); |
| ConfigureInputChannel(kRtf4, {}); |
| ConfigureInputChannel(kLtb4, {}); |
| ConfigureInputChannel(kRtb4, {}); |
| |
| ConfigureOutputChannel({kCentre}, {{}}); |
| ConfigureOutputChannel({kL7, kR7}, {}); |
| ConfigureOutputChannel({kLss7, kRss7}, {}); |
| ConfigureOutputChannel({kLrs7, kRrs7}, {}); |
| ConfigureOutputChannel({kLtf4, kRtf4}, {}); |
| ConfigureOutputChannel({kLtb4, kRtb4}, {}); |
| ConfigureOutputChannel({kLFE}, {}); |
| |
| TestCreateDemixingModule(0); |
| } |
| |
| TEST_F(DownMixingModuleTest, AmbisonicsHasNoDownMixers) { |
| ConfigureInputChannel(kA0, {}); |
| ConfigureInputChannel(kA1, {}); |
| ConfigureInputChannel(kA2, {}); |
| ConfigureInputChannel(kA3, {}); |
| |
| ConfigureOutputChannel({kA0}, {{}}); |
| ConfigureOutputChannel({kA1}, {{}}); |
| ConfigureOutputChannel({kA2}, {{}}); |
| ConfigureOutputChannel({kA3}, {{}}); |
| |
| TestCreateDemixingModule(0); |
| } |
| |
| TEST_F(DownMixingModuleTest, OneLayerStereo) { |
| ConfigureInputChannel(kL2, {0, 1, 2, 3}); |
| ConfigureInputChannel(kR2, {100, 101, 102, 103}); |
| |
| // Down-mix to stereo as the highest layer. The highest layer always matches |
| // the original input. |
| ConfigureOutputChannel({kL2, kR2}, {{0, 100}, {1, 101}, {2, 102}, {3, 103}}); |
| |
| TestDownMixing({}, 0); |
| } |
| |
| TEST_F(DownMixingModuleTest, S2ToS1DownMixer) { |
| ConfigureInputChannel(kL2, {0, 100, 500, 1000}); |
| ConfigureInputChannel(kR2, {100, 0, 500, 500}); |
| |
| // Down-mix to stereo as the highest layer. The highest layer always matches |
| // the original input. |
| ConfigureOutputChannel({kL2}, {{0}, {100}, {500}, {1000}}); |
| |
| // Down-mix to mono as the lowest layer. |
| // M = (L2 - 6 dB) + (R2 - 6 dB). |
| ConfigureOutputChannel({kMono}, {{50}, {50}, {500}, {750}}); |
| |
| TestDownMixing({}, 1); |
| } |
| |
| TEST_F(DownMixingModuleTest, S3ToS2DownMixer) { |
| ConfigureInputChannel(kL3, {0, 100}); |
| ConfigureInputChannel(kR3, {0, 100}); |
| ConfigureInputChannel(kCentre, {100, 100}); |
| ConfigureInputChannel(kLtf3, {99999, 99999}); |
| ConfigureInputChannel(kRtf3, {99998, 99998}); |
| |
| // Down-mix to 3.1.2 as the highest layer. The highest layer always matches |
| // the original input. |
| ConfigureOutputChannel({kCentre}, {{100}, {100}}); |
| ConfigureOutputChannel({kLtf3, kRtf3}, {{99999, 99998}, {99999, 99998}}); |
| |
| // Down-mix to stereo as the lowest layer. |
| // L2 = L3 + (C - 3 dB). |
| // R2 = R3 + (C - 3 dB). |
| ConfigureOutputChannel({kL2, kR2}, {{70, 70}, {170, 170}}); |
| |
| TestDownMixing({}, 1); |
| } |
| |
| TEST_F(DownMixingModuleTest, S5ToS3ToS2DownMixer) { |
| ConfigureInputChannel(kL5, {100}); |
| ConfigureInputChannel(kR5, {200}); |
| ConfigureInputChannel(kCentre, {1000}); |
| ConfigureInputChannel(kLs5, {2000}); |
| ConfigureInputChannel(kRs5, {3000}); |
| ConfigureInputChannel(kLFE, {6}); |
| |
| // Down-mix to 5.1 as the highest layer. The highest layer always matches the |
| // original input. |
| ConfigureOutputChannel({kCentre}, {{1000}}); |
| ConfigureOutputChannel({kLs5, kRs5}, {{2000, 3000}}); |
| ConfigureOutputChannel({kLFE}, {{6}}); |
| |
| // Down-mix to stereo as the lowest layer. |
| // L3 = L5 + Ls5 * delta. |
| // L2 = L3 + (C - 3 dB). |
| ConfigureOutputChannel({kL2, kR2}, {{2221, 3028}}); |
| |
| // Internally there is a down-mixer to L3/R3 then another for L2/R2. |
| TestDownMixing({.delta = .707}, 2); |
| } |
| |
| TEST_F(DownMixingModuleTest, S5ToS3ToDownMixer) { |
| ConfigureInputChannel(kL5, {1000}); |
| ConfigureInputChannel(kR5, {2000}); |
| ConfigureInputChannel(kCentre, {3}); |
| ConfigureInputChannel(kLs5, {4000}); |
| ConfigureInputChannel(kRs5, {8000}); |
| ConfigureInputChannel(kLtf2, {1000}); |
| ConfigureInputChannel(kRtf2, {2000}); |
| ConfigureInputChannel(kLFE, {8}); |
| |
| // Down-mix to 5.1.2 as the highest layer. The highest layer always matches |
| // the original input. |
| ConfigureOutputChannel({kLs5, kRs5}, {{4000, 8000}}); |
| |
| // Down-mix to 3.1.2 as the lowest layer. |
| // L3 = L5 + Ls5 * delta. |
| ConfigureOutputChannel({kL3, kR3}, {{3828, 7656}}); |
| ConfigureOutputChannel({kCentre}, {{3}}); |
| // Ltf3 = Ltf2 + Ls5 * w * delta. |
| ConfigureOutputChannel({kLtf3, kRtf3}, {{1707, 3414}}); |
| ConfigureOutputChannel({kLFE}, {{8}}); |
| |
| // Internally there is a down-mixer for the height and another for the |
| // surround. |
| TestDownMixing({.delta = .707, .w = 0.25}, 2); |
| } |
| |
| TEST_F(DownMixingModuleTest, T4ToT2DownMixer) { |
| ConfigureInputChannel(kL5, {1}); |
| ConfigureInputChannel(kR5, {2}); |
| ConfigureInputChannel(kCentre, {3}); |
| ConfigureInputChannel(kLs5, {4}); |
| ConfigureInputChannel(kRs5, {5}); |
| ConfigureInputChannel(kLtf4, {1000}); |
| ConfigureInputChannel(kRtf4, {2000}); |
| ConfigureInputChannel(kLtb4, {1000}); |
| ConfigureInputChannel(kRtb4, {2000}); |
| ConfigureInputChannel(kLFE, {10}); |
| |
| // Down-mix to 5.1.4 as the highest layer. The highest layer always matches |
| // the original input. |
| ConfigureOutputChannel({kLtb4, kRtb4}, {{1000, 2000}}); |
| |
| // Down-mix to 5.1.2 as the lowest layer. |
| ConfigureOutputChannel({kL5, kR5}, {{1, 2}}); |
| ConfigureOutputChannel({kCentre}, {{3}}); |
| ConfigureOutputChannel({kLs5, kRs5}, {{4, 5}}); |
| // Ltf2 = Ltf4 + Ltb4 * gamma. |
| ConfigureOutputChannel({kLtf2, kRtf2}, {{1707, 3414}}); |
| ConfigureOutputChannel({kLFE}, {{10}}); |
| |
| TestDownMixing({.gamma = .707}, 1); |
| } |
| |
| TEST_F(DownMixingModuleTest, S7ToS5DownMixerWithoutT0) { |
| ConfigureInputChannel(kL7, {1}); |
| ConfigureInputChannel(kR7, {2}); |
| ConfigureInputChannel(kCentre, {3}); |
| ConfigureInputChannel(kLss7, {1000}); |
| ConfigureInputChannel(kRss7, {2000}); |
| ConfigureInputChannel(kLrs7, {3000}); |
| ConfigureInputChannel(kRrs7, {4000}); |
| ConfigureInputChannel(kLFE, {8}); |
| |
| // Down-mix to 7.1.0 as the highest layer. The highest layer always matches |
| // the original input. |
| ConfigureOutputChannel({kLrs7, kRrs7}, {{3000, 4000}}); |
| |
| // Down-mix to 5.1.0 as the lowest layer. |
| ConfigureOutputChannel({kL5, kR5}, {{1, 2}}); |
| ConfigureOutputChannel({kCentre}, {{3}}); |
| // Ls5 = Lss7 * alpha + Lrs7 * beta. |
| ConfigureOutputChannel({kLs5, kRs5}, {{3598, 5464}}); |
| ConfigureOutputChannel({kLFE}, {{8}}); |
| |
| TestDownMixing({.alpha = 1, .beta = .866}, 1); |
| } |
| |
| TEST_F(DownMixingModuleTest, S7ToS5DownMixerWithT2) { |
| ConfigureInputChannel(kL7, {1}); |
| ConfigureInputChannel(kR7, {2}); |
| ConfigureInputChannel(kCentre, {3}); |
| ConfigureInputChannel(kLss7, {1000}); |
| ConfigureInputChannel(kRss7, {2000}); |
| ConfigureInputChannel(kLrs7, {3000}); |
| ConfigureInputChannel(kRrs7, {4000}); |
| ConfigureInputChannel(kLtf2, {8}); |
| ConfigureInputChannel(kRtf2, {9}); |
| ConfigureInputChannel(kLFE, {10}); |
| |
| // Down-mix to 7.1.2 as the highest layer. The highest layer always matches |
| // the original input. |
| ConfigureOutputChannel({kLrs7, kRrs7}, {{3000, 4000}}); |
| |
| // Down-mix to 5.1.2 as the lowest layer. |
| ConfigureOutputChannel({kL5, kR5}, {{1, 2}}); |
| ConfigureOutputChannel({kCentre}, {{3}}); |
| // Ls5 = Lss7 * alpha + Lrs7 * beta. |
| ConfigureOutputChannel({kLs5, kRs5}, {{3598, 5464}}); |
| ConfigureOutputChannel({kLtf2, kRtf2}, {{8, 9}}); |
| ConfigureOutputChannel({kLFE}, {{10}}); |
| |
| TestDownMixing({.alpha = 1, .beta = .866}, 1); |
| } |
| |
| TEST_F(DownMixingModuleTest, S7ToS5DownMixerWithT4) { |
| ConfigureInputChannel(kL7, {1}); |
| ConfigureInputChannel(kR7, {2}); |
| ConfigureInputChannel(kCentre, {3}); |
| ConfigureInputChannel(kLss7, {1000}); |
| ConfigureInputChannel(kRss7, {2000}); |
| ConfigureInputChannel(kLrs7, {3000}); |
| ConfigureInputChannel(kRrs7, {4000}); |
| ConfigureInputChannel(kLtf4, {8}); |
| ConfigureInputChannel(kRtf4, {9}); |
| ConfigureInputChannel(kLtb4, {10}); |
| ConfigureInputChannel(kRtb4, {11}); |
| ConfigureInputChannel(kLFE, {12}); |
| |
| // Down-mix to 7.1.4 as the highest layer. The highest layer always matches |
| // the original input. |
| ConfigureOutputChannel({kLrs7, kRrs7}, {{3000, 4000}}); |
| |
| // Down-mix to 5.1.4 as the lowest layer. |
| ConfigureOutputChannel({kL5, kR5}, {{1, 2}}); |
| ConfigureOutputChannel({kCentre}, {{3}}); |
| // Ls5 = Lss7 * alpha + Lrs7 * beta. |
| ConfigureOutputChannel({kLs5, kRs5}, {{3598, 5464}}); |
| ConfigureOutputChannel({kLtf4, kRtf4}, {{8, 9}}); |
| ConfigureOutputChannel({kLtb4, kRtb4}, {{10, 11}}); |
| ConfigureOutputChannel({kLFE}, {{12}}); |
| |
| TestDownMixing({.alpha = 1, .beta = .866}, 1); |
| } |
| |
| TEST_F(DownMixingModuleTest, SixLayer7_1_4) { |
| ConfigureInputChannel(kL7, {1000}); |
| ConfigureInputChannel(kR7, {2000}); |
| ConfigureInputChannel(kCentre, {1000}); |
| ConfigureInputChannel(kLss7, {1000}); |
| ConfigureInputChannel(kRss7, {2000}); |
| ConfigureInputChannel(kLrs7, {3000}); |
| ConfigureInputChannel(kRrs7, {4000}); |
| ConfigureInputChannel(kLtf4, {1000}); |
| ConfigureInputChannel(kRtf4, {2000}); |
| ConfigureInputChannel(kLtb4, {1000}); |
| ConfigureInputChannel(kRtb4, {2000}); |
| ConfigureInputChannel(kLFE, {12}); |
| |
| // There are different paths to have six-layers, choose 7.1.2, 5.1.2, 3.1.2, |
| // stereo, mono to avoid dropping the height channels for as many steps as |
| // possible. |
| |
| // Down-mix to 7.1.4 as the sixth layer. |
| ConfigureOutputChannel({kLtb4, kRtb4}, {{1000, 2000}}); |
| |
| // Down-mix to 7.1.2 as the fifth layer. |
| ConfigureOutputChannel({kLrs7, kRrs7}, {{3000, 4000}}); |
| |
| // Down-mix to 5.1.2 as the fourth layer. |
| // Ls5 = Lss7 * alpha + Lrs7 * beta. |
| ConfigureOutputChannel({kLs5, kRs5}, {{3598, 5464}}); |
| |
| // Down-mix to 3.1.2 as the third layer. |
| ConfigureOutputChannel({kCentre}, {{1000}}); |
| // Ltf2 = Ltf4 + Ltb4 * gamma. |
| // Ltf3 = Ltf2 + Ls5 * w * delta. |
| ConfigureOutputChannel({kLtf3, kRtf3}, {{2644, 4914}}); |
| ConfigureOutputChannel({kLFE}, {{12}}); |
| |
| // Down-mix to stereo as the second layer. |
| // L5 = L7. |
| // L3 = L5 + Ls5 * delta. |
| // L2 = L3 + (C - 3 dB). |
| ConfigureOutputChannel({kL2}, {{4822}}); |
| |
| // Down=mix to mono as the first layer. |
| // R5 = R7. |
| // R3 = R5 + Rs5 * delta. |
| // R2 = R3 + (C - 3 dB). |
| // M = (L2 - 6 dB) + (R2 - 6 dB). |
| ConfigureOutputChannel({kMono}, {{6130}}); |
| |
| TestDownMixing( |
| {.alpha = 1, .beta = .866, .gamma = .866, .delta = .866, .w = 0.25}, 6); |
| } |
| |
| class DemixingModuleTest : public DemixingModuleTestBase, |
| public ::testing::Test { |
| public: |
| void ConfigureLosslessAudioFrameAndDecodedAudioFrame( |
| const std::list<ChannelLabel::Label>& labels, |
| const std::vector<std::vector<int32_t>>& pcm_samples, |
| DownMixingParams down_mixing_params = { |
| .alpha = 1, .beta = .866, .gamma = .866, .delta = .866, .w = 0.25}) { |
| // The substream ID itself does not matter. Generate a unique one. |
| const DecodedUleb128 substream_id = substream_id_to_labels_.size(); |
| substream_id_to_labels_[substream_id] = labels; |
| |
| // Configure a pair of audio frames and decoded audio frames. They share a |
| // lot of the same information for a lossless codec. |
| audio_frames_.push_back(AudioFrameWithData{ |
| .obu = AudioFrameObu(ObuHeader(), substream_id, {}), |
| .start_timestamp = kStartTimestamp, |
| .end_timestamp = kEndTimestamp, |
| .pcm_samples = pcm_samples, |
| .down_mixing_params = down_mixing_params, |
| }); |
| |
| decoded_audio_frames_.push_back( |
| DecodedAudioFrame{.substream_id = substream_id, |
| .start_timestamp = kStartTimestamp, |
| .end_timestamp = kEndTimestamp, |
| .samples_to_trim_at_end = kZeroSamplesToTrimAtEnd, |
| .samples_to_trim_at_start = kZeroSamplesToTrimAtStart, |
| .decoded_samples = pcm_samples, |
| .down_mixing_params = down_mixing_params}); |
| |
| auto& expected_label_to_samples = |
| expected_id_to_labeled_decoded_frame_[kAudioElementId].label_to_samples; |
| // `raw_samples` is arranged in (time, channel axes). Arrange the samples |
| // associated with each channel by time. The demixing process never changes |
| // data for the input labels. |
| auto labels_iter = labels.begin(); |
| for (int channel = 0; channel < labels.size(); ++channel) { |
| auto& samples_for_channel = expected_label_to_samples[*labels_iter]; |
| |
| samples_for_channel.reserve(pcm_samples.size()); |
| for (auto tick : pcm_samples) { |
| samples_for_channel.push_back( |
| Int32ToNormalizedFloatingPoint<InternalSampleType>(tick[channel])); |
| } |
| labels_iter++; |
| } |
| } |
| |
| void ConfiguredExpectedDemixingChannelFrame( |
| ChannelLabel::Label label, |
| const std::vector<int32_t>& expected_demixed_samples) { |
| std::vector<InternalSampleType> expected_demixed_samples_as_internal_type; |
| expected_demixed_samples_as_internal_type.reserve( |
| expected_demixed_samples.size()); |
| for (int32_t sample : expected_demixed_samples) { |
| expected_demixed_samples_as_internal_type.push_back( |
| Int32ToNormalizedFloatingPoint<InternalSampleType>(sample)); |
| } |
| |
| // Configure the expected demixed channels. Typically the input `label` |
| // should have a "D_" prefix. |
| expected_id_to_labeled_decoded_frame_[kAudioElementId] |
| .label_to_samples[label] = expected_demixed_samples_as_internal_type; |
| } |
| |
| void TestLosslessDemixing(int expected_number_of_down_mixers) { |
| TestCreateDemixingModule(expected_number_of_down_mixers); |
| |
| const auto id_to_labeled_decoded_frame = |
| demixing_module_->DemixDecodedAudioSamples(decoded_audio_frames_); |
| ASSERT_THAT(id_to_labeled_decoded_frame, IsOk()); |
| ASSERT_TRUE(id_to_labeled_decoded_frame->contains(kAudioElementId)); |
| |
| // Check that the demixed samples have the correct values. |
| const auto& actual_label_to_samples = |
| id_to_labeled_decoded_frame->at(kAudioElementId).label_to_samples; |
| |
| const auto& expected_label_to_samples = |
| expected_id_to_labeled_decoded_frame_[kAudioElementId].label_to_samples; |
| EXPECT_EQ(actual_label_to_samples.size(), expected_label_to_samples.size()); |
| for (const auto& [label, samples] : actual_label_to_samples) { |
| // Use `DoubleNear` with a tolerance because floating-point arithmetic |
| // introduces errors larger than allowed by `DoubleEq`. |
| constexpr double kErrorTolerance = 1e-14; |
| EXPECT_THAT(samples, Pointwise(DoubleNear(kErrorTolerance), |
| expected_label_to_samples.at(label))); |
| } |
| |
| // Also, since this is lossless, we expect demixing the original samples |
| // should give the same result. |
| const auto id_to_labeled_frame = |
| demixing_module_->DemixOriginalAudioSamples(audio_frames_); |
| ASSERT_THAT(id_to_labeled_frame, IsOk()); |
| ASSERT_TRUE(id_to_labeled_frame->contains(kAudioElementId)); |
| EXPECT_EQ(id_to_labeled_frame->at(kAudioElementId).label_to_samples, |
| actual_label_to_samples); |
| } |
| |
| protected: |
| std::list<AudioFrameWithData> audio_frames_; |
| std::list<DecodedAudioFrame> decoded_audio_frames_; |
| |
| IdLabeledFrameMap expected_id_to_labeled_decoded_frame_; |
| }; // namespace |
| |
| TEST(DemixingModule, DemixingOriginalAudioSamplesSucceedsWithEmptyInputs) { |
| const auto demixing_module = |
| DemixingModule::CreateForDownMixingAndReconstruction({}); |
| ASSERT_THAT(demixing_module, IsOk()); |
| |
| EXPECT_THAT(demixing_module->DemixOriginalAudioSamples({}), |
| IsOkAndHolds(IsEmpty())); |
| } |
| |
| TEST(DemixingModule, DemixingDecodedAudioSamplesSucceedsWithEmptyInputs) { |
| const auto demixing_module = |
| DemixingModule::CreateForDownMixingAndReconstruction({}); |
| ASSERT_THAT(demixing_module, IsOk()); |
| |
| EXPECT_THAT(demixing_module->DemixDecodedAudioSamples({}), |
| IsOkAndHolds(IsEmpty())); |
| } |
| |
| TEST_F(DemixingModuleTest, AmbisonicsHasNoDemixers) { |
| ConfigureAudioFrameMetadata({kA0, kA1, kA2, kA3}); |
| |
| ConfigureLosslessAudioFrameAndDecodedAudioFrame({kA0}, {{1}}); |
| ConfigureLosslessAudioFrameAndDecodedAudioFrame({kA1}, {{1}}); |
| ConfigureLosslessAudioFrameAndDecodedAudioFrame({kA2}, {{1}}); |
| ConfigureLosslessAudioFrameAndDecodedAudioFrame({kA3}, {{1}}); |
| |
| TestLosslessDemixing(0); |
| } |
| |
| TEST_F(DemixingModuleTest, S1ToS2Demixer) { |
| // The highest layer is stereo. |
| ConfigureAudioFrameMetadata({kL2, kR2}); |
| |
| // Mono is the lowest layer. |
| ConfigureLosslessAudioFrameAndDecodedAudioFrame({kMono}, {{750}, {1500}}); |
| // Stereo is the next layer. |
| ConfigureLosslessAudioFrameAndDecodedAudioFrame({kL2}, {{1000}, {2000}}); |
| |
| // Demixing recovers kDemixedR2 |
| // D_R2 = M - (L2 - 6 dB) + 6 dB. |
| ConfiguredExpectedDemixingChannelFrame(kDemixedR2, {500, 1000}); |
| |
| TestLosslessDemixing(1); |
| } |
| |
| TEST_F(DemixingModuleTest, |
| DemixOriginalAudioSamplesReturnsErrorIfAudioFrameIsMissingPcmSamples) { |
| ConfigureAudioFrameMetadata({kL2, kR2}); |
| ConfigureLosslessAudioFrameAndDecodedAudioFrame({kMono}, {{750}, {1500}}); |
| ConfigureLosslessAudioFrameAndDecodedAudioFrame({kL2}, {{1000}, {2000}}); |
| IdLabeledFrameMap unused_id_to_labeled_frame, id_to_labeled_decoded_frame; |
| TestCreateDemixingModule(1); |
| // Destroy the raw samples. |
| audio_frames_.back().pcm_samples = std::nullopt; |
| |
| EXPECT_THAT(demixing_module_->DemixOriginalAudioSamples(audio_frames_), |
| Not(IsOk())); |
| } |
| |
| TEST_F(DemixingModuleTest, S2ToS3Demixer) { |
| // The highest layer is 3.1.2. |
| ConfigureAudioFrameMetadata({kL3, kR3, kCentre, kLtf3, kRtf3}); |
| |
| // Stereo is the lowest layer. |
| ConfigureLosslessAudioFrameAndDecodedAudioFrame({kL2, kR2}, |
| {{70, 70}, {1700, 1700}}); |
| |
| // 3.1.2 as the next layer. |
| ConfigureLosslessAudioFrameAndDecodedAudioFrame({kCentre}, {{2000}, {1000}}); |
| ConfigureLosslessAudioFrameAndDecodedAudioFrame( |
| {kLtf3, kRtf3}, {{99999, 99998}, {99999, 99998}}); |
| |
| // L3/R3 get demixed from the lower layers. |
| // L3 = L2 - (C - 3 dB). |
| // R3 = R2 - (C - 3 dB). |
| ConfiguredExpectedDemixingChannelFrame(kDemixedL3, {-1344, 993}); |
| ConfiguredExpectedDemixingChannelFrame(kDemixedR3, {-1344, 993}); |
| |
| TestLosslessDemixing(1); |
| } |
| |
| TEST_F(DemixingModuleTest, S3ToS5AndTf2ToT2Demixers) { |
| // Adding a (valid) layer on top of 3.1.2 will always result in both S3ToS5 |
| // and Tf2ToT2 demixers. |
| // The highest layer is 5.1.2. |
| ConfigureAudioFrameMetadata({kL5, kR5, kCentre, kLtf2, kRtf2}); |
| |
| const DownMixingParams kDownMixingParams = {.delta = .866, .w = 0.25}; |
| |
| // 3.1.2 is the lowest layer. |
| ConfigureLosslessAudioFrameAndDecodedAudioFrame({kL3, kR3}, {{18660, 28660}}, |
| kDownMixingParams); |
| ConfigureLosslessAudioFrameAndDecodedAudioFrame({kCentre}, {{100}}, |
| kDownMixingParams); |
| ConfigureLosslessAudioFrameAndDecodedAudioFrame( |
| {kLtf3, kRtf3}, {{1000, 2000}}, kDownMixingParams); |
| |
| // 5.1.2 as the next layer. |
| ConfigureLosslessAudioFrameAndDecodedAudioFrame({kL5, kR5}, {{10000, 20000}}, |
| kDownMixingParams); |
| |
| // S3ToS5: Ls5/Rs5 get demixed from the lower layers. |
| // Ls5 = (1 / delta) * (L3 - L5). |
| // Rs5 = (1 / delta) * (R3 - R5). |
| ConfiguredExpectedDemixingChannelFrame(kDemixedLs5, {10000}); |
| ConfiguredExpectedDemixingChannelFrame(kDemixedRs5, {10000}); |
| |
| // Tf2ToT2: Ltf2/Rtf2 get demixed from the lower layers. |
| // Ltf2 = Ltf3 - w * (L3 - L5). |
| // Rtf2 = Rtf3 - w * (R3 - R5). |
| ConfiguredExpectedDemixingChannelFrame(kDemixedLtf2, {-1165}); |
| ConfiguredExpectedDemixingChannelFrame(kDemixedRtf2, {-165}); |
| |
| TestLosslessDemixing(2); |
| } |
| |
| TEST_F(DemixingModuleTest, S5ToS7Demixer) { |
| // The highest layer is 7.1.0. |
| ConfigureAudioFrameMetadata({kL7, kR7, kCentre, kLss7, kRss7, kLrs7, kRrs7}); |
| |
| const DownMixingParams kDownMixingParams = {.alpha = 0.866, .beta = .866}; |
| |
| // 5.1.0 is the lowest layer. |
| ConfigureLosslessAudioFrameAndDecodedAudioFrame({kL5, kR5}, {{100, 100}}, |
| kDownMixingParams); |
| ConfigureLosslessAudioFrameAndDecodedAudioFrame({kLs5, kRs5}, {{7794, 7794}}, |
| kDownMixingParams); |
| ConfigureLosslessAudioFrameAndDecodedAudioFrame({kCentre}, {{100}}, |
| kDownMixingParams); |
| |
| // 7.1.0 as the next layer. |
| ConfigureLosslessAudioFrameAndDecodedAudioFrame( |
| {kLss7, kRss7}, {{1000, 2000}}, kDownMixingParams); |
| |
| // L7/R7 get demixed from the lower layers. |
| // L7 = R5. |
| // R7 = R5. |
| ConfiguredExpectedDemixingChannelFrame(kDemixedL7, {100}); |
| ConfiguredExpectedDemixingChannelFrame(kDemixedR7, {100}); |
| |
| // Lrs7/Rrs7 get demixed from the lower layers. |
| // Lrs7 = (1 / beta) * (Ls5 - alpha * Lss7). |
| // Rrs7 = (1 / beta) * (Rs5 - alpha * Rss7). |
| ConfiguredExpectedDemixingChannelFrame(kDemixedLrs7, {8000}); |
| ConfiguredExpectedDemixingChannelFrame(kDemixedRrs7, {7000}); |
| |
| TestLosslessDemixing(1); |
| } |
| |
| TEST_F(DemixingModuleTest, T2ToT4Demixer) { |
| // The highest layer is 5.1.4. |
| ConfigureAudioFrameMetadata({kL5, kR5, kCentre, kLtf4, kRtf4}); |
| |
| const DownMixingParams kDownMixingParams = {.gamma = .866}; |
| |
| // 5.1.2 is the lowest layer. |
| ConfigureLosslessAudioFrameAndDecodedAudioFrame({kL5, kR5}, {{100, 100}}, |
| kDownMixingParams); |
| ConfigureLosslessAudioFrameAndDecodedAudioFrame({kLs5, kRs5}, {{100, 100}}, |
| kDownMixingParams); |
| ConfigureLosslessAudioFrameAndDecodedAudioFrame({kCentre}, {{100}}, |
| kDownMixingParams); |
| ConfigureLosslessAudioFrameAndDecodedAudioFrame( |
| {kLtf2, kRtf2}, {{8660, 17320}}, kDownMixingParams); |
| |
| // 5.1.4 as the next layer. |
| ConfigureLosslessAudioFrameAndDecodedAudioFrame({kLtf4, kRtf4}, {{866, 1732}}, |
| kDownMixingParams); |
| |
| // Ltb4/Rtb4 get demixed from the lower layers. |
| // Ltb4 = (1 / gamma) * (Ltf2 - Ltf4). |
| // Ttb4 = (1 / gamma) * (Ttf2 - Rtf4). |
| ConfiguredExpectedDemixingChannelFrame(kDemixedLtb4, {9000}); |
| ConfiguredExpectedDemixingChannelFrame(kDemixedRtb4, {18000}); |
| |
| TestLosslessDemixing(1); |
| } |
| |
| } // namespace |
| } // namespace iamf_tools |