src/backend/vaapi.rs - platform/system/cros-codecs - Git at Google

 // Copyright 2022 The ChromiumOS Authors
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file.

 //! VAAPI backend for both stateless decoders and encoders.

 use std::collections::HashSet;
 use std::fmt::Debug;
 use std::os::fd::AsRawFd;

 use anyhow::anyhow;
 use byteorder::ByteOrder;
 use byteorder::LittleEndian;
 use libva::Display;
 use libva::VAConfigAttrib;
 use libva::VAConfigAttribType;

 use crate::utils::DmabufFrame;
 use crate::utils::UserPtrFrame;
 use crate::DecodedFormat;

 pub mod decoder;
 pub mod encoder;
 pub mod surface_pool;

 /// Returns a human-readable name for a `VA_RT_FORMAT_*` value.
 ///
 /// Values that match none of the formats listed below produce a string of the form
 /// `unknown VA rt_format <value>`.
 fn va_rt_format_to_string(va_rt_format: u32) -> String {
     let name = match va_rt_format {
         libva::VA_RT_FORMAT_YUV420 => "YUV420",
         libva::VA_RT_FORMAT_YUV422 => "YUV422",
         libva::VA_RT_FORMAT_YUV444 => "YUV444",
         libva::VA_RT_FORMAT_YUV420_10 => "YUV420_10",
         libva::VA_RT_FORMAT_YUV420_12 => "YUV420_12",
         libva::VA_RT_FORMAT_YUV422_10 => "YUV422_10",
         libva::VA_RT_FORMAT_YUV422_12 => "YUV422_12",
         libva::VA_RT_FORMAT_YUV444_10 => "YUV444_10",
         libva::VA_RT_FORMAT_YUV444_12 => "YUV444_12",
         // Not a format we know by name: report the raw value instead.
         unknown => return format!("unknown VA rt_format {}", unknown),
     };

     name.to_owned()
 }

 /// One entry of `FORMAT_MAP`: ties a VA render-target format to a VA image fourcc and to the
 /// [`DecodedFormat`] associated with that pair.
 #[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)]
 struct FormatMap {
     /// `VA_RT_FORMAT_*` value of this entry.
     pub rt_format: u32,
     /// `VA_FOURCC_*` value of this entry.
     pub va_fourcc: u32,
     /// Decoded format associated with `rt_format` and `va_fourcc`.
     pub decoded_format: DecodedFormat,
 }

 /// Maps a given VA_RT_FORMAT to a compatible decoded format in an arbitrary
 /// preferred order.
 ///
 /// Each entry also records the VA image fourcc that goes with the pair. The same RT format may
 /// appear in several entries: `VA_RT_FORMAT_YUV420` is listed with both `NV12` and `I420`.
 const FORMAT_MAP: [FormatMap; 10] = [
     // 8-bit formats.
     FormatMap {
         rt_format: libva::VA_RT_FORMAT_YUV420,
         va_fourcc: libva::VA_FOURCC_NV12,
         decoded_format: DecodedFormat::NV12,
     },
     FormatMap {
         rt_format: libva::VA_RT_FORMAT_YUV420,
         va_fourcc: libva::VA_FOURCC_I420,
         decoded_format: DecodedFormat::I420,
     },
     FormatMap {
         rt_format: libva::VA_RT_FORMAT_YUV422,
         va_fourcc: libva::VA_FOURCC_422H,
         decoded_format: DecodedFormat::I422,
     },
     FormatMap {
         rt_format: libva::VA_RT_FORMAT_YUV444,
         va_fourcc: libva::VA_FOURCC_444P,
         decoded_format: DecodedFormat::I444,
     },
     // 4:2:0, 10 and 12 bits.
     FormatMap {
         rt_format: libva::VA_RT_FORMAT_YUV420_10,
         va_fourcc: libva::VA_FOURCC_P010,
         decoded_format: DecodedFormat::I010,
     },
     FormatMap {
         rt_format: libva::VA_RT_FORMAT_YUV420_12,
         va_fourcc: libva::VA_FOURCC_P012,
         decoded_format: DecodedFormat::I012,
     },
     // 4:2:2, 10 and 12 bits.
     FormatMap {
         rt_format: libva::VA_RT_FORMAT_YUV422_10,
         va_fourcc: libva::VA_FOURCC_Y210,
         decoded_format: DecodedFormat::I210,
     },
     FormatMap {
         rt_format: libva::VA_RT_FORMAT_YUV422_12,
         va_fourcc: libva::VA_FOURCC_Y212,
         decoded_format: DecodedFormat::I212,
     },
     // 4:4:4, 10 and 12 bits.
     FormatMap {
         rt_format: libva::VA_RT_FORMAT_YUV444_10,
         va_fourcc: libva::VA_FOURCC_Y410,
         decoded_format: DecodedFormat::I410,
     },
     FormatMap {
         rt_format: libva::VA_RT_FORMAT_YUV444_12,
         va_fourcc: libva::VA_FOURCC_Y412,
         decoded_format: DecodedFormat::I412,
     },
 ];

 /// Returns the entries of `FORMAT_MAP` that can be used with `rt_format`.
 ///
 /// An entry is part of the result when its `rt_format` equals the requested one and its
 /// `va_fourcc` is present in `image_formats`.
 ///
 /// # Errors
 ///
 /// Fails if the config attributes cannot be queried from `display`, or if `rt_format` is not
 /// reported as supported for the `profile` and `entrypoint` pair.
 fn supported_formats_for_rt_format(
     display: &Display,
     rt_format: u32,
     profile: i32,
     entrypoint: u32,
     image_formats: &[libva::VAImageFormat],
 ) -> anyhow::Result<HashSet<FormatMap>> {
     // Query the RT formats available for this VAProfile and VAEntrypoint pair.
     let mut attrs = vec![VAConfigAttrib {
         type_: VAConfigAttribType::VAConfigAttribRTFormat,
         value: 0,
     }];
     display.get_config_attributes(profile, entrypoint, &mut attrs)?;

     // The returned value is a bitmask of RT formats, unless the attribute is unsupported.
     let advertised = attrs[0].value;
     let rt_format_supported =
         advertised != libva::VA_ATTRIB_NOT_SUPPORTED && advertised & rt_format != 0;
     if !rt_format_supported {
         return Err(anyhow!(
             "rt_format {:?} not supported for profile {:?} and entrypoint {:?}",
             rt_format,
             profile,
             entrypoint
         ));
     }

     Ok(FORMAT_MAP
         .iter()
         .copied()
         .filter(|entry| entry.rt_format == rt_format)
         // Only keep the formats that the hardware can actually map into.
         .filter(|entry| {
             image_formats
                 .iter()
                 .any(|fmt| fmt.fourcc == entry.va_fourcc)
         })
         .collect())
 }

 impl TryFrom<&libva::VAImageFormat> for DecodedFormat {
     type Error = anyhow::Error;

     /// Converts the fourcc of a VA image format into the corresponding [`DecodedFormat`].
     ///
     /// Any fourcc that is not listed below results in an "Unsupported format" error.
     //
     // NOTE(review): `FORMAT_MAP` also lists `VA_FOURCC_422H` and `VA_FOURCC_444P`, which are
     // rejected here — confirm that this is intentional.
     fn try_from(value: &libva::VAImageFormat) -> Result<Self, Self::Error> {
         let decoded = match value.fourcc {
             libva::VA_FOURCC_I420 => Self::I420,
             libva::VA_FOURCC_NV12 => Self::NV12,
             libva::VA_FOURCC_P010 => Self::I010,
             libva::VA_FOURCC_P012 => Self::I012,
             libva::VA_FOURCC_Y210 => Self::I210,
             libva::VA_FOURCC_Y212 => Self::I212,
             libva::VA_FOURCC_Y410 => Self::I410,
             libva::VA_FOURCC_Y412 => Self::I412,
             _ => return Err(anyhow!("Unsupported format")),
         };

         Ok(decoded)
     }
 }

 /// Converts the biplanar P01x frame in `src` into a triplanar I01x frame in `dst`, dropping all
 /// padding.
 ///
 /// `useful_pixels` is the number of useful bits in each 16-bit sample, e.g. `10` for `P010` or
 /// `12` for `P012`.
 ///
 /// `strides` and `offsets` describe the planes of `src` in bytes; only the first two entries of
 /// each are read. `dst` receives the Y plane, then the U plane, then the V plane.
 ///
 /// This function is VAAPI-specific because of the unusual way the source samples are laid out:
 /// VAAPI writes the useful bits in the MSBs, but software generally expects them in the LSBs.
 /// Every sample is therefore shifted right by `16 - useful_pixels`.
 fn p01x_to_i01x(
     src: &[u8],
     dst: &mut [u8],
     useful_pixels: usize,
     width: usize,
     height: usize,
     strides: [usize; 3],
     offsets: [usize; 3],
 ) {
     let shift = 16 - useful_pixels;
     // Moves one little-endian 16-bit sample from MSB alignment to LSB alignment.
     let realign = |from: &[u8], to: &mut [u8]| {
         LittleEndian::write_u16(to, LittleEndian::read_u16(from) >> shift);
     };

     // Luma plane: two bytes per sample, source rows are `strides[0]` bytes apart while
     // destination rows are tightly packed.
     let src_y_rows = src[offsets[0]..]
         .chunks(strides[0])
         .map(|row| &row[..width * 2]);
     let dst_y_rows = dst.chunks_mut(width * 2);
     for (src_row, dst_row) in src_y_rows.zip(dst_y_rows).take(height) {
         for (from, to) in src_row.chunks(2).zip(dst_row.chunks_mut(2)) {
             realign(from, to);
         }
     }

     // The chroma planes start right after the packed luma plane.
     let chroma_start = width * 2 * height;

     // Chroma dimensions are derived from the width and height rounded up to a multiple of 2.
     // A chroma row has `width / 2` samples of two bytes each, i.e. `width` bytes per plane, and
     // there is one chroma row for every two luma rows.
     let width = width + width % 2;
     let chroma_rows = (height + height % 2) / 2;
     let chroma_plane_size = width * chroma_rows;

     // The source interleaves U and V in a single plane (4 bytes per U/V pair); apply the same
     // shift as for luma while splitting them into separate planes.
     let src_uv_rows = src[offsets[1]..]
         .chunks(strides[1])
         .map(|row| &row[..width * 2]);
     let (u_plane, v_plane) = dst[chroma_start..].split_at_mut(chroma_plane_size);
     let dst_uv_rows = u_plane.chunks_mut(width).zip(v_plane.chunks_mut(width));

     for (src_row, (u_row, v_row)) in src_uv_rows.zip(dst_uv_rows).take(chroma_rows) {
         let src_pairs = src_row
             .chunks(4)
             .map(|pair| (&pair[0..2], &pair[2..4]));
         let dst_pairs = u_row.chunks_mut(2).zip(v_row.chunks_mut(2));

         for ((from_u, from_v), (to_u, to_v)) in src_pairs.zip(dst_pairs) {
             realign(from_u, to_u);
             realign(from_v, to_v);
         }
     }
 }

 /// Converts the packed Y21x frame in `src` into a triplanar I21x frame in `dst`, dropping all
 /// padding.
 ///
 /// `useful_pixels` is the number of useful bits in each 16-bit sample, e.g. `10` for `Y210` or
 /// `16` for `Y216`.
 ///
 /// Only the first entry of `strides` and `offsets` is read since the source has a single plane.
 /// `dst` receives the Y plane, then the U plane, then the V plane.
 ///
 /// This function is VAAPI-specific because of the unusual way the source samples are laid out:
 /// VAAPI writes the useful bits in the MSBs, but software generally expects them in the LSBs.
 ///
 /// WARNING: this function could not be tested for lack of supporting hardware.
 fn y21x_to_i21x(
     src: &[u8],
     dst: &mut [u8],
     useful_pixels: usize,
     width: usize,
     height: usize,
     strides: [usize; 3],
     offsets: [usize; 3],
 ) {
     let shift = 16 - useful_pixels;
     // Reads one little-endian 16-bit sample and moves its useful bits to the LSBs.
     let read_sample = |bytes: &[u8]| LittleEndian::read_u16(bytes) >> shift;

     // Chroma has half the horizontal resolution of luma. Rounding the width up to a multiple
     // of 2 should not be necessary as the sampling method requires an even width to begin with.
     let chroma_width = (width + width % 2) / 2;

     // The source is packed as Y0 U Y1 V: four 16-bit words for every two pixels, i.e.
     // `4 * width` bytes of data per row.
     let src_rows = src[offsets[0]..]
         .chunks(strides[0])
         .map(|row| &row[..width * 4]);

     let y_row_size = width * 2;
     let chroma_row_size = chroma_width * 2;

     let (y_plane, chroma_planes) = dst.split_at_mut(y_row_size * height);
     let (u_plane, v_plane) = chroma_planes.split_at_mut(chroma_row_size * height);
     let dst_rows = y_plane.chunks_mut(y_row_size).zip(
         u_plane
             .chunks_mut(chroma_row_size)
             .zip(v_plane.chunks_mut(chroma_row_size)),
     );

     for (src_row, (y_row, (u_row, v_row))) in src_rows.zip(dst_rows).take(height) {
         // Each 8-byte source group yields two Y samples, one U sample and one V sample.
         let dst_groups = y_row
             .chunks_mut(4)
             .zip(u_row.chunks_mut(2).zip(v_row.chunks_mut(2)));

         for (group, (y_out, (u_out, v_out))) in src_row.chunks(8).zip(dst_groups) {
             let y0 = read_sample(&group[0..2]);
             let u = read_sample(&group[2..4]);
             let y1 = read_sample(&group[4..6]);
             let v = read_sample(&group[6..8]);

             LittleEndian::write_u16(&mut y_out[0..2], y0);
             LittleEndian::write_u16(&mut y_out[2..4], y1);
             LittleEndian::write_u16(u_out, u);
             LittleEndian::write_u16(v_out, v);
         }
     }
 }

 /// Converts the packed Y412 frame in `src` into a triplanar I412 frame in `dst`, dropping all
 /// padding as well as the alpha channel.
 ///
 /// Only the first entry of `strides` and `offsets` is read since the source has a single plane.
 /// `dst` receives the Y plane, then the U plane, then the V plane, all at full resolution.
 ///
 /// This function is VAAPI-specific because each sample needs to be rotated right by 4 bits.
 fn y412_to_i412(
     src: &[u8],
     dst: &mut [u8],
     width: usize,
     height: usize,
     strides: [usize; 3],
     offsets: [usize; 3],
 ) {
     // Each source pixel takes 8 bytes: four 16-bit words.
     let src_rows = src[offsets[0]..]
         .chunks(strides[0])
         .map(|row| &row[..width * 8]);

     // All three destination planes have the same size: two bytes per sample, no subsampling.
     let row_size = width * 2;
     let plane_size = row_size * height;

     let (y_plane, chroma_planes) = dst.split_at_mut(plane_size);
     let (u_plane, v_plane) = chroma_planes.split_at_mut(plane_size);
     let dst_rows = y_plane
         .chunks_mut(row_size)
         .zip(u_plane.chunks_mut(row_size).zip(v_plane.chunks_mut(row_size)));

     for (src_row, (y_row, (u_row, v_row))) in src_rows.zip(dst_rows).take(height) {
         let dst_samples = y_row
             .chunks_mut(2)
             .zip(u_row.chunks_mut(2).zip(v_row.chunks_mut(2)));

         for (pixel, (y_out, (u_out, v_out))) in src_row.chunks(8).zip(dst_samples) {
             // Word order in the source is U, Y, V; the fourth word (alpha) is ignored.
             let y = LittleEndian::read_u16(&pixel[2..4]);
             let u = LittleEndian::read_u16(&pixel[0..2]);
             let v = LittleEndian::read_u16(&pixel[4..6]);

             // Why is that rotate_right needed??
             // NOTE(review): presumably the 12 useful bits sit in the MSBs and this moves them
             // to the LSBs, like the shift done for the other formats — confirm.
             LittleEndian::write_u16(y_out, y.rotate_right(4));
             LittleEndian::write_u16(u_out, u.rotate_right(4));
             LittleEndian::write_u16(v_out, v.rotate_right(4));
         }
     }
 }

 impl libva::ExternalBufferDescriptor for UserPtrFrame {
     const MEMORY_TYPE: libva::MemoryType = libva::MemoryType::UserPtr;
     type DescriptorAttribute = libva::VASurfaceAttribExternalBuffers;

     /// Builds the `VASurfaceAttribExternalBuffers` describing this frame.
     ///
     /// Pitches and offsets come from the first four planes of the layout; slots without a
     /// corresponding plane are set to zero. The descriptor points into `self.buffers`.
     fn va_surface_attribute(&mut self) -> Self::DescriptorAttribute {
         let planes = &self.layout.planes;

         // Fixed-size tables of four entries, zero-filled past the last plane.
         let pitches: [u32; 4] =
             std::array::from_fn(|i| planes.get(i).map_or(0, |plane| plane.stride as u32));
         let offsets: [u32; 4] =
             std::array::from_fn(|i| planes.get(i).map_or(0, |plane| plane.offset as u32));
         let num_planes = planes.len() as u32;

         libva::VASurfaceAttribExternalBuffers {
             pixel_format: self.layout.format.0.into(),
             width: self.layout.size.width,
             height: self.layout.size.height,
             data_size: self.mem_layout.size() as u32,
             num_planes,
             pitches,
             offsets,
             buffers: self.buffers.as_mut_ptr() as *mut _,
             num_buffers: self.buffers.len() as u32,
             flags: 0,
             private_data: std::ptr::null_mut(),
         }
     }
 }

 impl libva::ExternalBufferDescriptor for DmabufFrame {
     const MEMORY_TYPE: libva::MemoryType = libva::MemoryType::DrmPrime2;
     type DescriptorAttribute = libva::VADRMPRIMESurfaceDescriptor;

     /// Builds the `VADRMPRIMESurfaceDescriptor` describing this frame.
     ///
     /// Up to four objects are filled from `self.fds`, and a single layer is filled from the
     /// first four planes of the layout. The descriptor always reports one object and one layer,
     /// with every plane referring to object 0.
     fn va_surface_attribute(&mut self) -> Self::DescriptorAttribute {
         let planes = &self.layout.planes;

         // One object per fd, up to four; the remaining slots are left at their default value.
         let objects: [_; 4] = std::array::from_fn(|i| match self.fds.get(i) {
             Some(fd) => libva::VADRMPRIMESurfaceDescriptorObject {
                 fd: fd.as_raw_fd(),
                 size: nix::sys::stat::fstat(fd.as_raw_fd())
                     .map(|stat| stat.st_size as u32)
                     // If we don't have the information about the plane fd size, fallback to 0.
                     // Libva seems to be *sometimes* "happy" with zero.
                     .unwrap_or(0),
                 // TODO should the descriptor be moved to individual objects?
                 drm_format_modifier: self.layout.format.1,
             },
             None => Default::default(),
         });

         // Per-plane tables of four entries, zero-filled past the last plane.
         let offset: [u32; 4] =
             std::array::from_fn(|i| planes.get(i).map_or(0, |plane| plane.offset as u32));
         let pitch: [u32; 4] =
             std::array::from_fn(|i| planes.get(i).map_or(0, |plane| plane.stride as u32));

         let first_layer = libva::VADRMPRIMESurfaceDescriptorLayer {
             drm_format: self.layout.format.0.into(),
             num_planes: planes.len() as u32,
             object_index: [0, 0, 0, 0],
             offset,
             pitch,
         };
         let layers = [
             first_layer,
             Default::default(),
             Default::default(),
             Default::default(),
         ];

         libva::VADRMPRIMESurfaceDescriptor {
             // TODO should we match and use VA_FOURCC_* here?
             fourcc: self.layout.format.0.into(),
             width: self.layout.size.width,
             height: self.layout.size.height,
             num_objects: 1,
             objects,
             num_layers: 1,
             layers,
         }
     }
 }
	// Copyright 2022 The ChromiumOS Authors
	// Use of this source code is governed by a BSD-style license that can be
	// found in the LICENSE file.

	//! VAAPI backend for both stateless decoders and encoders.

	use std::collections::HashSet;
	use std::fmt::Debug;
	use std::os::fd::AsRawFd;

	use anyhow::anyhow;
	use byteorder::ByteOrder;
	use byteorder::LittleEndian;
	use libva::Display;
	use libva::VAConfigAttrib;
	use libva::VAConfigAttribType;

	use crate::utils::DmabufFrame;
	use crate::utils::UserPtrFrame;
	use crate::DecodedFormat;

	pub mod decoder;
	pub mod encoder;
	pub mod surface_pool;

	/// Returns a human-readable name for a `VA_RT_FORMAT_*` value.
	///
	/// Values that match none of the formats listed below produce a string of the form
	/// `unknown VA rt_format <value>`.
	fn va_rt_format_to_string(va_rt_format: u32) -> String {
	    let name = match va_rt_format {
	        libva::VA_RT_FORMAT_YUV420 => "YUV420",
	        libva::VA_RT_FORMAT_YUV422 => "YUV422",
	        libva::VA_RT_FORMAT_YUV444 => "YUV444",
	        libva::VA_RT_FORMAT_YUV420_10 => "YUV420_10",
	        libva::VA_RT_FORMAT_YUV420_12 => "YUV420_12",
	        libva::VA_RT_FORMAT_YUV422_10 => "YUV422_10",
	        libva::VA_RT_FORMAT_YUV422_12 => "YUV422_12",
	        libva::VA_RT_FORMAT_YUV444_10 => "YUV444_10",
	        libva::VA_RT_FORMAT_YUV444_12 => "YUV444_12",
	        // Not a format we know by name: report the raw value instead.
	        unknown => return format!("unknown VA rt_format {}", unknown),
	    };

	    name.to_owned()
	}

	/// One entry of `FORMAT_MAP`: ties a VA render-target format to a VA image fourcc and to the
	/// [`DecodedFormat`] associated with that pair.
	#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)]
	struct FormatMap {
	/// `VA_RT_FORMAT_*` value of this entry.
	pub rt_format: u32,
	/// `VA_FOURCC_*` value of this entry.
	pub va_fourcc: u32,
	/// Decoded format associated with `rt_format` and `va_fourcc`.
	pub decoded_format: DecodedFormat,
	}

	/// Maps a given VA_RT_FORMAT to a compatible decoded format in an arbitrary
	/// preferred order.
	///
	/// Each entry also records the VA image fourcc that goes with the pair. The same RT format may
	/// appear in several entries: `VA_RT_FORMAT_YUV420` is listed with both `NV12` and `I420`.
	const FORMAT_MAP: [FormatMap; 10] = [
	// 8-bit formats.
	FormatMap {
	rt_format: libva::VA_RT_FORMAT_YUV420,
	va_fourcc: libva::VA_FOURCC_NV12,
	decoded_format: DecodedFormat::NV12,
	},
	FormatMap {
	rt_format: libva::VA_RT_FORMAT_YUV420,
	va_fourcc: libva::VA_FOURCC_I420,
	decoded_format: DecodedFormat::I420,
	},
	FormatMap {
	rt_format: libva::VA_RT_FORMAT_YUV422,
	va_fourcc: libva::VA_FOURCC_422H,
	decoded_format: DecodedFormat::I422,
	},
	FormatMap {
	rt_format: libva::VA_RT_FORMAT_YUV444,
	va_fourcc: libva::VA_FOURCC_444P,
	decoded_format: DecodedFormat::I444,
	},
	// 4:2:0, 10 and 12 bits.
	FormatMap {
	rt_format: libva::VA_RT_FORMAT_YUV420_10,
	va_fourcc: libva::VA_FOURCC_P010,
	decoded_format: DecodedFormat::I010,
	},
	FormatMap {
	rt_format: libva::VA_RT_FORMAT_YUV420_12,
	va_fourcc: libva::VA_FOURCC_P012,
	decoded_format: DecodedFormat::I012,
	},
	// 4:2:2, 10 and 12 bits.
	FormatMap {
	rt_format: libva::VA_RT_FORMAT_YUV422_10,
	va_fourcc: libva::VA_FOURCC_Y210,
	decoded_format: DecodedFormat::I210,
	},
	FormatMap {
	rt_format: libva::VA_RT_FORMAT_YUV422_12,
	va_fourcc: libva::VA_FOURCC_Y212,
	decoded_format: DecodedFormat::I212,
	},
	// 4:4:4, 10 and 12 bits.
	FormatMap {
	rt_format: libva::VA_RT_FORMAT_YUV444_10,
	va_fourcc: libva::VA_FOURCC_Y410,
	decoded_format: DecodedFormat::I410,
	},
	FormatMap {
	rt_format: libva::VA_RT_FORMAT_YUV444_12,
	va_fourcc: libva::VA_FOURCC_Y412,
	decoded_format: DecodedFormat::I412,
	},
	];

	/// Returns a set of supported decoded formats given `rt_format`
	fn supported_formats_for_rt_format(
	display: &Display,
	rt_format: u32,
	profile: i32,
	entrypoint: u32,
	image_formats: &[libva::VAImageFormat],
	) -> anyhow::Result<HashSet<FormatMap>> {
	let mut attrs = vec![VAConfigAttrib {
	type_: VAConfigAttribType::VAConfigAttribRTFormat,
	value: 0,
	}];

	display.get_config_attributes(profile, entrypoint, &mut attrs)?;

	// See whether this RT_FORMAT is supported by the given VAProfile and
	// VAEntrypoint pair.
	if attrs[0].value == libva::VA_ATTRIB_NOT_SUPPORTED \|\| attrs[0].value & rt_format == 0 {
	return Err(anyhow!(
	"rt_format {:?} not supported for profile {:?} and entrypoint {:?}",
	rt_format,
	profile,
	entrypoint
	));
	}

	let mut supported_formats = HashSet::new();

	for format in FORMAT_MAP {
	if format.rt_format == rt_format {
	supported_formats.insert(format);
	}
	}

	// Only retain those that the hardware can actually map into.
	supported_formats.retain(\|&entry\| {
	image_formats
	.iter()
	.any(\|fmt\| fmt.fourcc == entry.va_fourcc)
	});

	Ok(supported_formats)
	}

	impl TryFrom<&libva::VAImageFormat> for DecodedFormat {
	    type Error = anyhow::Error;

	    /// Converts the fourcc of a VA image format into the corresponding [`DecodedFormat`].
	    ///
	    /// Any fourcc that is not listed below results in an "Unsupported format" error.
	    //
	    // NOTE(review): `FORMAT_MAP` also lists `VA_FOURCC_422H` and `VA_FOURCC_444P`, which are
	    // rejected here — confirm that this is intentional.
	    fn try_from(value: &libva::VAImageFormat) -> Result<Self, Self::Error> {
	        let decoded = match value.fourcc {
	            libva::VA_FOURCC_I420 => Self::I420,
	            libva::VA_FOURCC_NV12 => Self::NV12,
	            libva::VA_FOURCC_P010 => Self::I010,
	            libva::VA_FOURCC_P012 => Self::I012,
	            libva::VA_FOURCC_Y210 => Self::I210,
	            libva::VA_FOURCC_Y212 => Self::I212,
	            libva::VA_FOURCC_Y410 => Self::I410,
	            libva::VA_FOURCC_Y412 => Self::I412,
	            _ => return Err(anyhow!("Unsupported format")),
	        };

	        Ok(decoded)
	    }
	}

	/// Copies `src` into `dst` removing all padding and converting from biplanar to triplanar format.
	///
	/// `useful_pixels` is the number of useful bits in each 16-bit sample, e.g. `10` for `P010`,
	/// `12` for `P012`, etc.
	///
	/// `strides` and `offsets` describe the planes of `src` in bytes; only the first two entries of
	/// each are read. `dst` receives the Y plane, then the U plane, then the V plane.
	///
	/// This function is VAAPI-specific because of the unusual way the source samples are laid out:
	/// VAAPI writes the `useful_pixels` MSBs, but software generally expects the LSBs to contain
	/// the data.
	fn p01x_to_i01x(
	    src: &[u8],
	    dst: &mut [u8],
	    useful_pixels: usize,
	    width: usize,
	    height: usize,
	    strides: [usize; 3],
	    offsets: [usize; 3],
	) {
	    let sample_shift = 16 - useful_pixels;

	    // Copy Y.
	    //
	    // VAAPI's Y samples are two byte little endian with the bottom bits ignored. We need to
	    // convert that to two byte little endian with the top bits ignored.

	    let src_y_lines = src[offsets[0]..]
	        .chunks(strides[0])
	        .map(|line| &line[..width * 2]);
	    let dst_y_lines = dst.chunks_mut(width * 2);

	    for (src_line, dst_line) in src_y_lines.zip(dst_y_lines).take(height) {
	        for (src_y, dst_y) in src_line.chunks(2).zip(dst_line.chunks_mut(2)) {
	            LittleEndian::write_u16(dst_y, LittleEndian::read_u16(src_y) >> sample_shift);
	        }
	    }

	    // The U plane starts right after the packed Y plane.
	    let dst_u_offset = width * 2 * height;

	    // Align width and height to 2 for UV plane.
	    let width = if width % 2 == 1 { width + 1 } else { width };
	    let height = if height % 2 == 1 { height + 1 } else { height };
	    // 1 sample per 4 pixels, but we have two components per line so width remains as-is.
	    let height = height / 2;

	    let dst_u_size = width * height;

	    // Copy U and V and deinterleave into different planes.
	    //
	    // We need to perform the same bit shift as luma, but also to de-interleave the data.
	    let src_uv_lines = src[offsets[1]..]
	        .chunks(strides[1])
	        .map(|line| &line[..width * 2]);
	    let (dst_u_plane, dst_v_plane) = dst[dst_u_offset..].split_at_mut(dst_u_size);
	    let dst_u_lines = dst_u_plane.chunks_mut(width);
	    let dst_v_lines = dst_v_plane.chunks_mut(width);
	    for (src_line, (dst_u_line, dst_v_line)) in
	        src_uv_lines.zip(dst_u_lines.zip(dst_v_lines)).take(height)
	    {
	        // Each 4-byte source chunk holds one U sample followed by one V sample.
	        for ((src_u, src_v), (dst_u, dst_v)) in src_line
	            .chunks(4)
	            .map(|chunk| (&chunk[0..2], &chunk[2..4]))
	            .zip(dst_u_line.chunks_mut(2).zip(dst_v_line.chunks_mut(2)))
	        {
	            LittleEndian::write_u16(dst_u, LittleEndian::read_u16(src_u) >> sample_shift);
	            LittleEndian::write_u16(dst_v, LittleEndian::read_u16(src_v) >> sample_shift);
	        }
	    }
	}

	/// Copies `src` into `dst` as I21x, removing all padding and changing the layout from packed to
	/// triplanar.
	///
	/// `useful_pixels` is the number of useful bits in each 16-bit sample, e.g. `10` for `Y210` or
	/// `16` for `Y216`.
	///
	/// Only the first entry of `strides` and `offsets` is read since the source has a single plane.
	/// `dst` receives the Y plane, then the U plane, then the V plane.
	///
	/// This function is VAAPI-specific because of the unusual way the source samples are laid out:
	/// VAAPI writes the `useful_pixels` MSBs, but software generally expects the LSBs to contain
	/// the data.
	///
	/// WARNING: this function could not be tested for lack of supporting hardware.
	fn y21x_to_i21x(
	    src: &[u8],
	    dst: &mut [u8],
	    useful_pixels: usize,
	    width: usize,
	    height: usize,
	    strides: [usize; 3],
	    offsets: [usize; 3],
	) {
	    let sample_shift = 16 - useful_pixels;
	    // Align width to 2 for U and V planes and divide by 2.
	    // This should not be necessary as the sampling method requires that width is a multiple of 2
	    // to begin with.
	    let uv_width = if width % 2 == 1 { width + 1 } else { width } / 2;

	    // YUYV representation, i.e. 4 16-bit words per two Y samples meaning we have 4 * width bytes
	    // of data per line.
	    let src_lines = src[offsets[0]..]
	        .chunks(strides[0])
	        .map(|line| &line[..width * 4]);

	    let dst_y_size = width * 2 * height;
	    let dst_u_size = uv_width * 2 * height;

	    let (dst_y_plane, dst_uv_planes) = dst.split_at_mut(dst_y_size);
	    let (dst_u_plane, dst_v_plane) = dst_uv_planes.split_at_mut(dst_u_size);
	    let dst_y_lines = dst_y_plane.chunks_mut(width * 2);
	    let dst_u_lines = dst_u_plane.chunks_mut(uv_width * 2);
	    let dst_v_lines = dst_v_plane.chunks_mut(uv_width * 2);

	    for (src_line, (dst_y_line, (dst_u_line, dst_v_line))) in src_lines
	        .zip(dst_y_lines.zip(dst_u_lines.zip(dst_v_lines)))
	        .take(height)
	    {
	        // Each 8-byte source chunk yields two Y samples, one U sample and one V sample.
	        for (src, (dst_y, (dst_u, dst_v))) in src_line.chunks(8).zip(
	            dst_y_line
	                .chunks_mut(4)
	                .zip(dst_u_line.chunks_mut(2).zip(dst_v_line.chunks_mut(2))),
	        ) {
	            let y0 = LittleEndian::read_u16(&src[0..2]) >> sample_shift;
	            let u = LittleEndian::read_u16(&src[2..4]) >> sample_shift;
	            let y1 = LittleEndian::read_u16(&src[4..6]) >> sample_shift;
	            let v = LittleEndian::read_u16(&src[6..8]) >> sample_shift;

	            LittleEndian::write_u16(&mut dst_y[0..2], y0);
	            LittleEndian::write_u16(&mut dst_y[2..4], y1);
	            LittleEndian::write_u16(dst_u, u);
	            LittleEndian::write_u16(dst_v, v);
	        }
	    }
	}

	/// Copies `src` into `dst` as I412, removing all padding and changing the layout from packed to
	/// triplanar. Also drops the alpha channel.
	///
	/// Only the first entry of `strides` and `offsets` is read since the source has a single plane.
	/// `dst` receives the Y plane, then the U plane, then the V plane, all at full resolution.
	///
	/// This function is VAAPI-specific because the samples need to be rolled somehow...
	fn y412_to_i412(
	    src: &[u8],
	    dst: &mut [u8],
	    width: usize,
	    height: usize,
	    strides: [usize; 3],
	    offsets: [usize; 3],
	) {
	    // Each source pixel takes 8 bytes: four 16-bit words.
	    let src_lines = src[offsets[0]..]
	        .chunks(strides[0])
	        .map(|line| &line[..width * 8]);

	    let dst_y_size = width * 2 * height;
	    let dst_u_size = width * 2 * height;

	    let (dst_y_plane, dst_uv_planes) = dst.split_at_mut(dst_y_size);
	    let (dst_u_plane, dst_v_plane) = dst_uv_planes.split_at_mut(dst_u_size);
	    let dst_y_lines = dst_y_plane.chunks_mut(width * 2);
	    let dst_u_lines = dst_u_plane.chunks_mut(width * 2);
	    let dst_v_lines = dst_v_plane.chunks_mut(width * 2);

	    for (src_line, (dst_y_line, (dst_u_line, dst_v_line))) in src_lines
	        .zip(dst_y_lines.zip(dst_u_lines.zip(dst_v_lines)))
	        .take(height)
	    {
	        for (src, (dst_y, (dst_u, dst_v))) in src_line.chunks(8).zip(
	            dst_y_line
	                .chunks_mut(2)
	                .zip(dst_u_line.chunks_mut(2).zip(dst_v_line.chunks_mut(2))),
	        ) {
	            // Word order in the source is U, Y, V; the fourth word (alpha) is ignored.
	            let y = LittleEndian::read_u16(&src[2..4]);
	            let u = LittleEndian::read_u16(&src[0..2]);
	            let v = LittleEndian::read_u16(&src[4..6]);
	            // Why is that rotate_right needed??
	            LittleEndian::write_u16(dst_y, y.rotate_right(4));
	            LittleEndian::write_u16(dst_u, u.rotate_right(4));
	            LittleEndian::write_u16(dst_v, v.rotate_right(4));
	        }
	    }
	}

	impl libva::ExternalBufferDescriptor for UserPtrFrame {
	    const MEMORY_TYPE: libva::MemoryType = libva::MemoryType::UserPtr;
	    type DescriptorAttribute = libva::VASurfaceAttribExternalBuffers;

	    /// Builds the `VASurfaceAttribExternalBuffers` describing this frame.
	    ///
	    /// Pitches and offsets come from the first four planes of the layout; slots without a
	    /// corresponding plane are set to zero. The descriptor points into `self.buffers`.
	    fn va_surface_attribute(&mut self) -> Self::DescriptorAttribute {
	        // Pad with zeroes so that exactly four entries are produced.
	        let pitches = self
	            .layout
	            .planes
	            .iter()
	            .map(|p| p.stride as u32)
	            .chain(std::iter::repeat(0))
	            .take(4)
	            .collect::<Vec<_>>()
	            .try_into()
	            .unwrap();
	        let offsets = self
	            .layout
	            .planes
	            .iter()
	            .map(|p| p.offset as u32)
	            .chain(std::iter::repeat(0))
	            .take(4)
	            .collect::<Vec<_>>()
	            .try_into()
	            .unwrap();

	        libva::VASurfaceAttribExternalBuffers {
	            pixel_format: self.layout.format.0.into(),
	            width: self.layout.size.width,
	            height: self.layout.size.height,
	            data_size: self.mem_layout.size() as u32,
	            num_planes: self.layout.planes.len() as u32,
	            pitches,
	            offsets,
	            buffers: self.buffers.as_mut_ptr() as *mut _,
	            num_buffers: self.buffers.len() as u32,
	            flags: 0,
	            private_data: std::ptr::null_mut(),
	        }
	    }
	}

	impl libva::ExternalBufferDescriptor for DmabufFrame {
	    const MEMORY_TYPE: libva::MemoryType = libva::MemoryType::DrmPrime2;
	    type DescriptorAttribute = libva::VADRMPRIMESurfaceDescriptor;

	    /// Builds the `VADRMPRIMESurfaceDescriptor` describing this frame.
	    ///
	    /// Up to four objects are filled from `self.fds`, and a single layer is filled from the
	    /// first four planes of the layout. The descriptor always reports one object and one layer,
	    /// with every plane referring to object 0.
	    fn va_surface_attribute(&mut self) -> Self::DescriptorAttribute {
	        // One object per fd, padded with default values so that exactly four are produced.
	        let objects = self
	            .fds
	            .iter()
	            .map(|fd| libva::VADRMPRIMESurfaceDescriptorObject {
	                fd: fd.as_raw_fd(),
	                size: nix::sys::stat::fstat(fd.as_raw_fd())
	                    .map(|stat| stat.st_size as u32)
	                    // If we don't have the information about the plane fd size, fallback to 0.
	                    // Libva seems to be *sometimes* "happy" with zero.
	                    .unwrap_or(0),
	                // TODO should the descriptor be moved to individual objects?
	                drm_format_modifier: self.layout.format.1,
	            })
	            .chain(std::iter::repeat(Default::default()))
	            .take(4)
	            .collect::<Vec<_>>()
	            .try_into()
	            .unwrap();

	        // Only the first layer is populated; the other three keep their default value.
	        let layers = [
	            libva::VADRMPRIMESurfaceDescriptorLayer {
	                drm_format: self.layout.format.0.into(),
	                num_planes: self.layout.planes.len() as u32,
	                object_index: [0, 0, 0, 0],
	                offset: self
	                    .layout
	                    .planes
	                    .iter()
	                    .map(|p| p.offset as u32)
	                    .chain(std::iter::repeat(0))
	                    .take(4)
	                    .collect::<Vec<_>>()
	                    .try_into()
	                    .unwrap(),
	                pitch: self
	                    .layout
	                    .planes
	                    .iter()
	                    .map(|p| p.stride as u32)
	                    .chain(std::iter::repeat(0))
	                    .take(4)
	                    .collect::<Vec<_>>()
	                    .try_into()
	                    .unwrap(),
	            },
	            Default::default(),
	            Default::default(),
	            Default::default(),
	        ];

	        libva::VADRMPRIMESurfaceDescriptor {
	            // TODO should we match and use VA_FOURCC_* here?
	            fourcc: self.layout.format.0.into(),
	            width: self.layout.size.width,
	            height: self.layout.size.height,
	            num_objects: 1,
	            objects,
	            num_layers: 1,
	            layers,
	        }
	    }
	}