Update libjpeg-turbo to r856.

BUG=132952
TEST=none
Review URL: https://chromiumcodereview.appspot.com/10700197

git-svn-id: http://src.chromium.org/svn/trunk/deps/third_party/libjpeg_turbo@147403 4ff67af0-8c30-449e-8e8b-ad334ec8d88c

commit: 11e6ee95ca9a40fe6b86a1cd23a9fbfd7d19c2bd [log] [tgz]
author: [email protected] <[email protected]@4ff67af0-8c30-449e-8e8b-ad334ec8d88c> Thu Jul 19 06:04:44 2012 +0000
committer: [email protected] <[email protected]@4ff67af0-8c30-449e-8e8b-ad334ec8d88c> Thu Jul 19 06:04:44 2012 +0000
tree: 2eec0543260b716b3862c8c05100bcd7ab6f833c
parent: cd3e30f64064274b17a99bb93e20a7dad2703bf0 [diff]
diff --git a/ChangeLog.txt b/ChangeLog.txt
index 98481ec..877993f 100644
--- a/ChangeLog.txt
+++ b/ChangeLog.txt

@@ -5,31 +5,65 @@
 11/8, 3/2, 13/8, 7/4, 15/8, and 2) when decompressing.  Note that the IDCT will
 not be SIMD-accelerated when using any of these new scaling factors.
 
-[2] Added SIMD acceleration for performing 4:2:2 upsampling on NEON-capable ARM
-platforms.  This speeds up the decompression of 4:2:2 JPEGs by 20-25% on such
-platforms.
-
-[3] Creating or decoding a JPEG file that uses the RGB colorspace should now
-properly work when the input or output colorspace is one of the libjpeg-turbo
-colorspace extensions.
-
-[4] When libjpeg-turbo was built without SIMD support and merged (non-fancy)
-upsampling was used along with an alpha-enabled colorspace during
-decompression, the unused byte of the decompressed pixels was not being set to
-0xFF.  This has been fixed.  TJUnitTest has also been extended to test for the
-correct behavior of the colorspace extensions when merged upsampling is used.
-
-[5] The TurboJPEG dynamic library is now versioned.  It was not strictly
+[2] The TurboJPEG dynamic library is now versioned.  It was not strictly
 necessary to do so, because TurboJPEG uses versioned symbols, and if a function
 changes in an ABI-incompatible way, that function is renamed and a legacy
 function is provided to maintain backward compatibility.  However, certain
 Linux distro maintainers will blindly reject any library that is not versioned,
 so this was an attempt to make them happy.
 
-[6] Fixed a bug whereby the libjpeg-turbo SSE2 SIMD code would not preserve the
+[3] Extended the TurboJPEG Java API so that it can be used to decompress a
+JPEG image into an arbitrary position in a large output buffer.
+
+[4] The tjDecompressToYUV() function now supports the TJFLAG_FASTDCT flag.
+
+
+1.2.1
+=====
+
+[1] Creating or decoding a JPEG file that uses the RGB colorspace should now
+properly work when the input or output colorspace is one of the libjpeg-turbo
+colorspace extensions.
+
+[2] When libjpeg-turbo was built without SIMD support and merged (non-fancy)
+upsampling was used along with an alpha-enabled colorspace during
+decompression, the unused byte of the decompressed pixels was not being set to
+0xFF.  This has been fixed.  TJUnitTest has also been extended to test for the
+correct behavior of the colorspace extensions when merged upsampling is used.
+
+[3] Fixed a bug whereby the libjpeg-turbo SSE2 SIMD code would not preserve the
 upper 64 bits of xmm6 and xmm7 on Win64 platforms, which violated the Win64
 calling conventions.
 
+[4] Fixed a regression caused by 1.2.0[6] whereby decompressing corrupt JPEG
+images (specifically, images in which the component count was erroneously set
+to a large value) would cause libjpeg-turbo to segfault.
+
+[5] Worked around a severe performance issue with "Bobcat" (AMD Embedded APU)
+processors.  The MASKMOVDQU instruction, which was used by the libjpeg-turbo
+SSE2 SIMD code, is apparently implemented in microcode on AMD processors, and
+it is painfully slow on Bobcat processors in particular.  Eliminating the use
+of this instruction improved performance by an order of magnitude on Bobcat
+processors and by a small amount (typically 5%) on AMD desktop processors.
+
+[6] Added SIMD acceleration for performing 4:2:2 upsampling on NEON-capable ARM
+platforms.  This speeds up the decompression of 4:2:2 JPEGs by 20-25% on such
+platforms.
+
+[7] Fixed a regression caused by 1.2.0[2] whereby, on Linux/x86 platforms
+running the 32-bit SSE2 SIMD code in libjpeg-turbo, decompressing a 4:2:0 or
+4:2:2 JPEG image into a 32-bit (RGBX, BGRX, etc.) buffer without using fancy
+upsampling would produce several incorrect columns of pixels at the right-hand
+side of the output image if each row in the output image was not evenly
+divisible by 16 bytes.
+
+[8] Fixed an issue whereby attempting to build the SIMD extensions with Xcode
+4.3 on OS X platforms would cause NASM to return numerous errors of the form
+"'%define' expects a macro identifier".
+
+[9] Added flags to the TurboJPEG API that allow the caller to force the use of
+either the fast or the accurate DCT/IDCT algorithms in the underlying codec.
+
 
 1.2.0
 =====

diff --git a/README.chromium b/README.chromium
index bbc3eba..fd219c7 100644
--- a/README.chromium
+++ b/README.chromium

@@ -6,7 +6,7 @@
 
 Description:
 This consists of three components:
-* A partial copy of libjpeg-turbo 1.2.80 (r830);
+* A partial copy of libjpeg-turbo 1.2.80 (r856);
 * A build file (libjpeg.gyp), and;
 * Patched header files used by Chromium.
 

diff --git a/simd/jdclrss2-64.asm b/simd/jdclrss2-64.asm
index 522fc5a..838f7d8 100644
--- a/simd/jdclrss2-64.asm
+++ b/simd/jdclrss2-64.asm

@@ -1,7 +1,7 @@
 ;
 ; jdclrss2-64.asm - colorspace conversion (64-bit SSE2)
 ;
-; Copyright 2009 Pierre Ossman <[email protected]> for Cendio AB
+; Copyright 2009, 2012 Pierre Ossman <[email protected]> for Cendio AB
 ; Copyright 2009 D. R. Commander
 ;
 ; Based on
@@ -251,17 +251,13 @@
 	movntdq	XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
 	movntdq	XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
 	movntdq	XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmF
-	add	rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD	; outptr
 	jmp	short .out0
 .out1:	; --(unaligned)-----------------
-	pcmpeqb    xmmH,xmmH			; xmmH=(all 1's)
-	maskmovdqu xmmA,xmmH			; movntdqu XMMWORD [rdi], xmmA
-	add	rdi, byte SIZEOF_XMMWORD	; outptr
-	maskmovdqu xmmD,xmmH			; movntdqu XMMWORD [rdi], xmmD
-	add	rdi, byte SIZEOF_XMMWORD	; outptr
-	maskmovdqu xmmF,xmmH			; movntdqu XMMWORD [rdi], xmmF
-	add	rdi, byte SIZEOF_XMMWORD	; outptr
+	movdqu	XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+	movdqu	XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
+	movdqu	XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmF
 .out0:
+	add	rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD	; outptr
 	sub	rcx, byte SIZEOF_XMMWORD
 	jz	near .nextrow
 
@@ -271,26 +267,23 @@
 	jmp	near .columnloop
 
 .column_st32:
-	pcmpeqb	xmmH,xmmH			; xmmH=(all 1's)
 	lea	rcx, [rcx+rcx*2]		; imul ecx, RGB_PIXELSIZE
 	cmp	rcx, byte 2*SIZEOF_XMMWORD
 	jb	short .column_st16
-	maskmovdqu xmmA,xmmH			; movntdqu XMMWORD [rdi], xmmA
-	add	rdi, byte SIZEOF_XMMWORD	; outptr
-	maskmovdqu xmmD,xmmH			; movntdqu XMMWORD [rdi], xmmD
-	add	rdi, byte SIZEOF_XMMWORD	; outptr
+	movdqu	XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+	movdqu	XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
+	add	rdi, byte 2*SIZEOF_XMMWORD	; outptr
 	movdqa	xmmA,xmmF
 	sub	rcx, byte 2*SIZEOF_XMMWORD
 	jmp	short .column_st15
 .column_st16:
 	cmp	rcx, byte SIZEOF_XMMWORD
 	jb	short .column_st15
-	maskmovdqu xmmA,xmmH			; movntdqu XMMWORD [rdi], xmmA
+	movdqu	XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
 	add	rdi, byte SIZEOF_XMMWORD	; outptr
 	movdqa	xmmA,xmmD
 	sub	rcx, byte SIZEOF_XMMWORD
 .column_st15:
-%ifdef STRICT_MEMORY_ACCESS
 	; Store the lower 8 bytes of xmmA to the output when it has enough
 	; space.
 	cmp	rcx, byte SIZEOF_MMWORD
@@ -324,47 +317,6 @@
 	test	rcx, rcx
 	jz	short .nextrow
 	mov	BYTE [rdi], al
-%else
-	mov	rax,rcx
-	xor	rcx, byte 0x0F
-	shl	rcx, 2
-	movd	xmmB,ecx
-	psrlq	xmmH,4
-	pcmpeqb	xmmE,xmmE
-	psrlq	xmmH,xmmB
-	psrlq	xmmE,xmmB
-	punpcklbw xmmE,xmmH
-	; ----------------
-	mov	rcx,rdi
-	and	rcx, byte SIZEOF_XMMWORD-1
-	jz	short .adj0
-	add	rax,rcx
-	cmp	rax, byte SIZEOF_XMMWORD
-	ja	short .adj0
-	and	rdi, byte (-SIZEOF_XMMWORD)	; align to 16-byte boundary
-	shl	rcx, 3			; pslldq xmmA,ecx & pslldq xmmE,rcx
-	movdqa	xmmG,xmmA
-	movdqa	xmmC,xmmE
-	pslldq	xmmA, SIZEOF_XMMWORD/2
-	pslldq	xmmE, SIZEOF_XMMWORD/2
-	movd	xmmD,ecx
-	sub	rcx, byte (SIZEOF_XMMWORD/2)*BYTE_BIT
-	jb	short .adj1
-	movd	xmmF,ecx
-	psllq	xmmA,xmmF
-	psllq	xmmE,xmmF
-	jmp	short .adj0
-.adj1:	neg	ecx
-	movd	xmmF,ecx
-	psrlq	xmmA,xmmF
-	psrlq	xmmE,xmmF
-	psllq	xmmG,xmmD
-	psllq	xmmC,xmmD
-	por	xmmA,xmmG
-	por	xmmE,xmmC
-.adj0:	; ----------------
-	maskmovdqu xmmA,xmmE			; movntdqu XMMWORD [rdi], xmmA
-%endif ; STRICT_MEMORY_ACCESS ; ---------------
 
 %else ; RGB_PIXELSIZE == 4 ; -----------
 
@@ -409,19 +361,14 @@
 	movntdq	XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
 	movntdq	XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmC
 	movntdq	XMMWORD [rdi+3*SIZEOF_XMMWORD], xmmH
-	add	rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD	; outptr
 	jmp	short .out0
 .out1:	; --(unaligned)-----------------
-	pcmpeqb    xmmE,xmmE			; xmmE=(all 1's)
-	maskmovdqu xmmA,xmmE			; movntdqu XMMWORD [rdi], xmmA
-	add	rdi, byte SIZEOF_XMMWORD	; outptr
-	maskmovdqu xmmD,xmmE			; movntdqu XMMWORD [rdi], xmmD
-	add	rdi, byte SIZEOF_XMMWORD	; outptr
-	maskmovdqu xmmC,xmmE			; movntdqu XMMWORD [rdi], xmmC
-	add	rdi, byte SIZEOF_XMMWORD	; outptr
-	maskmovdqu xmmH,xmmE			; movntdqu XMMWORD [rdi], xmmH
-	add	rdi, byte SIZEOF_XMMWORD	; outptr
+	movdqu	XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+	movdqu	XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
+	movdqu	XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmC
+	movdqu	XMMWORD [rdi+3*SIZEOF_XMMWORD], xmmH
 .out0:
+	add	rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD	; outptr
 	sub	rcx, byte SIZEOF_XMMWORD
 	jz	near .nextrow
 
@@ -431,25 +378,22 @@
 	jmp	near .columnloop
 
 .column_st32:
-	pcmpeqb	xmmE,xmmE			; xmmE=(all 1's)
 	cmp	rcx, byte SIZEOF_XMMWORD/2
 	jb	short .column_st16
-	maskmovdqu xmmA,xmmE			; movntdqu XMMWORD [rdi], xmmA
-	add	rdi, byte SIZEOF_XMMWORD	; outptr
-	maskmovdqu xmmD,xmmE			; movntdqu XMMWORD [rdi], xmmD
-	add	rdi, byte SIZEOF_XMMWORD	; outptr
+	movdqu	XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+	movdqu	XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
+	add	rdi, byte 2*SIZEOF_XMMWORD	; outptr
 	movdqa	xmmA,xmmC
 	movdqa	xmmD,xmmH
 	sub	rcx, byte SIZEOF_XMMWORD/2
 .column_st16:
 	cmp	rcx, byte SIZEOF_XMMWORD/4
 	jb	short .column_st15
-	maskmovdqu xmmA,xmmE			; movntdqu XMMWORD [rdi], xmmA
+	movdqu	XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
 	add	rdi, byte SIZEOF_XMMWORD	; outptr
 	movdqa	xmmA,xmmD
 	sub	rcx, byte SIZEOF_XMMWORD/4
 .column_st15:
-%ifdef STRICT_MEMORY_ACCESS
 	; Store two pixels (8 bytes) of xmmA to the output when it has enough
 	; space.
 	cmp	rcx, byte SIZEOF_XMMWORD/8
@@ -464,47 +408,6 @@
 	test	rcx, rcx
 	jz	short .nextrow
 	movd	DWORD [rdi], xmmA
-%else
-	cmp	rcx, byte SIZEOF_XMMWORD/16
-	jb	near .nextrow
-	mov	rax,rcx
-	xor	rcx, byte 0x03
-	inc	rcx
-	shl	rcx, 4
-	movd	xmmF,ecx
-	psrlq	xmmE,xmmF
-	punpcklbw xmmE,xmmE
-	; ----------------
-	mov	rcx,rdi
-	and	rcx, byte SIZEOF_XMMWORD-1
-	jz	short .adj0
-	lea	rax, [rcx+rax*4]	; RGB_PIXELSIZE
-	cmp	rax, byte SIZEOF_XMMWORD
-	ja	short .adj0
-	and	rdi, byte (-SIZEOF_XMMWORD)	; align to 16-byte boundary
-	shl	rcx, 3			; pslldq xmmA,ecx & pslldq xmmE,ecx
-	movdqa	xmmB,xmmA
-	movdqa	xmmG,xmmE
-	pslldq	xmmA, SIZEOF_XMMWORD/2
-	pslldq	xmmE, SIZEOF_XMMWORD/2
-	movd	xmmC,ecx
-	sub	rcx, byte (SIZEOF_XMMWORD/2)*BYTE_BIT
-	jb	short .adj1
-	movd	xmmH,ecx
-	psllq	xmmA,xmmH
-	psllq	xmmE,xmmH
-	jmp	short .adj0
-.adj1:	neg	rcx
-	movd	xmmH,ecx
-	psrlq	xmmA,xmmH
-	psrlq	xmmE,xmmH
-	psllq	xmmB,xmmC
-	psllq	xmmG,xmmC
-	por	xmmA,xmmB
-	por	xmmE,xmmG
-.adj0:	; ----------------
-	maskmovdqu xmmA,xmmE			; movntdqu XMMWORD [rdi], xmmA
-%endif ; STRICT_MEMORY_ACCESS ; ---------------
 
 %endif ; RGB_PIXELSIZE ; ---------------
 

diff --git a/simd/jdclrss2.asm b/simd/jdclrss2.asm
index 4e6c22e..7d5d9f0 100644
--- a/simd/jdclrss2.asm
+++ b/simd/jdclrss2.asm

@@ -1,7 +1,7 @@
 ;
 ; jdclrss2.asm - colorspace conversion (SSE2)
 ;
-; Copyright 2009 Pierre Ossman <[email protected]> for Cendio AB
+; Copyright 2009, 2012 Pierre Ossman <[email protected]> for Cendio AB
 ;
 ; Based on
 ; x86 SIMD extension for IJG JPEG library
@@ -262,17 +262,13 @@
 	movntdq	XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
 	movntdq	XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
 	movntdq	XMMWORD [edi+2*SIZEOF_XMMWORD], xmmF
-	add	edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD	; outptr
 	jmp	short .out0
 .out1:	; --(unaligned)-----------------
-	pcmpeqb    xmmH,xmmH			; xmmH=(all 1's)
-	maskmovdqu xmmA,xmmH			; movntdqu XMMWORD [edi], xmmA
-	add	edi, byte SIZEOF_XMMWORD	; outptr
-	maskmovdqu xmmD,xmmH			; movntdqu XMMWORD [edi], xmmD
-	add	edi, byte SIZEOF_XMMWORD	; outptr
-	maskmovdqu xmmF,xmmH			; movntdqu XMMWORD [edi], xmmF
-	add	edi, byte SIZEOF_XMMWORD	; outptr
+	movdqu	XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+	movdqu	XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
+	movdqu	XMMWORD [edi+2*SIZEOF_XMMWORD], xmmF
 .out0:
+	add	edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD	; outptr
 	sub	ecx, byte SIZEOF_XMMWORD
 	jz	near .nextrow
 
@@ -283,26 +279,23 @@
 	alignx	16,7
 
 .column_st32:
-	pcmpeqb	xmmH,xmmH			; xmmH=(all 1's)
 	lea	ecx, [ecx+ecx*2]		; imul ecx, RGB_PIXELSIZE
 	cmp	ecx, byte 2*SIZEOF_XMMWORD
 	jb	short .column_st16
-	maskmovdqu xmmA,xmmH			; movntdqu XMMWORD [edi], xmmA
-	add	edi, byte SIZEOF_XMMWORD	; outptr
-	maskmovdqu xmmD,xmmH			; movntdqu XMMWORD [edi], xmmD
-	add	edi, byte SIZEOF_XMMWORD	; outptr
+	movdqu	XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+	movdqu	XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
+	add	edi, byte 2*SIZEOF_XMMWORD	; outptr
 	movdqa	xmmA,xmmF
 	sub	ecx, byte 2*SIZEOF_XMMWORD
 	jmp	short .column_st15
 .column_st16:
 	cmp	ecx, byte SIZEOF_XMMWORD
 	jb	short .column_st15
-	maskmovdqu xmmA,xmmH			; movntdqu XMMWORD [edi], xmmA
+	movdqu	XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
 	add	edi, byte SIZEOF_XMMWORD	; outptr
 	movdqa	xmmA,xmmD
 	sub	ecx, byte SIZEOF_XMMWORD
 .column_st15:
-%ifdef STRICT_MEMORY_ACCESS
 	; Store the lower 8 bytes of xmmA to the output when it has enough
 	; space.
 	cmp	ecx, byte SIZEOF_MMWORD
@@ -336,47 +329,6 @@
 	test	ecx, ecx
 	jz	short .nextrow
 	mov	BYTE [edi], al
-%else
-	mov	eax,ecx
-	xor	ecx, byte 0x0F
-	shl	ecx, 2
-	movd	xmmB,ecx
-	psrlq	xmmH,4
-	pcmpeqb	xmmE,xmmE
-	psrlq	xmmH,xmmB
-	psrlq	xmmE,xmmB
-	punpcklbw xmmE,xmmH
-	; ----------------
-	mov	ecx,edi
-	and	ecx, byte SIZEOF_XMMWORD-1
-	jz	short .adj0
-	add	eax,ecx
-	cmp	eax, byte SIZEOF_XMMWORD
-	ja	short .adj0
-	and	edi, byte (-SIZEOF_XMMWORD)	; align to 16-byte boundary
-	shl	ecx, 3			; pslldq xmmA,ecx & pslldq xmmE,ecx
-	movdqa	xmmG,xmmA
-	movdqa	xmmC,xmmE
-	pslldq	xmmA, SIZEOF_XMMWORD/2
-	pslldq	xmmE, SIZEOF_XMMWORD/2
-	movd	xmmD,ecx
-	sub	ecx, byte (SIZEOF_XMMWORD/2)*BYTE_BIT
-	jb	short .adj1
-	movd	xmmF,ecx
-	psllq	xmmA,xmmF
-	psllq	xmmE,xmmF
-	jmp	short .adj0
-.adj1:	neg	ecx
-	movd	xmmF,ecx
-	psrlq	xmmA,xmmF
-	psrlq	xmmE,xmmF
-	psllq	xmmG,xmmD
-	psllq	xmmC,xmmD
-	por	xmmA,xmmG
-	por	xmmE,xmmC
-.adj0:	; ----------------
-	maskmovdqu xmmA,xmmE			; movntdqu XMMWORD [edi], xmmA
-%endif ; STRICT_MEMORY_ACCESS ; ---------------
 
 %else ; RGB_PIXELSIZE == 4 ; -----------
 
@@ -421,19 +373,14 @@
 	movntdq	XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
 	movntdq	XMMWORD [edi+2*SIZEOF_XMMWORD], xmmC
 	movntdq	XMMWORD [edi+3*SIZEOF_XMMWORD], xmmH
-	add	edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD	; outptr
 	jmp	short .out0
 .out1:	; --(unaligned)-----------------
-	pcmpeqb    xmmE,xmmE			; xmmE=(all 1's)
-	maskmovdqu xmmA,xmmE			; movntdqu XMMWORD [edi], xmmA
-	add	edi, byte SIZEOF_XMMWORD	; outptr
-	maskmovdqu xmmD,xmmE			; movntdqu XMMWORD [edi], xmmD
-	add	edi, byte SIZEOF_XMMWORD	; outptr
-	maskmovdqu xmmC,xmmE			; movntdqu XMMWORD [edi], xmmC
-	add	edi, byte SIZEOF_XMMWORD	; outptr
-	maskmovdqu xmmH,xmmE			; movntdqu XMMWORD [edi], xmmH
-	add	edi, byte SIZEOF_XMMWORD	; outptr
+	movdqu	XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+	movdqu	XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
+	movdqu	XMMWORD [edi+2*SIZEOF_XMMWORD], xmmC
+	movdqu	XMMWORD [edi+3*SIZEOF_XMMWORD], xmmH
 .out0:
+	add	edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD	; outptr
 	sub	ecx, byte SIZEOF_XMMWORD
 	jz	near .nextrow
 
@@ -444,25 +391,22 @@
 	alignx	16,7
 
 .column_st32:
-	pcmpeqb	xmmE,xmmE			; xmmE=(all 1's)
 	cmp	ecx, byte SIZEOF_XMMWORD/2
 	jb	short .column_st16
-	maskmovdqu xmmA,xmmE			; movntdqu XMMWORD [edi], xmmA
-	add	edi, byte SIZEOF_XMMWORD	; outptr
-	maskmovdqu xmmD,xmmE			; movntdqu XMMWORD [edi], xmmD
-	add	edi, byte SIZEOF_XMMWORD	; outptr
+	movdqu	XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+	movdqu	XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
+	add	edi, byte 2*SIZEOF_XMMWORD	; outptr
 	movdqa	xmmA,xmmC
 	movdqa	xmmD,xmmH
 	sub	ecx, byte SIZEOF_XMMWORD/2
 .column_st16:
 	cmp	ecx, byte SIZEOF_XMMWORD/4
 	jb	short .column_st15
-	maskmovdqu xmmA,xmmE			; movntdqu XMMWORD [edi], xmmA
+	movdqu	XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
 	add	edi, byte SIZEOF_XMMWORD	; outptr
 	movdqa	xmmA,xmmD
 	sub	ecx, byte SIZEOF_XMMWORD/4
 .column_st15:
-%ifdef STRICT_MEMORY_ACCESS
 	; Store two pixels (8 bytes) of xmmA to the output when it has enough
 	; space.
 	cmp	ecx, byte SIZEOF_XMMWORD/8
@@ -477,47 +421,6 @@
 	test	ecx, ecx
 	jz	short .nextrow
 	movd	DWORD [edi], xmmA
-%else
-	cmp	ecx, byte SIZEOF_XMMWORD/16
-	jb	short .nextrow
-	mov	eax,ecx
-	xor	ecx, byte 0x03
-	inc	ecx
-	shl	ecx, 4
-	movd	xmmF,ecx
-	psrlq	xmmE,xmmF
-	punpcklbw xmmE,xmmE
-	; ----------------
-	mov	ecx,edi
-	and	ecx, byte SIZEOF_XMMWORD-1
-	jz	short .adj0
-	lea	eax, [ecx+eax*4]	; RGB_PIXELSIZE
-	cmp	eax, byte SIZEOF_XMMWORD
-	ja	short .adj0
-	and	edi, byte (-SIZEOF_XMMWORD)	; align to 16-byte boundary
-	shl	ecx, 3			; pslldq xmmA,ecx & pslldq xmmE,ecx
-	movdqa	xmmB,xmmA
-	movdqa	xmmG,xmmE
-	pslldq	xmmA, SIZEOF_XMMWORD/2
-	pslldq	xmmE, SIZEOF_XMMWORD/2
-	movd	xmmC,ecx
-	sub	ecx, byte (SIZEOF_XMMWORD/2)*BYTE_BIT
-	jb	short .adj1
-	movd	xmmH,ecx
-	psllq	xmmA,xmmH
-	psllq	xmmE,xmmH
-	jmp	short .adj0
-.adj1:	neg	ecx
-	movd	xmmH,ecx
-	psrlq	xmmA,xmmH
-	psrlq	xmmE,xmmH
-	psllq	xmmB,xmmC
-	psllq	xmmG,xmmC
-	por	xmmA,xmmB
-	por	xmmE,xmmG
-.adj0:	; ----------------
-	maskmovdqu xmmA,xmmE			; movntdqu XMMWORD [edi], xmmA
-%endif ; STRICT_MEMORY_ACCESS ; ---------------
 
 %endif ; RGB_PIXELSIZE ; ---------------
 

diff --git a/simd/jdmrgss2-64.asm b/simd/jdmrgss2-64.asm
index 3bf4148..55cdbda 100644
--- a/simd/jdmrgss2-64.asm
+++ b/simd/jdmrgss2-64.asm

@@ -1,7 +1,7 @@
 ;
 ; jdmrgss2-64.asm - merged upsampling/color conversion (64-bit SSE2)
 ;
-; Copyright 2009 Pierre Ossman <[email protected]> for Cendio AB
+; Copyright 2009, 2012 Pierre Ossman <[email protected]> for Cendio AB
 ; Copyright 2009 D. R. Commander
 ;
 ; Based on
@@ -12,7 +12,7 @@
 ; This file should be assembled with NASM (Netwide Assembler),
 ; can *not* be assembled with Microsoft's MASM or any compatible
 ; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ for
+; NASM is available from http://nasm.sourceforge.net/ or
 ; http://sourceforge.net/project/showfiles.php?group_id=6208
 ;
 ; [TAB8]
@@ -252,17 +252,13 @@
 	movntdq	XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
 	movntdq	XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
 	movntdq	XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmF
-	add	rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD	; outptr
 	jmp	short .out0
 .out1:	; --(unaligned)-----------------
-	pcmpeqb    xmmH,xmmH			; xmmH=(all 1's)
-	maskmovdqu xmmA,xmmH			; movntdqu XMMWORD [rdi], xmmA
-	add	rdi, byte SIZEOF_XMMWORD	; outptr
-	maskmovdqu xmmD,xmmH			; movntdqu XMMWORD [rdi], xmmD
-	add	rdi, byte SIZEOF_XMMWORD	; outptr
-	maskmovdqu xmmF,xmmH			; movntdqu XMMWORD [rdi], xmmF
-	add	rdi, byte SIZEOF_XMMWORD	; outptr
+	movdqu	XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+	movdqu	XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
+	movdqu	XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmF
 .out0:
+	add	rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD	; outptr
 	sub	rcx, byte SIZEOF_XMMWORD
 	jz	near .endcolumn
 
@@ -275,26 +271,23 @@
 	jmp	near .columnloop
 
 .column_st32:
-	pcmpeqb	xmmH,xmmH			; xmmH=(all 1's)
 	lea	rcx, [rcx+rcx*2]		; imul ecx, RGB_PIXELSIZE
 	cmp	rcx, byte 2*SIZEOF_XMMWORD
 	jb	short .column_st16
-	maskmovdqu xmmA,xmmH			; movntdqu XMMWORD [rdi], xmmA
-	add	rdi, byte SIZEOF_XMMWORD	; outptr
-	maskmovdqu xmmD,xmmH			; movntdqu XMMWORD [rdi], xmmD
-	add	rdi, byte SIZEOF_XMMWORD	; outptr
+	movdqu	XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+	movdqu	XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
+	add	rdi, byte 2*SIZEOF_XMMWORD	; outptr
 	movdqa	xmmA,xmmF
 	sub	rcx, byte 2*SIZEOF_XMMWORD
 	jmp	short .column_st15
 .column_st16:
 	cmp	rcx, byte SIZEOF_XMMWORD
 	jb	short .column_st15
-	maskmovdqu xmmA,xmmH			; movntdqu XMMWORD [rdi], xmmA
+	movdqu	XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
 	add	rdi, byte SIZEOF_XMMWORD	; outptr
 	movdqa	xmmA,xmmD
 	sub	rcx, byte SIZEOF_XMMWORD
 .column_st15:
-%ifdef STRICT_MEMORY_ACCESS
 	; Store the lower 8 bytes of xmmA to the output when it has enough
 	; space.
 	cmp	rcx, byte SIZEOF_MMWORD
@@ -328,47 +321,6 @@
 	test	rcx, rcx
 	jz	short .endcolumn
 	mov	BYTE [rdi], al
-%else
-	mov	rax,rcx
-	xor	rcx, byte 0x0F
-	shl	rcx, 2
-	movd	xmmB,ecx
-	psrlq	xmmH,4
-	pcmpeqb	xmmE,xmmE
-	psrlq	xmmH,xmmB
-	psrlq	xmmE,xmmB
-	punpcklbw xmmE,xmmH
-	; ----------------
-	mov	rcx,rdi
-	and	rcx, byte SIZEOF_XMMWORD-1
-	jz	short .adj0
-	add	rax,rcx
-	cmp	rax, byte SIZEOF_XMMWORD
-	ja	short .adj0
-	and	rdi, byte (-SIZEOF_XMMWORD)	; align to 16-byte boundary
-	shl	rcx, 3			; pslldq xmmA,ecx & pslldq xmmE,ecx
-	movdqa	xmmG,xmmA
-	movdqa	xmmC,xmmE
-	pslldq	xmmA, SIZEOF_XMMWORD/2
-	pslldq	xmmE, SIZEOF_XMMWORD/2
-	movd	xmmD,ecx
-	sub	rcx, byte (SIZEOF_XMMWORD/2)*BYTE_BIT
-	jb	short .adj1
-	movd	xmmF,ecx
-	psllq	xmmA,xmmF
-	psllq	xmmE,xmmF
-	jmp	short .adj0
-.adj1:	neg	rcx
-	movd	xmmF,ecx
-	psrlq	xmmA,xmmF
-	psrlq	xmmE,xmmF
-	psllq	xmmG,xmmD
-	psllq	xmmC,xmmD
-	por	xmmA,xmmG
-	por	xmmE,xmmC
-.adj0:	; ----------------
-	maskmovdqu xmmA,xmmE			; movntdqu XMMWORD [edi], xmmA
-%endif ; STRICT_MEMORY_ACCESS ; ---------------
 
 %else ; RGB_PIXELSIZE == 4 ; -----------
 
@@ -413,19 +365,14 @@
 	movntdq	XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
 	movntdq	XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmC
 	movntdq	XMMWORD [rdi+3*SIZEOF_XMMWORD], xmmH
-	add	rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD	; outptr
 	jmp	short .out0
 .out1:	; --(unaligned)-----------------
-	pcmpeqb    xmmE,xmmE			; xmmE=(all 1's)
-	maskmovdqu xmmA,xmmE			; movntdqu XMMWORD [rdi], xmmA
-	add	rdi, byte SIZEOF_XMMWORD	; outptr
-	maskmovdqu xmmD,xmmE			; movntdqu XMMWORD [rdi], xmmD
-	add	rdi, byte SIZEOF_XMMWORD	; outptr
-	maskmovdqu xmmC,xmmE			; movntdqu XMMWORD [rdi], xmmC
-	add	rdi, byte SIZEOF_XMMWORD	; outptr
-	maskmovdqu xmmH,xmmE			; movntdqu XMMWORD [rdi], xmmH
-	add	rdi, byte SIZEOF_XMMWORD	; outptr
+	movdqu	XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+	movdqu	XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
+	movdqu	XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmC
+	movdqu	XMMWORD [rdi+3*SIZEOF_XMMWORD], xmmH
 .out0:
+	add	rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD	; outptr
 	sub	rcx, byte SIZEOF_XMMWORD
 	jz	near .endcolumn
 
@@ -438,25 +385,22 @@
 	jmp	near .columnloop
 
 .column_st32:
-	pcmpeqb	xmmE,xmmE			; xmmE=(all 1's)
 	cmp	rcx, byte SIZEOF_XMMWORD/2
 	jb	short .column_st16
-	maskmovdqu xmmA,xmmE			; movntdqu XMMWORD [rdi], xmmA
-	add	rdi, byte SIZEOF_XMMWORD	; outptr
-	maskmovdqu xmmD,xmmE			; movntdqu XMMWORD [rdi], xmmD
-	add	rdi, byte SIZEOF_XMMWORD	; outptr
+	movdqu	XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+	movdqu	XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
+	add	rdi, byte 2*SIZEOF_XMMWORD	; outptr
 	movdqa	xmmA,xmmC
 	movdqa	xmmD,xmmH
 	sub	rcx, byte SIZEOF_XMMWORD/2
 .column_st16:
 	cmp	rcx, byte SIZEOF_XMMWORD/4
 	jb	short .column_st15
-	maskmovdqu xmmA,xmmE			; movntdqu XMMWORD [edi], xmmA
+	movdqu	XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
 	add	rdi, byte SIZEOF_XMMWORD	; outptr
 	movdqa	xmmA,xmmD
 	sub	rcx, byte SIZEOF_XMMWORD/4
 .column_st15:
-%ifdef STRICT_MEMORY_ACCESS
 	; Store two pixels (8 bytes) of xmmA to the output when it has enough
 	; space.
 	cmp	rcx, byte SIZEOF_XMMWORD/8
@@ -471,47 +415,6 @@
 	test	rcx, rcx
 	jz	short .endcolumn
 	movd	DWORD [rdi], xmmA
-%else
-	cmp	rcx, byte SIZEOF_XMMWORD/16
-	jb	near .endcolumn
-	mov	rax,rcx
-	xor	rcx, byte 0x03
-	inc	rcx
-	shl	rcx, 4
-	movd	xmmF,ecx
-	psrlq	xmmE,xmmF
-	punpcklbw xmmE,xmmE
-	; ----------------
-	mov	rcx,rdi
-	and	rcx, byte SIZEOF_XMMWORD-1
-	jz	short .adj0
-	lea	rax, [rcx+rax*4]	; RGB_PIXELSIZE
-	cmp	rax, byte SIZEOF_XMMWORD
-	ja	short .adj0
-	and	rdi, byte (-SIZEOF_XMMWORD)	; align to 16-byte boundary
-	shl	rcx, 3			; pslldq xmmA,ecx & pslldq xmmE,ecx
-	movdqa	xmmB,xmmA
-	movdqa	xmmG,xmmE
-	pslldq	xmmA, SIZEOF_XMMWORD/2
-	pslldq	xmmE, SIZEOF_XMMWORD/2
-	movd	xmmC,ecx
-	sub	rcx, byte (SIZEOF_XMMWORD/2)*BYTE_BIT
-	jb	short .adj1
-	movd	xmmH,ecx
-	psllq	xmmA,xmmH
-	psllq	xmmE,xmmH
-	jmp	short .adj0
-.adj1:	neg	rcx
-	movd	xmmH,ecx
-	psrlq	xmmA,xmmH
-	psrlq	xmmE,xmmH
-	psllq	xmmB,xmmC
-	psllq	xmmG,xmmC
-	por	xmmA,xmmB
-	por	xmmE,xmmG
-.adj0:	; ----------------
-	maskmovdqu xmmA,xmmE			; movntdqu XMMWORD [edi], xmmA
-%endif ; STRICT_MEMORY_ACCESS ; ---------------
 
 %endif ; RGB_PIXELSIZE ; ---------------
 

diff --git a/simd/jdmrgss2.asm b/simd/jdmrgss2.asm
index f0fcc2c..575f300 100644
--- a/simd/jdmrgss2.asm
+++ b/simd/jdmrgss2.asm

@@ -1,7 +1,7 @@
 ;
 ; jdmrgss2.asm - merged upsampling/color conversion (SSE2)
 ;
-; Copyright 2009 Pierre Ossman <[email protected]> for Cendio AB
+; Copyright 2009, 2012 Pierre Ossman <[email protected]> for Cendio AB
 ;
 ; Based on
 ; x86 SIMD extension for IJG JPEG library
@@ -264,17 +264,13 @@
 	movntdq	XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
 	movntdq	XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
 	movntdq	XMMWORD [edi+2*SIZEOF_XMMWORD], xmmF
-	add	edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD	; outptr
 	jmp	short .out0
 .out1:	; --(unaligned)-----------------
-	pcmpeqb    xmmH,xmmH			; xmmH=(all 1's)
-	maskmovdqu xmmA,xmmH			; movntdqu XMMWORD [edi], xmmA
-	add	edi, byte SIZEOF_XMMWORD	; outptr
-	maskmovdqu xmmD,xmmH			; movntdqu XMMWORD [edi], xmmD
-	add	edi, byte SIZEOF_XMMWORD	; outptr
-	maskmovdqu xmmF,xmmH			; movntdqu XMMWORD [edi], xmmF
-	add	edi, byte SIZEOF_XMMWORD	; outptr
+	movdqu	XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+	movdqu	XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
+	movdqu	XMMWORD [edi+2*SIZEOF_XMMWORD], xmmF
 .out0:
+	add	edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD	; outptr
 	sub	ecx, byte SIZEOF_XMMWORD
 	jz	near .endcolumn
 
@@ -288,26 +284,23 @@
 	alignx	16,7
 
 .column_st32:
-	pcmpeqb	xmmH,xmmH			; xmmH=(all 1's)
 	lea	ecx, [ecx+ecx*2]		; imul ecx, RGB_PIXELSIZE
 	cmp	ecx, byte 2*SIZEOF_XMMWORD
 	jb	short .column_st16
-	maskmovdqu xmmA,xmmH			; movntdqu XMMWORD [edi], xmmA
-	add	edi, byte SIZEOF_XMMWORD	; outptr
-	maskmovdqu xmmD,xmmH			; movntdqu XMMWORD [edi], xmmD
-	add	edi, byte SIZEOF_XMMWORD	; outptr
+	movdqu	XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+	movdqu	XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
+	add	edi, byte 2*SIZEOF_XMMWORD	; outptr
 	movdqa	xmmA,xmmF
 	sub	ecx, byte 2*SIZEOF_XMMWORD
 	jmp	short .column_st15
 .column_st16:
 	cmp	ecx, byte SIZEOF_XMMWORD
 	jb	short .column_st15
-	maskmovdqu xmmA,xmmH			; movntdqu XMMWORD [edi], xmmA
+	movdqu	XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
 	add	edi, byte SIZEOF_XMMWORD	; outptr
 	movdqa	xmmA,xmmD
 	sub	ecx, byte SIZEOF_XMMWORD
 .column_st15:
-%ifdef STRICT_MEMORY_ACCESS
 	; Store the lower 8 bytes of xmmA to the output when it has enough
 	; space.
 	cmp	ecx, byte SIZEOF_MMWORD
@@ -341,47 +334,6 @@
 	test	ecx, ecx
 	jz	short .endcolumn
 	mov	BYTE [edi], al
-%else
-	mov	eax,ecx
-	xor	ecx, byte 0x0F
-	shl	ecx, 2
-	movd	xmmB,ecx
-	psrlq	xmmH,4
-	pcmpeqb	xmmE,xmmE
-	psrlq	xmmH,xmmB
-	psrlq	xmmE,xmmB
-	punpcklbw xmmE,xmmH
-	; ----------------
-	mov	ecx,edi
-	and	ecx, byte SIZEOF_XMMWORD-1
-	jz	short .adj0
-	add	eax,ecx
-	cmp	eax, byte SIZEOF_XMMWORD
-	ja	short .adj0
-	and	edi, byte (-SIZEOF_XMMWORD)	; align to 16-byte boundary
-	shl	ecx, 3			; pslldq xmmA,ecx & pslldq xmmE,ecx
-	movdqa	xmmG,xmmA
-	movdqa	xmmC,xmmE
-	pslldq	xmmA, SIZEOF_XMMWORD/2
-	pslldq	xmmE, SIZEOF_XMMWORD/2
-	movd	xmmD,ecx
-	sub	ecx, byte (SIZEOF_XMMWORD/2)*BYTE_BIT
-	jb	short .adj1
-	movd	xmmF,ecx
-	psllq	xmmA,xmmF
-	psllq	xmmE,xmmF
-	jmp	short .adj0
-.adj1:	neg	ecx
-	movd	xmmF,ecx
-	psrlq	xmmA,xmmF
-	psrlq	xmmE,xmmF
-	psllq	xmmG,xmmD
-	psllq	xmmC,xmmD
-	por	xmmA,xmmG
-	por	xmmE,xmmC
-.adj0:	; ----------------
-	maskmovdqu xmmA,xmmE			; movntdqu XMMWORD [edi], xmmA
-%endif ; STRICT_MEMORY_ACCESS ; ---------------
 
 %else ; RGB_PIXELSIZE == 4 ; -----------
 
@@ -426,19 +378,14 @@
 	movntdq	XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
 	movntdq	XMMWORD [edi+2*SIZEOF_XMMWORD], xmmC
 	movntdq	XMMWORD [edi+3*SIZEOF_XMMWORD], xmmH
-	add	edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD	; outptr
 	jmp	short .out0
 .out1:	; --(unaligned)-----------------
-	pcmpeqb    xmmE,xmmE			; xmmE=(all 1's)
-	maskmovdqu xmmA,xmmE			; movntdqu XMMWORD [edi], xmmA
-	add	edi, byte SIZEOF_XMMWORD	; outptr
-	maskmovdqu xmmD,xmmE			; movntdqu XMMWORD [edi], xmmD
-	add	edi, byte SIZEOF_XMMWORD	; outptr
-	maskmovdqu xmmC,xmmE			; movntdqu XMMWORD [edi], xmmC
-	add	edi, byte SIZEOF_XMMWORD	; outptr
-	maskmovdqu xmmH,xmmE			; movntdqu XMMWORD [edi], xmmH
-	add	edi, byte SIZEOF_XMMWORD	; outptr
+	movdqu	XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+	movdqu	XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
+	movdqu	XMMWORD [edi+2*SIZEOF_XMMWORD], xmmC
+	movdqu	XMMWORD [edi+3*SIZEOF_XMMWORD], xmmH
 .out0:
+	add	edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD	; outptr
 	sub	ecx, byte SIZEOF_XMMWORD
 	jz	near .endcolumn
 
@@ -452,80 +399,36 @@
 	alignx	16,7
 
 .column_st32:
-	pcmpeqb	xmmE,xmmE			; xmmE=(all 1's)
 	cmp	ecx, byte SIZEOF_XMMWORD/2
 	jb	short .column_st16
-	maskmovdqu xmmA,xmmE			; movntdqu XMMWORD [edi], xmmA
-	add	edi, byte SIZEOF_XMMWORD	; outptr
-	maskmovdqu xmmD,xmmE			; movntdqu XMMWORD [edi], xmmD
-	add	edi, byte SIZEOF_XMMWORD	; outptr
+	movdqu	XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+	movdqu	XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
+	add	edi, byte 2*SIZEOF_XMMWORD	; outptr
 	movdqa	xmmA,xmmC
 	movdqa	xmmD,xmmH
 	sub	ecx, byte SIZEOF_XMMWORD/2
 .column_st16:
 	cmp	ecx, byte SIZEOF_XMMWORD/4
 	jb	short .column_st15
-	maskmovdqu xmmA,xmmE			; movntdqu XMMWORD [edi], xmmA
+	movdqu	XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
 	add	edi, byte SIZEOF_XMMWORD	; outptr
 	movdqa	xmmA,xmmD
 	sub	ecx, byte SIZEOF_XMMWORD/4
 .column_st15:
-%ifdef STRICT_MEMORY_ACCESS
 	; Store two pixels (8 bytes) of xmmA to the output when it has enough
 	; space.
 	cmp	ecx, byte SIZEOF_XMMWORD/8
 	jb	short .column_st7
 	movq	MMWORD [edi], xmmA
-	add	edi, byte SIZEOF_XMMWORD/2
+	add	edi, byte SIZEOF_XMMWORD/8*4
 	sub	ecx, byte SIZEOF_XMMWORD/8
-	psrldq	xmmA, 64
+	psrldq	xmmA, SIZEOF_XMMWORD/8*4
 .column_st7:
 	; Store one pixel (4 bytes) of xmmA to the output when it has enough
 	; space.
 	test	ecx, ecx
 	jz	short .endcolumn
 	movd	DWORD [edi], xmmA
-%else
-	cmp	ecx, byte SIZEOF_XMMWORD/16
-	jb	short .endcolumn
-	mov	eax,ecx
-	xor	ecx, byte 0x03
-	inc	ecx
-	shl	ecx, 4
-	movd	xmmF,ecx
-	psrlq	xmmE,xmmF
-	punpcklbw xmmE,xmmE
-	; ----------------
-	mov	ecx,edi
-	and	ecx, byte SIZEOF_XMMWORD-1
-	jz	short .adj0
-	lea	eax, [ecx+eax*4]	; RGB_PIXELSIZE
-	cmp	eax, byte SIZEOF_XMMWORD
-	ja	short .adj0
-	and	edi, byte (-SIZEOF_XMMWORD)	; align to 16-byte boundary
-	shl	ecx, 3			; pslldq xmmA,ecx & pslldq xmmE,ecx
-	movdqa	xmmB,xmmA
-	movdqa	xmmG,xmmE
-	pslldq	xmmA, SIZEOF_XMMWORD/2
-	pslldq	xmmE, SIZEOF_XMMWORD/2
-	movd	xmmC,ecx
-	sub	ecx, byte (SIZEOF_XMMWORD/2)*BYTE_BIT
-	jb	short .adj1
-	movd	xmmH,ecx
-	psllq	xmmA,xmmH
-	psllq	xmmE,xmmH
-	jmp	short .adj0
-.adj1:	neg	ecx
-	movd	xmmH,ecx
-	psrlq	xmmA,xmmH
-	psrlq	xmmE,xmmH
-	psllq	xmmB,xmmC
-	psllq	xmmG,xmmC
-	por	xmmA,xmmB
-	por	xmmE,xmmG
-.adj0:	; ----------------
-	maskmovdqu xmmA,xmmE			; movntdqu XMMWORD [edi], xmmA
-%endif ; STRICT_MEMORY_ACCESS ; ---------------
 
 %endif ; RGB_PIXELSIZE ; ---------------
 

diff --git a/simd/jsimdcfg.inc.h b/simd/jsimdcfg.inc.h
index 0dacd06..583b7e3 100644
--- a/simd/jsimdcfg.inc.h
+++ b/simd/jsimdcfg.inc.h

@@ -15,54 +15,51 @@
 #include "../jmorecfg.h"
 #include "jsimd.h"
 
-#define define(var) %define _cpp_protection_##var
-#define definev(var) %define _cpp_protection_##var var
-
 ;
 ; -- jpeglib.h
 ;
 
-definev(DCTSIZE)
-definev(DCTSIZE2)
+%define _cpp_protection_DCTSIZE DCTSIZE
+%define _cpp_protection_DCTSIZE2 DCTSIZE2
 
 ;
 ; -- jmorecfg.h
 ;
 
-definev(RGB_RED)
-definev(RGB_GREEN)
-definev(RGB_BLUE)
-definev(RGB_PIXELSIZE)
+%define _cpp_protection_RGB_RED RGB_RED
+%define _cpp_protection_RGB_GREEN RGB_GREEN
+%define _cpp_protection_RGB_BLUE RGB_BLUE
+%define _cpp_protection_RGB_PIXELSIZE RGB_PIXELSIZE
 
-definev(EXT_RGB_RED)
-definev(EXT_RGB_GREEN)
-definev(EXT_RGB_BLUE)
-definev(EXT_RGB_PIXELSIZE)
+%define _cpp_protection_EXT_RGB_RED EXT_RGB_RED
+%define _cpp_protection_EXT_RGB_GREEN EXT_RGB_GREEN
+%define _cpp_protection_EXT_RGB_BLUE EXT_RGB_BLUE
+%define _cpp_protection_EXT_RGB_PIXELSIZE EXT_RGB_PIXELSIZE
 
-definev(EXT_RGBX_RED)
-definev(EXT_RGBX_GREEN)
-definev(EXT_RGBX_BLUE)
-definev(EXT_RGBX_PIXELSIZE)
+%define _cpp_protection_EXT_RGBX_RED EXT_RGBX_RED
+%define _cpp_protection_EXT_RGBX_GREEN EXT_RGBX_GREEN
+%define _cpp_protection_EXT_RGBX_BLUE EXT_RGBX_BLUE
+%define _cpp_protection_EXT_RGBX_PIXELSIZE EXT_RGBX_PIXELSIZE
 
-definev(EXT_BGR_RED)
-definev(EXT_BGR_GREEN)
-definev(EXT_BGR_BLUE)
-definev(EXT_BGR_PIXELSIZE)
+%define _cpp_protection_EXT_BGR_RED EXT_BGR_RED
+%define _cpp_protection_EXT_BGR_GREEN EXT_BGR_GREEN
+%define _cpp_protection_EXT_BGR_BLUE EXT_BGR_BLUE
+%define _cpp_protection_EXT_BGR_PIXELSIZE EXT_BGR_PIXELSIZE
 
-definev(EXT_BGRX_RED)
-definev(EXT_BGRX_GREEN)
-definev(EXT_BGRX_BLUE)
-definev(EXT_BGRX_PIXELSIZE)
+%define _cpp_protection_EXT_BGRX_RED EXT_BGRX_RED
+%define _cpp_protection_EXT_BGRX_GREEN EXT_BGRX_GREEN
+%define _cpp_protection_EXT_BGRX_BLUE EXT_BGRX_BLUE
+%define _cpp_protection_EXT_BGRX_PIXELSIZE EXT_BGRX_PIXELSIZE
 
-definev(EXT_XBGR_RED)
-definev(EXT_XBGR_GREEN)
-definev(EXT_XBGR_BLUE)
-definev(EXT_XBGR_PIXELSIZE)
+%define _cpp_protection_EXT_XBGR_RED EXT_XBGR_RED
+%define _cpp_protection_EXT_XBGR_GREEN EXT_XBGR_GREEN
+%define _cpp_protection_EXT_XBGR_BLUE EXT_XBGR_BLUE
+%define _cpp_protection_EXT_XBGR_PIXELSIZE EXT_XBGR_PIXELSIZE
 
-definev(EXT_XRGB_RED)
-definev(EXT_XRGB_GREEN)
-definev(EXT_XRGB_BLUE)
-definev(EXT_XRGB_PIXELSIZE)
+%define _cpp_protection_EXT_XRGB_RED EXT_XRGB_RED
+%define _cpp_protection_EXT_XRGB_GREEN EXT_XRGB_GREEN
+%define _cpp_protection_EXT_XRGB_BLUE EXT_XRGB_BLUE
+%define _cpp_protection_EXT_XRGB_PIXELSIZE EXT_XRGB_PIXELSIZE
 
 %define RGBX_FILLER_0XFF        1
 
@@ -73,7 +70,7 @@
 %define JSAMPLE                 byte          ; unsigned char
 %define SIZEOF_JSAMPLE          SIZEOF_BYTE   ; sizeof(JSAMPLE)
 
-definev(CENTERJSAMPLE)
+%define _cpp_protection_CENTERJSAMPLE CENTERJSAMPLE
 
 ; Representation of a DCT frequency coefficient.
 ; On this SIMD implementation, this must be 'short'.
@@ -126,74 +123,74 @@
 ; -- jsimd.h
 ;
 
-definev(JSIMD_NONE)
-definev(JSIMD_MMX)
-definev(JSIMD_3DNOW)
-definev(JSIMD_SSE)
-definev(JSIMD_SSE2)
+%define _cpp_protection_JSIMD_NONE JSIMD_NONE
+%define _cpp_protection_JSIMD_MMX JSIMD_MMX
+%define _cpp_protection_JSIMD_3DNOW JSIMD_3DNOW
+%define _cpp_protection_JSIMD_SSE JSIMD_SSE
+%define _cpp_protection_JSIMD_SSE2 JSIMD_SSE2
 
 ; Short forms of external names for systems with brain-damaged linkers.
 ;
 #ifdef NEED_SHORT_EXTERNAL_NAMES
-definev(jpeg_simd_cpu_support)
-definev(jsimd_rgb_ycc_convert_mmx)
-definev(jsimd_ycc_rgb_convert_mmx)
-definev(jconst_rgb_ycc_convert_sse2)
-definev(jsimd_rgb_ycc_convert_sse2)
-definev(jconst_ycc_rgb_convert_sse2)
-definev(jsimd_ycc_rgb_convert_sse2)
-definev(jsimd_h2v2_downsample_mmx)
-definev(jsimd_h2v1_downsample_mmx)
-definev(jsimd_h2v2_downsample_sse2)
-definev(jsimd_h2v1_downsample_sse2)
-definev(jsimd_h2v2_upsample_mmx)
-definev(jsimd_h2v1_upsample_mmx)
-definev(jsimd_h2v1_fancy_upsample_mmx)
-definev(jsimd_h2v2_fancy_upsample_mmx)
-definev(jsimd_h2v1_merged_upsample_mmx)
-definev(jsimd_h2v2_merged_upsample_mmx)
-definev(jsimd_h2v2_upsample_sse2)
-definev(jsimd_h2v1_upsample_sse2)
-definev(jconst_fancy_upsample_sse2)
-definev(jsimd_h2v1_fancy_upsample_sse2)
-definev(jsimd_h2v2_fancy_upsample_sse2)
-definev(jconst_merged_upsample_sse2)
-definev(jsimd_h2v1_merged_upsample_sse2)
-definev(jsimd_h2v2_merged_upsample_sse2)
-definev(jsimd_convsamp_mmx)
-definev(jsimd_convsamp_sse2)
-definev(jsimd_convsamp_float_3dnow)
-definev(jsimd_convsamp_float_sse)
-definev(jsimd_convsamp_float_sse2)
-definev(jsimd_fdct_islow_mmx)
-definev(jsimd_fdct_ifast_mmx)
-definev(jconst_fdct_islow_sse2)
-definev(jsimd_fdct_islow_sse2)
-definev(jconst_fdct_ifast_sse2)
-definev(jsimd_fdct_ifast_sse2)
-definev(jsimd_fdct_float_3dnow)
-definev(jconst_fdct_float_sse)
-definev(jsimd_fdct_float_sse)
-definev(jsimd_quantize_mmx)
-definev(jsimd_quantize_sse2)
-definev(jsimd_quantize_float_3dnow)
-definev(jsimd_quantize_float_sse)
-definev(jsimd_quantize_float_sse2)
-definev(jsimd_idct_2x2_mmx)
-definev(jsimd_idct_4x4_mmx)
-definev(jconst_idct_red_sse2)
-definev(jsimd_idct_2x2_sse2)
-definev(jsimd_idct_4x4_sse2)
-definev(jsimd_idct_islow_mmx)
-definev(jsimd_idct_ifast_mmx)
-definev(jconst_idct_islow_sse2)
-definev(jsimd_idct_islow_sse2)
-definev(jconst_idct_ifast_sse2)
-definev(jsimd_idct_ifast_sse2)
-definev(jsimd_idct_float_3dnow)
-definev(jconst_idct_float_sse)
-definev(jsimd_idct_float_sse)
-definev(jconst_idct_float_sse2)
-definev(jsimd_idct_float_sse2)
+%define _cpp_protection_jpeg_simd_cpu_support jpeg_simd_cpu_support
+%define _cpp_protection_jsimd_rgb_ycc_convert_mmx jsimd_rgb_ycc_convert_mmx
+%define _cpp_protection_jsimd_ycc_rgb_convert_mmx jsimd_ycc_rgb_convert_mmx
+%define _cpp_protection_jconst_rgb_ycc_convert_sse2 jconst_rgb_ycc_convert_sse2
+%define _cpp_protection_jsimd_rgb_ycc_convert_sse2 jsimd_rgb_ycc_convert_sse2
+%define _cpp_protection_jconst_ycc_rgb_convert_sse2 jconst_ycc_rgb_convert_sse2
+%define _cpp_protection_jsimd_ycc_rgb_convert_sse2 jsimd_ycc_rgb_convert_sse2
+%define _cpp_protection_jsimd_h2v2_downsample_mmx jsimd_h2v2_downsample_mmx
+%define _cpp_protection_jsimd_h2v1_downsample_mmx jsimd_h2v1_downsample_mmx
+%define _cpp_protection_jsimd_h2v2_downsample_sse2 jsimd_h2v2_downsample_sse2
+%define _cpp_protection_jsimd_h2v1_downsample_sse2 jsimd_h2v1_downsample_sse2
+%define _cpp_protection_jsimd_h2v2_upsample_mmx jsimd_h2v2_upsample_mmx
+%define _cpp_protection_jsimd_h2v1_upsample_mmx jsimd_h2v1_upsample_mmx
+%define _cpp_protection_jsimd_h2v1_fancy_upsample_mmx jsimd_h2v1_fancy_upsample_mmx
+%define _cpp_protection_jsimd_h2v2_fancy_upsample_mmx jsimd_h2v2_fancy_upsample_mmx
+%define _cpp_protection_jsimd_h2v1_merged_upsample_mmx jsimd_h2v1_merged_upsample_mmx
+%define _cpp_protection_jsimd_h2v2_merged_upsample_mmx jsimd_h2v2_merged_upsample_mmx
+%define _cpp_protection_jsimd_h2v2_upsample_sse2 jsimd_h2v2_upsample_sse2
+%define _cpp_protection_jsimd_h2v1_upsample_sse2 jsimd_h2v1_upsample_sse2
+%define _cpp_protection_jconst_fancy_upsample_sse2 jconst_fancy_upsample_sse2
+%define _cpp_protection_jsimd_h2v1_fancy_upsample_sse2 jsimd_h2v1_fancy_upsample_sse2
+%define _cpp_protection_jsimd_h2v2_fancy_upsample_sse2 jsimd_h2v2_fancy_upsample_sse2
+%define _cpp_protection_jconst_merged_upsample_sse2 jconst_merged_upsample_sse2
+%define _cpp_protection_jsimd_h2v1_merged_upsample_sse2 jsimd_h2v1_merged_upsample_sse2
+%define _cpp_protection_jsimd_h2v2_merged_upsample_sse2 jsimd_h2v2_merged_upsample_sse2
+%define _cpp_protection_jsimd_convsamp_mmx jsimd_convsamp_mmx
+%define _cpp_protection_jsimd_convsamp_sse2 jsimd_convsamp_sse2
+%define _cpp_protection_jsimd_convsamp_float_3dnow jsimd_convsamp_float_3dnow
+%define _cpp_protection_jsimd_convsamp_float_sse jsimd_convsamp_float_sse
+%define _cpp_protection_jsimd_convsamp_float_sse2 jsimd_convsamp_float_sse2
+%define _cpp_protection_jsimd_fdct_islow_mmx jsimd_fdct_islow_mmx
+%define _cpp_protection_jsimd_fdct_ifast_mmx jsimd_fdct_ifast_mmx
+%define _cpp_protection_jconst_fdct_islow_sse2 jconst_fdct_islow_sse2
+%define _cpp_protection_jsimd_fdct_islow_sse2 jsimd_fdct_islow_sse2
+%define _cpp_protection_jconst_fdct_ifast_sse2 jconst_fdct_ifast_sse2
+%define _cpp_protection_jsimd_fdct_ifast_sse2 jsimd_fdct_ifast_sse2
+%define _cpp_protection_jsimd_fdct_float_3dnow jsimd_fdct_float_3dnow
+%define _cpp_protection_jconst_fdct_float_sse jconst_fdct_float_sse
+%define _cpp_protection_jsimd_fdct_float_sse jsimd_fdct_float_sse
+%define _cpp_protection_jsimd_quantize_mmx jsimd_quantize_mmx
+%define _cpp_protection_jsimd_quantize_sse2 jsimd_quantize_sse2
+%define _cpp_protection_jsimd_quantize_float_3dnow jsimd_quantize_float_3dnow
+%define _cpp_protection_jsimd_quantize_float_sse jsimd_quantize_float_sse
+%define _cpp_protection_jsimd_quantize_float_sse2 jsimd_quantize_float_sse2
+%define _cpp_protection_jsimd_idct_2x2_mmx jsimd_idct_2x2_mmx
+%define _cpp_protection_jsimd_idct_4x4_mmx jsimd_idct_4x4_mmx
+%define _cpp_protection_jconst_idct_red_sse2 jconst_idct_red_sse2
+%define _cpp_protection_jsimd_idct_2x2_sse2 jsimd_idct_2x2_sse2
+%define _cpp_protection_jsimd_idct_4x4_sse2 jsimd_idct_4x4_sse2
+%define _cpp_protection_jsimd_idct_islow_mmx jsimd_idct_islow_mmx
+%define _cpp_protection_jsimd_idct_ifast_mmx jsimd_idct_ifast_mmx
+%define _cpp_protection_jconst_idct_islow_sse2 jconst_idct_islow_sse2
+%define _cpp_protection_jsimd_idct_islow_sse2 jsimd_idct_islow_sse2
+%define _cpp_protection_jconst_idct_ifast_sse2 jconst_idct_ifast_sse2
+%define _cpp_protection_jsimd_idct_ifast_sse2 jsimd_idct_ifast_sse2
+%define _cpp_protection_jsimd_idct_float_3dnow jsimd_idct_float_3dnow
+%define _cpp_protection_jconst_idct_float_sse jconst_idct_float_sse
+%define _cpp_protection_jsimd_idct_float_sse jsimd_idct_float_sse
+%define _cpp_protection_jconst_idct_float_sse2 jconst_idct_float_sse2
+%define _cpp_protection_jsimd_idct_float_sse2 jsimd_idct_float_sse2
 #endif /* NEED_SHORT_EXTERNAL_NAMES */
 

diff --git a/simd/jsimdext.inc b/simd/jsimdext.inc
index 3a3092e..abb6863 100644
--- a/simd/jsimdext.inc
+++ b/simd/jsimdext.inc

@@ -89,8 +89,6 @@
 %define SEG_CONST   .rodata progbits alloc noexec nowrite align=16
 %endif
 
-%define STRICT_MEMORY_ACCESS 1
-
 ; To make the code position-independent, append -DPIC to the commandline
 ;
 %define GOT_SYMBOL  _GLOBAL_OFFSET_TABLE_	; ELF supports PIC

diff --git a/tjbench.c b/tjbench.c
index e529ffc..87b462f 100644
--- a/tjbench.c
+++ b/tjbench.c

@@ -667,7 +667,7 @@
 {
 	int i;
 	printf("USAGE: %s\n", progname);
-	printf("       <Inputfile (BMP|PPM)> <%% Quality> [options]\n\n");
+	printf("       <Inputfile (BMP|PPM)> <Quality> [options]\n\n");
 	printf("       %s\n", progname);
 	printf("       <Inputfile (JPG)> [options]\n\n");
 	printf("Options:\n\n");
@@ -680,8 +680,12 @@
 	printf("     Force MMX, SSE, SSE2, or SSE3 code paths in the underlying codec\n");
 	printf("-rgb, -bgr, -rgbx, -bgrx, -xbgr, -xrgb =\n");
 	printf("     Test the specified color conversion path in the codec (default: BGR)\n");
-	printf("-fastupsample = Use fast, inaccurate upsampling code to perform 4:2:2 and 4:2:0\n");
-	printf("     YUV decoding\n");
+	printf("-fastupsample = Use the fastest chrominance upsampling algorithm available in\n");
+	printf("     the underlying codec\n");
+	printf("-fastdct = Use the fastest DCT/IDCT algorithms available in the underlying\n");
+	printf("     codec\n");
+	printf("-accuratedct = Use the most accurate DCT/IDCT algorithms available in the\n");
+	printf("     underlying codec\n");
 	printf("-quiet = Output results in tabular rather than verbose format\n");
 	printf("-yuvencode = Encode RGB input as planar YUV rather than compressing as JPEG\n");
 	printf("-yuvdecode = Decode JPEG image to planar YUV rather than RGB\n");
@@ -796,6 +800,16 @@
 				printf("Using fast upsampling code\n\n");
 				flags|=TJFLAG_FASTUPSAMPLE;
 			}
+			if(!strcasecmp(argv[i], "-fastdct"))
+			{
+				printf("Using fastest DCT/IDCT algorithm\n\n");
+				flags|=TJFLAG_FASTDCT;
+			}
+			if(!strcasecmp(argv[i], "-accuratedct"))
+			{
+				printf("Using most accurate DCT/IDCT algorithm\n\n");
+				flags|=TJFLAG_ACCURATEDCT;
+			}
 			if(!strcasecmp(argv[i], "-rgb")) pf=TJPF_RGB;
 			if(!strcasecmp(argv[i], "-rgbx")) pf=TJPF_RGBX;
 			if(!strcasecmp(argv[i], "-bgr")) pf=TJPF_BGR;

diff --git a/turbojpeg-jni.c b/turbojpeg-jni.c
index 1ff9bba..c98845b 100644
--- a/turbojpeg-jni.c
+++ b/turbojpeg-jni.c

@@ -1,5 +1,5 @@
 /*
- * Copyright (C)2011 D. R. Commander.  All Rights Reserved.
+ * Copyright (C)2011-2012 D. R. Commander.  All Rights Reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -350,12 +350,12 @@
 	return;
 }
 
-JNIEXPORT void JNICALL Java_org_libjpegturbo_turbojpeg_TJDecompressor_decompress___3BI_3BIIIII
+JNIEXPORT void JNICALL Java_org_libjpegturbo_turbojpeg_TJDecompressor_decompress___3BI_3BIIIIIII
 	(JNIEnv *env, jobject obj, jbyteArray src, jint jpegSize, jbyteArray dst,
-		jint width, jint pitch, jint height, jint pf, jint flags)
+		jint x, jint y, jint width, jint pitch, jint height, jint pf, jint flags)
 {
 	tjhandle handle=0;
-	jsize arraySize=0;
+	jsize arraySize=0, actualPitch;
 	unsigned char *jpegBuf=NULL, *dstBuf=NULL;
 
 	gethandle();
@@ -367,15 +367,68 @@
 
 	if((*env)->GetArrayLength(env, src)<jpegSize)
 		_throw("Source buffer is not large enough");
-	arraySize=(pitch==0)? width*tjPixelSize[pf]*height:pitch*height;
+	actualPitch=(pitch==0)? width*tjPixelSize[pf]:pitch;
+	arraySize=(y+height-1)*actualPitch + (x+width)*tjPixelSize[pf];
 	if((*env)->GetArrayLength(env, dst)<arraySize)
 		_throw("Destination buffer is not large enough");
 
 	bailif0(jpegBuf=(*env)->GetPrimitiveArrayCritical(env, src, 0));
 	bailif0(dstBuf=(*env)->GetPrimitiveArrayCritical(env, dst, 0));
 
-	if(tjDecompress2(handle, jpegBuf, (unsigned long)jpegSize, dstBuf, width,
-		pitch, height, pf, flags)==-1)
+	if(tjDecompress2(handle, jpegBuf, (unsigned long)jpegSize,
+		&dstBuf[y*actualPitch + x*tjPixelSize[pf]], width, pitch, height, pf,
+		flags)==-1)
+	{
+		(*env)->ReleasePrimitiveArrayCritical(env, dst, dstBuf, 0);
+		(*env)->ReleasePrimitiveArrayCritical(env, src, jpegBuf, 0);
+		dstBuf=jpegBuf=NULL;
+		_throw(tjGetErrorStr());
+	}
+
+	bailout:
+	if(dstBuf) (*env)->ReleasePrimitiveArrayCritical(env, dst, dstBuf, 0);
+	if(jpegBuf) (*env)->ReleasePrimitiveArrayCritical(env, src, jpegBuf, 0);
+	return;
+}
+
+JNIEXPORT void JNICALL Java_org_libjpegturbo_turbojpeg_TJDecompressor_decompress___3BI_3BIIIII
+	(JNIEnv *env, jobject obj, jbyteArray src, jint jpegSize, jbyteArray dst,
+		jint width, jint pitch, jint height, jint pf, jint flags)
+{	/* Forwards to the x/y overload with x=0, y=0 (output starts at dst[0]). */
+	Java_org_libjpegturbo_turbojpeg_TJDecompressor_decompress___3BI_3BIIIIIII
+		(env, obj, src, jpegSize, dst, 0, 0, width, pitch, height, pf, flags);
+}
+
+JNIEXPORT void JNICALL Java_org_libjpegturbo_turbojpeg_TJDecompressor_decompress___3BI_3IIIIIIII
+	(JNIEnv *env, jobject obj, jbyteArray src, jint jpegSize, jintArray dst,
+		jint x, jint y, jint width, jint stride, jint height, jint pf, jint flags)
+{
+	tjhandle handle=0;
+	jsize arraySize=0, actualStride;
+	unsigned char *jpegBuf=NULL, *dstBuf=NULL;
+
+	gethandle();
+
+	if(pf<0 || pf>=org_libjpegturbo_turbojpeg_TJ_NUMPF)
+		_throw("Invalid argument in decompress()");
+	if(org_libjpegturbo_turbojpeg_TJ_NUMPF!=TJ_NUMPF)
+		_throw("Mismatch between Java and C API");
+	if(tjPixelSize[pf]!=sizeof(jint))
+		_throw("Pixel format must be 32-bit when decompressing to an integer buffer.");
+
+	if((*env)->GetArrayLength(env, src)<jpegSize)
+		_throw("Source buffer is not large enough");
+	actualStride=(stride==0)? width:stride;
+	arraySize=(y+height-1)*actualStride + x+width;
+	if((*env)->GetArrayLength(env, dst)<arraySize)
+		_throw("Destination buffer is not large enough");
+
+	bailif0(jpegBuf=(*env)->GetPrimitiveArrayCritical(env, src, 0));
+	bailif0(dstBuf=(*env)->GetPrimitiveArrayCritical(env, dst, 0));
+
+	if(tjDecompress2(handle, jpegBuf, (unsigned long)jpegSize,
+		&dstBuf[(y*actualStride + x)*sizeof(int)], width, stride*sizeof(jint),
+		height, pf, flags)==-1)
 	{
 		(*env)->ReleasePrimitiveArrayCritical(env, dst, dstBuf, 0);
 		(*env)->ReleasePrimitiveArrayCritical(env, src, jpegBuf, 0);
@@ -391,43 +444,11 @@
 
 JNIEXPORT void JNICALL Java_org_libjpegturbo_turbojpeg_TJDecompressor_decompress___3BI_3IIIIII
 	(JNIEnv *env, jobject obj, jbyteArray src, jint jpegSize, jintArray dst,
-		jint width, jint pitch, jint height, jint pf, jint flags)
+		jint width, jint stride, jint height, jint pf, jint flags)
 {
-	tjhandle handle=0;
-	jsize arraySize=0;
-	unsigned char *jpegBuf=NULL, *dstBuf=NULL;
-
-	gethandle();
-
-	if(pf<0 || pf>=org_libjpegturbo_turbojpeg_TJ_NUMPF)
-		_throw("Invalid argument in decompress()");
-	if(org_libjpegturbo_turbojpeg_TJ_NUMPF!=TJ_NUMPF)
-		_throw("Mismatch between Java and C API");
-	if(tjPixelSize[pf]!=sizeof(jint))
-		_throw("Pixel format must be 32-bit when decompressing to an integer buffer.");
-
-	if((*env)->GetArrayLength(env, src)<jpegSize)
-		_throw("Source buffer is not large enough");
-	arraySize=(pitch==0)? width*height:pitch*height;
-	if((*env)->GetArrayLength(env, dst)<arraySize)
-		_throw("Destination buffer is not large enough");
-
-	bailif0(jpegBuf=(*env)->GetPrimitiveArrayCritical(env, src, 0));
-	bailif0(dstBuf=(*env)->GetPrimitiveArrayCritical(env, dst, 0));
-
-	if(tjDecompress2(handle, jpegBuf, (unsigned long)jpegSize, dstBuf, width,
-		pitch*sizeof(jint), height, pf, flags)==-1)
-	{
-		(*env)->ReleasePrimitiveArrayCritical(env, dst, dstBuf, 0);
-		(*env)->ReleasePrimitiveArrayCritical(env, src, jpegBuf, 0);
-		dstBuf=jpegBuf=NULL;
-		_throw(tjGetErrorStr());
-	}
-
-	bailout:
-	if(dstBuf) (*env)->ReleasePrimitiveArrayCritical(env, dst, dstBuf, 0);
-	if(jpegBuf) (*env)->ReleasePrimitiveArrayCritical(env, src, jpegBuf, 0);
-	return;
+	/* Forward to the x/y overload with x=0, y=0 (output starts at dst[0]). */
+	Java_org_libjpegturbo_turbojpeg_TJDecompressor_decompress___3BI_3IIIIIIII
+		(env, obj, src, jpegSize, dst, 0, 0, width, stride, height, pf, flags);
 }
 
 JNIEXPORT void JNICALL Java_org_libjpegturbo_turbojpeg_TJDecompressor_decompressToYUV

diff --git a/turbojpeg.c b/turbojpeg.c
index c875fd9..21599c8 100644
--- a/turbojpeg.c
+++ b/turbojpeg.c

@@ -145,7 +145,7 @@
 }
 
 static int setCompDefaults(struct jpeg_compress_struct *cinfo,
-	int pixelFormat, int subsamp, int jpegQual)
+	int pixelFormat, int subsamp, int jpegQual, int flags)
 {
 	int retval=0;
 
@@ -191,7 +191,7 @@
 	if(jpegQual>=0)
 	{
 		jpeg_set_quality(cinfo, jpegQual, TRUE);
-		if(jpegQual>=96) cinfo->dct_method=JDCT_ISLOW;
+		if(jpegQual>=96 || flags&TJFLAG_ACCURATEDCT) cinfo->dct_method=JDCT_ISLOW;
 		else cinfo->dct_method=JDCT_FASTEST;
 	}
 	if(subsamp==TJSAMP_GRAY)
@@ -210,7 +210,7 @@
 }
 
 static int setDecompDefaults(struct jpeg_decompress_struct *dinfo,
-	int pixelFormat)
+	int pixelFormat, int flags)
 {
 	int retval=0;
 
@@ -258,6 +258,8 @@
 			_throw("Unsupported pixel format");
 	}
 
+	if(flags&TJFLAG_FASTDCT) dinfo->dct_method=JDCT_FASTEST;
+
 	bailout:
 	return retval;
 }
@@ -618,7 +620,7 @@
 		alloc=0;  *jpegSize=tjBufSize(width, height, jpegSubsamp);
 	}
 	jpeg_mem_dest_tj(cinfo, jpegBuf, jpegSize, alloc);
-	if(setCompDefaults(cinfo, pixelFormat, jpegSubsamp, jpegQual)==-1)
+	if(setCompDefaults(cinfo, pixelFormat, jpegSubsamp, jpegQual, flags)==-1)
 		return -1;
 
 	jpeg_start_compress(cinfo, TRUE);
@@ -726,7 +728,7 @@
 
 	yuvsize=tjBufSizeYUV(width, height, subsamp);
 	jpeg_mem_dest_tj(cinfo, &dstBuf, &yuvsize, 0);
-	if(setCompDefaults(cinfo, pixelFormat, subsamp, -1)==-1) return -1;
+	if(setCompDefaults(cinfo, pixelFormat, subsamp, -1, flags)==-1) return -1;
 
 	jpeg_start_compress(cinfo, TRUE);
 	pw=PAD(width, cinfo->max_h_samp_factor);
@@ -955,7 +957,7 @@
 
 	jpeg_mem_src_tj(dinfo, jpegBuf, jpegSize);
 	jpeg_read_header(dinfo, TRUE);
-	if(setDecompDefaults(dinfo, pixelFormat)==-1)
+	if(setDecompDefaults(dinfo, pixelFormat, flags)==-1)
 	{
 		retval=-1;  goto bailout;
 	}
@@ -1110,6 +1112,7 @@
 	}
 
 	if(flags&TJFLAG_FASTUPSAMPLE) dinfo->do_fancy_upsampling=FALSE;
+	if(flags&TJFLAG_FASTDCT) dinfo->dct_method=JDCT_FASTEST;
 	dinfo->raw_data_out=TRUE;
 
 	jpeg_start_decompress(dinfo);

diff --git a/turbojpeg.h b/turbojpeg.h
index 343788a..7610221 100644
--- a/turbojpeg.h
+++ b/turbojpeg.h

@@ -1,5 +1,5 @@
 /*
- * Copyright (C)2009-2011 D. R. Commander.  All Rights Reserved.
+ * Copyright (C)2009-2012 D. R. Commander.  All Rights Reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -226,28 +226,30 @@
  */
 #define TJFLAG_BOTTOMUP        2
 /**
- * Turn off CPU auto-detection and force TurboJPEG to use MMX code (IPP and
- * 32-bit libjpeg-turbo versions only.)
+ * Turn off CPU auto-detection and force TurboJPEG to use MMX code (if the
+ * underlying codec supports it.)
  */
 #define TJFLAG_FORCEMMX        8
 /**
- * Turn off CPU auto-detection and force TurboJPEG to use SSE code (32-bit IPP
- * and 32-bit libjpeg-turbo versions only)
+ * Turn off CPU auto-detection and force TurboJPEG to use SSE code (if the
+ * underlying codec supports it.)
  */
 #define TJFLAG_FORCESSE       16
 /**
- * Turn off CPU auto-detection and force TurboJPEG to use SSE2 code (32-bit IPP
- * and 32-bit libjpeg-turbo versions only)
+ * Turn off CPU auto-detection and force TurboJPEG to use SSE2 code (if the
+ * underlying codec supports it.)
  */
 #define TJFLAG_FORCESSE2      32
 /**
- * Turn off CPU auto-detection and force TurboJPEG to use SSE3 code (64-bit IPP
- * version only)
+ * Turn off CPU auto-detection and force TurboJPEG to use SSE3 code (if the
+ * underlying codec supports it.)
  */
 #define TJFLAG_FORCESSE3     128
 /**
- * Use fast, inaccurate chrominance upsampling routines in the JPEG
- * decompressor (libjpeg and libjpeg-turbo versions only)
+ * When decompressing, use the fastest chrominance upsampling algorithm
+ * available in the underlying codec.  The default is to use smooth upsampling,
+ * which creates a smooth transition between neighboring chrominance components
+ * in order to reduce upsampling artifacts in the decompressed image.
  */
 #define TJFLAG_FASTUPSAMPLE  256
 /**
@@ -258,6 +260,24 @@
  * versions of TurboJPEG.
  */
 #define TJFLAG_NOREALLOC     1024
+/**
+ * Use the fastest DCT/IDCT algorithm available in the underlying codec.  The
+ * default if this flag is not specified is implementation-specific.  The
+ * libjpeg implementation, for example, uses the fast algorithm by default when
+ * compressing, because this has been shown to have only a very slight effect
+ * on accuracy, but it uses the accurate algorithm when decompressing, because
+ * this has been shown to have a larger effect.
+ */
+#define TJFLAG_FASTDCT       2048
+/**
+ * Use the most accurate DCT/IDCT algorithm available in the underlying codec.
+ * The default if this flag is not specified is implementation-specific.  The
+ * libjpeg implementation, for example, uses the fast algorithm by default when
+ * compressing, because this has been shown to have only a very slight effect
+ * on accuracy, but it uses the accurate algorithm when decompressing, because
+ * this has been shown to have a larger effect.
+ */
+#define TJFLAG_ACCURATEDCT   4096
 
 
 /**
commit	11e6ee95ca9a40fe6b86a1cd23a9fbfd7d19c2bd	[log] [tgz]
author	[email protected] <[email protected]@4ff67af0-8c30-449e-8e8b-ad334ec8d88c>	Thu Jul 19 06:04:44 2012 +0000
committer	[email protected] <[email protected]@4ff67af0-8c30-449e-8e8b-ad334ec8d88c>	Thu Jul 19 06:04:44 2012 +0000
tree	2eec0543260b716b3862c8c05100bcd7ab6f833c
parent	cd3e30f64064274b17a99bb93e20a7dad2703bf0 [diff]