__set_neon_cumulative_sat() modifies the contents of the QC flag, and
some intrinsics do so too: this patch adds an explicit dependency to
the asm statement, to prevent the compiler from reordering or removing it.

When writing QC, the asm statement now takes a fake input dependency:
the output of the intrinsic being tested. The
__set_neon_cumulative_sat macro has to be modified so that it can
accept all the possible input types.

Update the generic code in ref_v_binary_sat_op.c and ref_v_unary_sat_op.c
accordingly, as well as all the tests involving QC.
diff --git a/ref_vqshl.c b/ref_vqshl.c
index 84ca9a0..a9d29d7 100644
--- a/ref_vqshl.c
+++ b/ref_vqshl.c
@@ -40,15 +40,15 @@
 FNNAME (INSN)
 {
   /* Basic test: v3=vqshl(v1,v2), then store the result.  */
-#define TEST_VQSHL2(INSN, T3, Q, T1, T2, W, N)			\
-  Set_Neon_Cumulative_Sat(0);					\
-  VECT_VAR(vector_res, T1, W, N) =				\
-    INSN##Q##_##T2##W(VECT_VAR(vector, T1, W, N),		\
-		      VECT_VAR(vector_shift, T3, W, N));	\
-  vst1##Q##_##T2##W(VECT_VAR(result, T1, W, N),			\
-		    VECT_VAR(vector_res, T1, W, N));		\
-  dump_neon_cumulative_sat(TEST_MSG, xSTR(INSN##Q##_##T2##W),	\
-			   xSTR(T1), W, N)
+#define TEST_VQSHL2(INSN, T3, Q, T1, T2, W, N)				\
+  Set_Neon_Cumulative_Sat(0, VECT_VAR(vector_res, T1, W, N));		\
+  VECT_VAR(vector_res, T1, W, N) =					\
+    INSN##Q##_##T2##W(VECT_VAR(vector, T1, W, N),			\
+		      VECT_VAR(vector_shift, T3, W, N));		\
+  vst1##Q##_##T2##W(VECT_VAR(result, T1, W, N),				\
+		      VECT_VAR(vector_res, T1, W, N));			\
+  dump_neon_cumulative_sat(TEST_MSG, xSTR(INSN##Q##_##T2##W),		\
+			       xSTR(T1), W, N)
 
   /* Two auxliary macros are necessary to expand INSN */
 #define TEST_VQSHL1(INSN, T3, Q, T1, T2, W, N)	\