Update frameworks/rs for LLVM rebase to r233350

- Update 'load' to the new syntax in the textual IR.  These files were
  updated automatically using the script in
  http://reviews.llvm.org/D7649
- Update 'getelementptr' instructions to the new GEP syntax.  arch/*.ll
  were updated automatically using the script in
  http://reviews.llvm.org/D7636

Change-Id: Id198429579f22277108d5d55c34d4870359a6c18
diff --git a/driver/runtime/ll64/allocation.ll b/driver/runtime/ll64/allocation.ll
index 14f8e26..2c828ca 100644
--- a/driver/runtime/ll64/allocation.ll
+++ b/driver/runtime/ll64/allocation.ll
@@ -38,7 +38,7 @@
 
 define signext i8 @rsGetElementAtImpl_char(%struct.rs_allocation* nocapture readonly %a, i32 %x, i32 %y, i32 %z) #0 {
   %1 = tail call i8* @rsOffset(%struct.rs_allocation* %a, i32 1, i32 %x, i32 %y, i32 %z) #2
-  %2 = load i8* %1, align 1, !tbaa !21
+  %2 = load i8, i8* %1, align 1, !tbaa !21
   ret i8 %2
 }
 
@@ -54,7 +54,7 @@
 define <2 x i8> @rsGetElementAtImpl_char2(%struct.rs_allocation* nocapture readonly %a, i32 %x, i32 %y, i32 %z) #0 {
   %1 = tail call i8* @rsOffset(%struct.rs_allocation* %a, i32 2, i32 %x, i32 %y, i32 %z) #2
   %2 = bitcast i8* %1 to <2 x i8>*
-  %3 = load <2 x i8>* %2, align 2, !tbaa !22
+  %3 = load <2 x i8>, <2 x i8>* %2, align 2, !tbaa !22
   ret <2 x i8> %3
 }
 
@@ -71,7 +71,7 @@
 define <3 x i8> @rsGetElementAtImpl_char3(%struct.rs_allocation* nocapture readonly %a, i32 %x, i32 %y, i32 %z) #0 {
   %1 = tail call i8* @rsOffset(%struct.rs_allocation* %a, i32 4, i32 %x, i32 %y, i32 %z) #2
   %2 = bitcast i8* %1 to <4 x i8>*
-  %3 = load <4 x i8>* %2, align 4, !tbaa !23
+  %3 = load <4 x i8>, <4 x i8>* %2, align 4, !tbaa !23
   %4 = shufflevector <4 x i8> %3, <4 x i8> undef, <3 x i32> <i32 0, i32 1, i32 2>
   ret <3 x i8> %4
 }
@@ -87,7 +87,7 @@
 define <4 x i8> @rsGetElementAtImpl_char4(%struct.rs_allocation* nocapture readonly %a, i32 %x, i32 %y, i32 %z) #0 {
   %1 = tail call i8* @rsOffset(%struct.rs_allocation* %a, i32 4, i32 %x, i32 %y, i32 %z) #2
   %2 = bitcast i8* %1 to <4 x i8>*
-  %3 = load <4 x i8>* %2, align 4, !tbaa !24
+  %3 = load <4 x i8>, <4 x i8>* %2, align 4, !tbaa !24
   ret <4 x i8> %3
 }
 
@@ -100,7 +100,7 @@
 
 define zeroext i8 @rsGetElementAtImpl_uchar(%struct.rs_allocation* nocapture readonly %a, i32 %x, i32 %y, i32 %z) #0 {
   %1 = tail call i8* @rsOffset(%struct.rs_allocation* %a, i32 1, i32 %x, i32 %y, i32 %z) #2
-  %2 = load i8* %1, align 1, !tbaa !25
+  %2 = load i8, i8* %1, align 1, !tbaa !25
   ret i8 %2
 }
 
@@ -116,7 +116,7 @@
 define <2 x i8> @rsGetElementAtImpl_uchar2(%struct.rs_allocation* nocapture readonly %a, i32 %x, i32 %y, i32 %z) #0 {
   %1 = tail call i8* @rsOffset(%struct.rs_allocation* %a, i32 2, i32 %x, i32 %y, i32 %z) #2
   %2 = bitcast i8* %1 to <2 x i8>*
-  %3 = load <2 x i8>* %2, align 2, !tbaa !26
+  %3 = load <2 x i8>, <2 x i8>* %2, align 2, !tbaa !26
   ret <2 x i8> %3
 }
 
@@ -133,7 +133,7 @@
 define <3 x i8> @rsGetElementAtImpl_uchar3(%struct.rs_allocation* nocapture readonly %a, i32 %x, i32 %y, i32 %z) #0 {
   %1 = tail call i8* @rsOffset(%struct.rs_allocation* %a, i32 4, i32 %x, i32 %y, i32 %z) #2
   %2 = bitcast i8* %1 to <4 x i8>*
-  %3 = load <4 x i8>* %2, align 4, !tbaa !27
+  %3 = load <4 x i8>, <4 x i8>* %2, align 4, !tbaa !27
   %4 = shufflevector <4 x i8> %3, <4 x i8> undef, <3 x i32> <i32 0, i32 1, i32 2>
   ret <3 x i8> %4
 }
@@ -149,7 +149,7 @@
 define <4 x i8> @rsGetElementAtImpl_uchar4(%struct.rs_allocation* nocapture readonly %a, i32 %x, i32 %y, i32 %z) #0 {
   %1 = tail call i8* @rsOffset(%struct.rs_allocation* %a, i32 4, i32 %x, i32 %y, i32 %z) #2
   %2 = bitcast i8* %1 to <4 x i8>*
-  %3 = load <4 x i8>* %2, align 4, !tbaa !28
+  %3 = load <4 x i8>, <4 x i8>* %2, align 4, !tbaa !28
   ret <4 x i8> %3
 }
 
@@ -164,7 +164,7 @@
 define signext i16 @rsGetElementAtImpl_short(%struct.rs_allocation* nocapture readonly %a, i32 %x, i32 %y, i32 %z) #0 {
   %1 = tail call i8* @rsOffset(%struct.rs_allocation* %a, i32 2, i32 %x, i32 %y, i32 %z) #2
   %2 = bitcast i8* %1 to i16*
-  %3 = load i16* %2, align 2, !tbaa !29
+  %3 = load i16, i16* %2, align 2, !tbaa !29
   ret i16 %3
 }
 
@@ -179,7 +179,7 @@
 define <2 x i16> @rsGetElementAtImpl_short2(%struct.rs_allocation* nocapture readonly %a, i32 %x, i32 %y, i32 %z) #0 {
   %1 = tail call i8* @rsOffset(%struct.rs_allocation* %a, i32 4, i32 %x, i32 %y, i32 %z) #2
   %2 = bitcast i8* %1 to <2 x i16>*
-  %3 = load <2 x i16>* %2, align 4, !tbaa !30
+  %3 = load <2 x i16>, <2 x i16>* %2, align 4, !tbaa !30
   ret <2 x i16> %3
 }
 
@@ -196,7 +196,7 @@
 define <3 x i16> @rsGetElementAtImpl_short3(%struct.rs_allocation* nocapture readonly %a, i32 %x, i32 %y, i32 %z) #0 {
   %1 = tail call i8* @rsOffset(%struct.rs_allocation* %a, i32 8, i32 %x, i32 %y, i32 %z) #2
   %2 = bitcast i8* %1 to <4 x i16>*
-  %3 = load <4 x i16>* %2, align 8, !tbaa !31
+  %3 = load <4 x i16>, <4 x i16>* %2, align 8, !tbaa !31
   %4 = shufflevector <4 x i16> %3, <4 x i16> undef, <3 x i32> <i32 0, i32 1, i32 2>
   ret <3 x i16> %4
 }
@@ -212,7 +212,7 @@
 define <4 x i16> @rsGetElementAtImpl_short4(%struct.rs_allocation* nocapture readonly %a, i32 %x, i32 %y, i32 %z) #0 {
   %1 = tail call i8* @rsOffset(%struct.rs_allocation* %a, i32 8, i32 %x, i32 %y, i32 %z) #2
   %2 = bitcast i8* %1 to <4 x i16>*
-  %3 = load <4 x i16>* %2, align 8, !tbaa !32
+  %3 = load <4 x i16>, <4 x i16>* %2, align 8, !tbaa !32
   ret <4 x i16> %3
 }
 
@@ -227,7 +227,7 @@
 define zeroext i16 @rsGetElementAtImpl_ushort(%struct.rs_allocation* nocapture readonly %a, i32 %x, i32 %y, i32 %z) #0 {
   %1 = tail call i8* @rsOffset(%struct.rs_allocation* %a, i32 2, i32 %x, i32 %y, i32 %z) #2
   %2 = bitcast i8* %1 to i16*
-  %3 = load i16* %2, align 2, !tbaa !33
+  %3 = load i16, i16* %2, align 2, !tbaa !33
   ret i16 %3
 }
 
@@ -242,7 +242,7 @@
 define <2 x i16> @rsGetElementAtImpl_ushort2(%struct.rs_allocation* nocapture readonly %a, i32 %x, i32 %y, i32 %z) #0 {
   %1 = tail call i8* @rsOffset(%struct.rs_allocation* %a, i32 4, i32 %x, i32 %y, i32 %z) #2
   %2 = bitcast i8* %1 to <2 x i16>*
-  %3 = load <2 x i16>* %2, align 4, !tbaa !34
+  %3 = load <2 x i16>, <2 x i16>* %2, align 4, !tbaa !34
   ret <2 x i16> %3
 }
 
@@ -259,7 +259,7 @@
 define <3 x i16> @rsGetElementAtImpl_ushort3(%struct.rs_allocation* nocapture readonly %a, i32 %x, i32 %y, i32 %z) #0 {
   %1 = tail call i8* @rsOffset(%struct.rs_allocation* %a, i32 8, i32 %x, i32 %y, i32 %z) #2
   %2 = bitcast i8* %1 to <4 x i16>*
-  %3 = load <4 x i16>* %2, align 8, !tbaa !35
+  %3 = load <4 x i16>, <4 x i16>* %2, align 8, !tbaa !35
   %4 = shufflevector <4 x i16> %3, <4 x i16> undef, <3 x i32> <i32 0, i32 1, i32 2>
   ret <3 x i16> %4
 }
@@ -275,7 +275,7 @@
 define <4 x i16> @rsGetElementAtImpl_ushort4(%struct.rs_allocation* nocapture readonly %a, i32 %x, i32 %y, i32 %z) #0 {
   %1 = tail call i8* @rsOffset(%struct.rs_allocation* %a, i32 8, i32 %x, i32 %y, i32 %z) #2
   %2 = bitcast i8* %1 to <4 x i16>*
-  %3 = load <4 x i16>* %2, align 8, !tbaa !36
+  %3 = load <4 x i16>, <4 x i16>* %2, align 8, !tbaa !36
   ret <4 x i16> %3
 }
 
@@ -290,7 +290,7 @@
 define i32 @rsGetElementAtImpl_int(%struct.rs_allocation* nocapture readonly %a, i32 %x, i32 %y, i32 %z) #0 {
   %1 = tail call i8* @rsOffset(%struct.rs_allocation* %a, i32 4, i32 %x, i32 %y, i32 %z) #2
   %2 = bitcast i8* %1 to i32*
-  %3 = load i32* %2, align 4, !tbaa !37
+  %3 = load i32, i32* %2, align 4, !tbaa !37
   ret i32 %3
 }
 
@@ -305,7 +305,7 @@
 define <2 x i32> @rsGetElementAtImpl_int2(%struct.rs_allocation* nocapture readonly %a, i32 %x, i32 %y, i32 %z) #0 {
   %1 = tail call i8* @rsOffset(%struct.rs_allocation* %a, i32 8, i32 %x, i32 %y, i32 %z) #2
   %2 = bitcast i8* %1 to <2 x i32>*
-  %3 = load <2 x i32>* %2, align 8, !tbaa !38
+  %3 = load <2 x i32>, <2 x i32>* %2, align 8, !tbaa !38
   ret <2 x i32> %3
 }
 
@@ -321,7 +321,7 @@
 define <3 x i32> @rsGetElementAtImpl_int3(%struct.rs_allocation* nocapture readonly %a, i32 %x, i32 %y, i32 %z) #0 {
   %1 = tail call i8* @rsOffset(%struct.rs_allocation* %a, i32 16, i32 %x, i32 %y, i32 %z) #2
   %2 = bitcast i8* %1 to <4 x i32>*
-  %3 = load <4 x i32>* %2, align 8, !tbaa !39
+  %3 = load <4 x i32>, <4 x i32>* %2, align 8, !tbaa !39
   %4 = shufflevector <4 x i32> %3, <4 x i32> undef, <3 x i32> <i32 0, i32 1, i32 2>
   ret <3 x i32> %4
 }
@@ -337,7 +337,7 @@
 define <4 x i32> @rsGetElementAtImpl_int4(%struct.rs_allocation* nocapture readonly %a, i32 %x, i32 %y, i32 %z) #0 {
   %1 = tail call i8* @rsOffset(%struct.rs_allocation* %a, i32 16, i32 %x, i32 %y, i32 %z) #2
   %2 = bitcast i8* %1 to <4 x i32>*
-  %3 = load <4 x i32>* %2, align 16, !tbaa !40
+  %3 = load <4 x i32>, <4 x i32>* %2, align 16, !tbaa !40
   ret <4 x i32> %3
 }
 
@@ -352,7 +352,7 @@
 define i32 @rsGetElementAtImpl_uint(%struct.rs_allocation* nocapture readonly %a, i32 %x, i32 %y, i32 %z) #0 {
   %1 = tail call i8* @rsOffset(%struct.rs_allocation* %a, i32 4, i32 %x, i32 %y, i32 %z) #2
   %2 = bitcast i8* %1 to i32*
-  %3 = load i32* %2, align 4, !tbaa !41
+  %3 = load i32, i32* %2, align 4, !tbaa !41
   ret i32 %3
 }
 
@@ -367,7 +367,7 @@
 define <2 x i32> @rsGetElementAtImpl_uint2(%struct.rs_allocation* nocapture readonly %a, i32 %x, i32 %y, i32 %z) #0 {
   %1 = tail call i8* @rsOffset(%struct.rs_allocation* %a, i32 8, i32 %x, i32 %y, i32 %z) #2
   %2 = bitcast i8* %1 to <2 x i32>*
-  %3 = load <2 x i32>* %2, align 8, !tbaa !42
+  %3 = load <2 x i32>, <2 x i32>* %2, align 8, !tbaa !42
   ret <2 x i32> %3
 }
 
@@ -383,7 +383,7 @@
 define <3 x i32> @rsGetElementAtImpl_uint3(%struct.rs_allocation* nocapture readonly %a, i32 %x, i32 %y, i32 %z) #0 {
   %1 = tail call i8* @rsOffset(%struct.rs_allocation* %a, i32 16, i32 %x, i32 %y, i32 %z) #2
   %2 = bitcast i8* %1 to <4 x i32>*
-  %3 = load <4 x i32>* %2, align 8, !tbaa !43
+  %3 = load <4 x i32>, <4 x i32>* %2, align 8, !tbaa !43
   %4 = shufflevector <4 x i32> %3, <4 x i32> undef, <3 x i32> <i32 0, i32 1, i32 2>
   ret <3 x i32> %4
 }
@@ -399,7 +399,7 @@
 define <4 x i32> @rsGetElementAtImpl_uint4(%struct.rs_allocation* nocapture readonly %a, i32 %x, i32 %y, i32 %z) #0 {
   %1 = tail call i8* @rsOffset(%struct.rs_allocation* %a, i32 16, i32 %x, i32 %y, i32 %z) #2
   %2 = bitcast i8* %1 to <4 x i32>*
-  %3 = load <4 x i32>* %2, align 16, !tbaa !44
+  %3 = load <4 x i32>, <4 x i32>* %2, align 16, !tbaa !44
   ret <4 x i32> %3
 }
 
@@ -414,7 +414,7 @@
 define i64 @rsGetElementAtImpl_long(%struct.rs_allocation* nocapture readonly %a, i32 %x, i32 %y, i32 %z) #0 {
   %1 = tail call i8* @rsOffset(%struct.rs_allocation* %a, i32 8, i32 %x, i32 %y, i32 %z) #2
   %2 = bitcast i8* %1 to i64*
-  %3 = load i64* %2, align 8, !tbaa !45
+  %3 = load i64, i64* %2, align 8, !tbaa !45
   ret i64 %3
 }
 
@@ -429,14 +429,14 @@
 define <2 x i64> @rsGetElementAtImpl_long2(%struct.rs_allocation* nocapture readonly %a, i32 %x, i32 %y, i32 %z) #0 {
   %1 = tail call i8* @rsOffset(%struct.rs_allocation* %a, i32 16, i32 %x, i32 %y, i32 %z) #2
   %2 = bitcast i8* %1 to <2 x i64>*
-  %3 = load <2 x i64>* %2, align 16, !tbaa !46
+  %3 = load <2 x i64>, <2 x i64>* %2, align 16, !tbaa !46
   ret <2 x i64> %3
 }
 
 !47 = !{!"long3", !15}
 define void @rsSetElementAtImpl_long3(%struct.rs_allocation* nocapture readonly %a, <3 x i64>* %val, i32 %x, i32 %y, i32 %z) #1 {
   %1 = tail call i8* @rsOffset(%struct.rs_allocation* %a, i32 32, i32 %x, i32 %y, i32 %z) #2
-  %2 = load <3 x i64>* %val
+  %2 = load <3 x i64>, <3 x i64>* %val
   %3 = shufflevector <3 x i64> %2, <3 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 undef>
   %4 = bitcast i8* %1 to <4 x i64>*
   store <4 x i64> %3, <4 x i64>* %4, align 32, !tbaa !47
@@ -446,7 +446,7 @@
 define void @rsGetElementAtImpl_long3(<3 x i64>* noalias nocapture sret %agg.result, %struct.rs_allocation* nocapture readonly %a, i32 %x, i32 %y, i32 %z) #1 {
   %1 = tail call i8* @rsOffset(%struct.rs_allocation* %a, i32 32, i32 %x, i32 %y, i32 %z) #2
   %2 = bitcast i8* %1 to <4 x i64>*
-  %3 = load <4 x i64>* %2, align 32
+  %3 = load <4 x i64>, <4 x i64>* %2, align 32
   %4 = bitcast <3 x i64>* %agg.result to <4 x i64>*
   store <4 x i64> %3, <4 x i64>* %4, align 32, !tbaa !47
   ret void
@@ -455,7 +455,7 @@
 !48 = !{!"long4", !15}
 define void @rsSetElementAtImpl_long4(%struct.rs_allocation* nocapture readonly %a, <4 x i64>* %val, i32 %x, i32 %y, i32 %z) #1 {
   %1 = tail call i8* @rsOffset(%struct.rs_allocation* %a, i32 32, i32 %x, i32 %y, i32 %z) #2
-  %2 = load <4 x i64>* %val
+  %2 = load <4 x i64>, <4 x i64>* %val
   %3 = bitcast i8* %1 to <4 x i64>*
   store <4 x i64> %2, <4 x i64>* %3, align 32, !tbaa !48
   ret void
@@ -464,7 +464,7 @@
 define void @rsGetElementAtImpl_long4(<4 x i64>* noalias nocapture sret %agg.result, %struct.rs_allocation* nocapture readonly %a, i32 %x, i32 %y, i32 %z) #1 {
   %1 = tail call i8* @rsOffset(%struct.rs_allocation* %a, i32 32, i32 %x, i32 %y, i32 %z) #2
   %2 = bitcast i8* %1 to <4 x i64>*
-  %3 = load <4 x i64>* %2, align 32, !tbaa !15
+  %3 = load <4 x i64>, <4 x i64>* %2, align 32, !tbaa !15
   store <4 x i64> %3, <4 x i64>* %agg.result, align 32, !tbaa !48
   ret void
 }
@@ -480,7 +480,7 @@
 define i64 @rsGetElementAtImpl_ulong(%struct.rs_allocation* nocapture readonly %a, i32 %x, i32 %y, i32 %z) #0 {
   %1 = tail call i8* @rsOffset(%struct.rs_allocation* %a, i32 8, i32 %x, i32 %y, i32 %z) #2
   %2 = bitcast i8* %1 to i64*
-  %3 = load i64* %2, align 8, !tbaa !49
+  %3 = load i64, i64* %2, align 8, !tbaa !49
   ret i64 %3
 }
 
@@ -495,14 +495,14 @@
 define <2 x i64> @rsGetElementAtImpl_ulong2(%struct.rs_allocation* nocapture readonly %a, i32 %x, i32 %y, i32 %z) #0 {
   %1 = tail call i8* @rsOffset(%struct.rs_allocation* %a, i32 16, i32 %x, i32 %y, i32 %z) #2
   %2 = bitcast i8* %1 to <2 x i64>*
-  %3 = load <2 x i64>* %2, align 16, !tbaa !50
+  %3 = load <2 x i64>, <2 x i64>* %2, align 16, !tbaa !50
   ret <2 x i64> %3
 }
 
 !51 = !{!"ulong3", !15}
 define void @rsSetElementAtImpl_ulong3(%struct.rs_allocation* nocapture readonly %a, <3 x i64>* %val, i32 %x, i32 %y, i32 %z) #1 {
   %1 = tail call i8* @rsOffset(%struct.rs_allocation* %a, i32 32, i32 %x, i32 %y, i32 %z) #2
-  %2 = load <3 x i64>* %val
+  %2 = load <3 x i64>, <3 x i64>* %val
   %3 = shufflevector <3 x i64> %2, <3 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 undef>
   %4 = bitcast i8* %1 to <4 x i64>*
   store <4 x i64> %3, <4 x i64>* %4, align 32, !tbaa !47
@@ -512,7 +512,7 @@
 define void @rsGetElementAtImpl_ulong3(<3 x i64>* noalias nocapture sret %agg.result, %struct.rs_allocation* nocapture readonly %a, i32 %x, i32 %y, i32 %z) #1 {
   %1 = tail call i8* @rsOffset(%struct.rs_allocation* %a, i32 32, i32 %x, i32 %y, i32 %z) #2
   %2 = bitcast i8* %1 to <4 x i64>*
-  %3 = load <4 x i64>* %2, align 32
+  %3 = load <4 x i64>, <4 x i64>* %2, align 32
   %4 = bitcast <3 x i64>* %agg.result to <4 x i64>*
   store <4 x i64> %3, <4 x i64>* %4, align 32, !tbaa !51
   ret void
@@ -521,7 +521,7 @@
 !52 = !{!"ulong4", !15}
 define void @rsSetElementAtImpl_ulong4(%struct.rs_allocation* nocapture readonly %a, <4 x i64>* %val, i32 %x, i32 %y, i32 %z) #1 {
   %1 = tail call i8* @rsOffset(%struct.rs_allocation* %a, i32 32, i32 %x, i32 %y, i32 %z) #2
-  %2 = load <4 x i64>* %val
+  %2 = load <4 x i64>, <4 x i64>* %val
   %3 = bitcast i8* %1 to <4 x i64>*
   store <4 x i64> %2, <4 x i64>* %3, align 32, !tbaa !52
   ret void
@@ -530,7 +530,7 @@
 define void @rsGetElementAtImpl_ulong4(<4 x i64>* noalias nocapture sret %agg.result, %struct.rs_allocation* nocapture readonly %a, i32 %x, i32 %y, i32 %z) #1 {
   %1 = tail call i8* @rsOffset(%struct.rs_allocation* %a, i32 32, i32 %x, i32 %y, i32 %z) #2
   %2 = bitcast i8* %1 to <4 x i64>*
-  %3 = load <4 x i64>* %2, align 32, !tbaa !15
+  %3 = load <4 x i64>, <4 x i64>* %2, align 32, !tbaa !15
   store <4 x i64> %3, <4 x i64>* %agg.result, align 32, !tbaa !52
   ret void
 }
@@ -546,7 +546,7 @@
 define float @rsGetElementAtImpl_float(%struct.rs_allocation* nocapture readonly %a, i32 %x, i32 %y, i32 %z) #0 {
   %1 = tail call i8* @rsOffset(%struct.rs_allocation* %a, i32 4, i32 %x, i32 %y, i32 %z) #2
   %2 = bitcast i8* %1 to float*
-  %3 = load float* %2, align 4, !tbaa !53
+  %3 = load float, float* %2, align 4, !tbaa !53
   ret float %3
 }
 
@@ -561,7 +561,7 @@
 define <2 x float> @rsGetElementAtImpl_float2(%struct.rs_allocation* nocapture readonly %a, i32 %x, i32 %y, i32 %z) #0 {
   %1 = tail call i8* @rsOffset(%struct.rs_allocation* %a, i32 8, i32 %x, i32 %y, i32 %z) #2
   %2 = bitcast i8* %1 to <2 x float>*
-  %3 = load <2 x float>* %2, align 8, !tbaa !54
+  %3 = load <2 x float>, <2 x float>* %2, align 8, !tbaa !54
   ret <2 x float> %3
 }
 
@@ -578,7 +578,7 @@
 define <3 x float> @rsGetElementAtImpl_float3(%struct.rs_allocation* nocapture readonly %a, i32 %x, i32 %y, i32 %z) #0 {
   %1 = tail call i8* @rsOffset(%struct.rs_allocation* %a, i32 16, i32 %x, i32 %y, i32 %z) #2
   %2 = bitcast i8* %1 to <4 x float>*
-  %3 = load <4 x float>* %2, align 8, !tbaa !55
+  %3 = load <4 x float>, <4 x float>* %2, align 8, !tbaa !55
   %4 = shufflevector <4 x float> %3, <4 x float> undef, <3 x i32> <i32 0, i32 1, i32 2>
   ret <3 x float> %4
 }
@@ -594,7 +594,7 @@
 define <4 x float> @rsGetElementAtImpl_float4(%struct.rs_allocation* nocapture readonly %a, i32 %x, i32 %y, i32 %z) #0 {
   %1 = tail call i8* @rsOffset(%struct.rs_allocation* %a, i32 16, i32 %x, i32 %y, i32 %z) #2
   %2 = bitcast i8* %1 to <4 x float>*
-  %3 = load <4 x float>* %2, align 16, !tbaa !56
+  %3 = load <4 x float>, <4 x float>* %2, align 16, !tbaa !56
   ret <4 x float> %3
 }
 
@@ -609,7 +609,7 @@
 define double @rsGetElementAtImpl_double(%struct.rs_allocation* nocapture readonly %a, i32 %x, i32 %y, i32 %z) #0 {
   %1 = tail call i8* @rsOffset(%struct.rs_allocation* %a, i32 8, i32 %x, i32 %y, i32 %z) #2
   %2 = bitcast i8* %1 to double*
-  %3 = load double* %2, align 8, !tbaa !57
+  %3 = load double, double* %2, align 8, !tbaa !57
   ret double %3
 }
 
@@ -624,14 +624,14 @@
 define <2 x double> @rsGetElementAtImpl_double2(%struct.rs_allocation* nocapture readonly %a, i32 %x, i32 %y, i32 %z) #0 {
   %1 = tail call i8* @rsOffset(%struct.rs_allocation* %a, i32 16, i32 %x, i32 %y, i32 %z) #2
   %2 = bitcast i8* %1 to <2 x double>*
-  %3 = load <2 x double>* %2, align 16, !tbaa !58
+  %3 = load <2 x double>, <2 x double>* %2, align 16, !tbaa !58
   ret <2 x double> %3
 }
 
 !59 = !{!"double3", !15}
 define void @rsSetElementAtImpl_double3(%struct.rs_allocation* nocapture readonly %a, <3 x double>* %val, i32 %x, i32 %y, i32 %z) #1 {
   %1 = tail call i8* @rsOffset(%struct.rs_allocation* %a, i32 32, i32 %x, i32 %y, i32 %z) #2
-  %2 = load <3 x double>* %val
+  %2 = load <3 x double>, <3 x double>* %val
   %3 = shufflevector <3 x double> %2, <3 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 undef>
   %4 = bitcast i8* %1 to <4 x double>*
   store <4 x double> %3, <4 x double>* %4, align 32, !tbaa !47
@@ -642,7 +642,7 @@
 define void @rsGetElementAtImpl_double3(<3 x double>* noalias nocapture sret %agg.result, %struct.rs_allocation* nocapture readonly %a, i32 %x, i32 %y, i32 %z) #1 {
   %1 = tail call i8* @rsOffset(%struct.rs_allocation* %a, i32 32, i32 %x, i32 %y, i32 %z) #2
   %2 = bitcast i8* %1 to <4 x double>*
-  %3 = load <4 x double>* %2, align 32
+  %3 = load <4 x double>, <4 x double>* %2, align 32
   %4 = bitcast <3 x double>* %agg.result to <4 x double>*
   store <4 x double> %3, <4 x double>* %4, align 32, !tbaa !59
   ret void
@@ -651,7 +651,7 @@
 !60 = !{!"double4", !15}
 define void @rsSetElementAtImpl_double4(%struct.rs_allocation* nocapture readonly %a, <4 x double>* %val, i32 %x, i32 %y, i32 %z) #1 {
   %1 = tail call i8* @rsOffset(%struct.rs_allocation* %a, i32 32, i32 %x, i32 %y, i32 %z) #2
-  %2 = load <4 x double>* %val
+  %2 = load <4 x double>, <4 x double>* %val
   %3 = bitcast i8* %1 to <4 x double>*
   store <4 x double> %2, <4 x double>* %3, align 32, !tbaa !60
   ret void
@@ -659,7 +659,7 @@
 define void @rsGetElementAtImpl_double4(<4 x double>* noalias nocapture sret %agg.result, %struct.rs_allocation* nocapture readonly %a, i32 %x, i32 %y, i32 %z) #1 {
   %1 = tail call i8* @rsOffset(%struct.rs_allocation* %a, i32 32, i32 %x, i32 %y, i32 %z) #2
   %2 = bitcast i8* %1 to <4 x double>*
-  %3 = load <4 x double>* %2, align 32, !tbaa !15
+  %3 = load <4 x double>, <4 x double>* %2, align 32, !tbaa !15
   store <4 x double> %3, <4 x double>* %agg.result, align 32, !tbaa !60
   ret void
 }
@@ -668,210 +668,210 @@
 define void @__rsAllocationVLoadXImpl_long4(<4 x i64>* noalias nocapture sret %agg.result, %struct.rs_allocation* nocapture readonly %a, i32 %x, i32 %y, i32 %z) #1 {
   %1 = tail call i8* @rsOffsetNs(%struct.rs_allocation* %a, i32 %x, i32 %y, i32 %z) #2
   %2 = bitcast i8* %1 to <4 x i64>*
-  %3 = load <4 x i64>* %2, align 8
+  %3 = load <4 x i64>, <4 x i64>* %2, align 8
   store <4 x i64> %3, <4 x i64>* %agg.result
   ret void
 }
 define void @__rsAllocationVLoadXImpl_long3(<3 x i64>* noalias nocapture sret %agg.result, %struct.rs_allocation* nocapture readonly %a, i32 %x, i32 %y, i32 %z) #1 {
   %1 = tail call i8* @rsOffsetNs(%struct.rs_allocation* %a, i32 %x, i32 %y, i32 %z) #2
   %2 = bitcast i8* %1 to <3 x i64>*
-  %3 = load <3 x i64>* %2, align 8
+  %3 = load <3 x i64>, <3 x i64>* %2, align 8
   store <3 x i64> %3, <3 x i64>* %agg.result
   ret void
 }
 define <2 x i64> @__rsAllocationVLoadXImpl_long2(%struct.rs_allocation* nocapture readonly %a, i32 %x, i32 %y, i32 %z) #0 {
   %1 = tail call i8* @rsOffsetNs(%struct.rs_allocation* %a, i32 %x, i32 %y, i32 %z) #2
   %2 = bitcast i8* %1 to <2 x i64>*
-  %3 = load <2 x i64>* %2, align 8
+  %3 = load <2 x i64>, <2 x i64>* %2, align 8
   ret <2 x i64> %3
 }
 
 define void @__rsAllocationVLoadXImpl_ulong4(<4 x i64>* noalias nocapture sret %agg.result, %struct.rs_allocation* nocapture readonly %a, i32 %x, i32 %y, i32 %z) #1 {
   %1 = tail call i8* @rsOffsetNs(%struct.rs_allocation* %a, i32 %x, i32 %y, i32 %z) #2
   %2 = bitcast i8* %1 to <4 x i64>*
-  %3 = load <4 x i64>* %2, align 8
+  %3 = load <4 x i64>, <4 x i64>* %2, align 8
   store <4 x i64> %3, <4 x i64>* %agg.result
   ret void
 }
 define void @__rsAllocationVLoadXImpl_ulong3(<3 x i64>* noalias nocapture sret %agg.result, %struct.rs_allocation* nocapture readonly %a, i32 %x, i32 %y, i32 %z) #1 {
   %1 = tail call i8* @rsOffsetNs(%struct.rs_allocation* %a, i32 %x, i32 %y, i32 %z) #2
   %2 = bitcast i8* %1 to <3 x i64>*
-  %3 = load <3 x i64>* %2, align 8
+  %3 = load <3 x i64>, <3 x i64>* %2, align 8
   store <3 x i64> %3, <3 x i64>* %agg.result
   ret void
 }
 define <2 x i64> @__rsAllocationVLoadXImpl_ulong2(%struct.rs_allocation* nocapture readonly %a, i32 %x, i32 %y, i32 %z) #0 {
   %1 = tail call i8* @rsOffsetNs(%struct.rs_allocation* %a, i32 %x, i32 %y, i32 %z) #2
   %2 = bitcast i8* %1 to <2 x i64>*
-  %3 = load <2 x i64>* %2, align 8
+  %3 = load <2 x i64>, <2 x i64>* %2, align 8
   ret <2 x i64> %3
 }
 
 define <4 x i32> @__rsAllocationVLoadXImpl_int4(%struct.rs_allocation* nocapture readonly %a, i32 %x, i32 %y, i32 %z) #0 {
   %1 = tail call i8* @rsOffsetNs(%struct.rs_allocation* %a, i32 %x, i32 %y, i32 %z) #2
   %2 = bitcast i8* %1 to <4 x i32>*
-  %3 = load <4 x i32>* %2, align 4
+  %3 = load <4 x i32>, <4 x i32>* %2, align 4
   ret <4 x i32> %3
 }
 define <3 x i32> @__rsAllocationVLoadXImpl_int3(%struct.rs_allocation* nocapture readonly %a, i32 %x, i32 %y, i32 %z) #0 {
   %1 = tail call i8* @rsOffsetNs(%struct.rs_allocation* %a, i32 %x, i32 %y, i32 %z) #2
   %2 = bitcast i8* %1 to <3 x i32>*
-  %3 = load <3 x i32>* %2, align 4
+  %3 = load <3 x i32>, <3 x i32>* %2, align 4
   ret <3 x i32> %3
 }
 define <2 x i32> @__rsAllocationVLoadXImpl_int2(%struct.rs_allocation* nocapture readonly %a, i32 %x, i32 %y, i32 %z) #0 {
   %1 = tail call i8* @rsOffsetNs(%struct.rs_allocation* %a, i32 %x, i32 %y, i32 %z) #2
   %2 = bitcast i8* %1 to <2 x i32>*
-  %3 = load <2 x i32>* %2, align 4
+  %3 = load <2 x i32>, <2 x i32>* %2, align 4
   ret <2 x i32> %3
 }
 
 define <4 x i32> @__rsAllocationVLoadXImpl_uint4(%struct.rs_allocation* nocapture readonly %a, i32 %x, i32 %y, i32 %z) #0 {
   %1 = tail call i8* @rsOffsetNs(%struct.rs_allocation* %a, i32 %x, i32 %y, i32 %z) #2
   %2 = bitcast i8* %1 to <4 x i32>*
-  %3 = load <4 x i32>* %2, align 4
+  %3 = load <4 x i32>, <4 x i32>* %2, align 4
   ret <4 x i32> %3
 }
 define <3 x i32> @__rsAllocationVLoadXImpl_uint3(%struct.rs_allocation* nocapture readonly %a, i32 %x, i32 %y, i32 %z) #0 {
   %1 = tail call i8* @rsOffsetNs(%struct.rs_allocation* %a, i32 %x, i32 %y, i32 %z) #2
   %2 = bitcast i8* %1 to <3 x i32>*
-  %3 = load <3 x i32>* %2, align 4
+  %3 = load <3 x i32>, <3 x i32>* %2, align 4
   ret <3 x i32> %3
 }
 define <2 x i32> @__rsAllocationVLoadXImpl_uint2(%struct.rs_allocation* nocapture readonly %a, i32 %x, i32 %y, i32 %z) #0 {
   %1 = tail call i8* @rsOffsetNs(%struct.rs_allocation* %a, i32 %x, i32 %y, i32 %z) #2
   %2 = bitcast i8* %1 to <2 x i32>*
-  %3 = load <2 x i32>* %2, align 4
+  %3 = load <2 x i32>, <2 x i32>* %2, align 4
   ret <2 x i32> %3
 }
 
 define <4 x i16> @__rsAllocationVLoadXImpl_short4(%struct.rs_allocation* nocapture readonly %a, i32 %x, i32 %y, i32 %z) #0 {
   %1 = tail call i8* @rsOffsetNs(%struct.rs_allocation* %a, i32 %x, i32 %y, i32 %z) #2
   %2 = bitcast i8* %1 to <4 x i16>*
-  %3 = load <4 x i16>* %2, align 2
+  %3 = load <4 x i16>, <4 x i16>* %2, align 2
   ret <4 x i16> %3
 }
 define <3 x i16> @__rsAllocationVLoadXImpl_short3(%struct.rs_allocation* nocapture readonly %a, i32 %x, i32 %y, i32 %z) #0 {
   %1 = tail call i8* @rsOffsetNs(%struct.rs_allocation* %a, i32 %x, i32 %y, i32 %z) #2
   %2 = bitcast i8* %1 to <3 x i16>*
-  %3 = load <3 x i16>* %2, align 2
+  %3 = load <3 x i16>, <3 x i16>* %2, align 2
   ret <3 x i16> %3
 }
 define <2 x i16> @__rsAllocationVLoadXImpl_short2(%struct.rs_allocation* nocapture readonly %a, i32 %x, i32 %y, i32 %z) #0 {
   %1 = tail call i8* @rsOffsetNs(%struct.rs_allocation* %a, i32 %x, i32 %y, i32 %z) #2
   %2 = bitcast i8* %1 to <2 x i16>*
-  %3 = load <2 x i16>* %2, align 2
+  %3 = load <2 x i16>, <2 x i16>* %2, align 2
   ret <2 x i16> %3
 }
 
 define <4 x i16> @__rsAllocationVLoadXImpl_ushort4(%struct.rs_allocation* nocapture readonly %a, i32 %x, i32 %y, i32 %z) #0 {
   %1 = tail call i8* @rsOffsetNs(%struct.rs_allocation* %a, i32 %x, i32 %y, i32 %z) #2
   %2 = bitcast i8* %1 to <4 x i16>*
-  %3 = load <4 x i16>* %2, align 2
+  %3 = load <4 x i16>, <4 x i16>* %2, align 2
   ret <4 x i16> %3
 }
 define <3 x i16> @__rsAllocationVLoadXImpl_ushort3(%struct.rs_allocation* nocapture readonly %a, i32 %x, i32 %y, i32 %z) #0 {
   %1 = tail call i8* @rsOffsetNs(%struct.rs_allocation* %a, i32 %x, i32 %y, i32 %z) #2
   %2 = bitcast i8* %1 to <3 x i16>*
-  %3 = load <3 x i16>* %2, align 2
+  %3 = load <3 x i16>, <3 x i16>* %2, align 2
   ret <3 x i16> %3
 }
 define <2 x i16> @__rsAllocationVLoadXImpl_ushort2(%struct.rs_allocation* nocapture readonly %a, i32 %x, i32 %y, i32 %z) #0 {
   %1 = tail call i8* @rsOffsetNs(%struct.rs_allocation* %a, i32 %x, i32 %y, i32 %z) #2
   %2 = bitcast i8* %1 to <2 x i16>*
-  %3 = load <2 x i16>* %2, align 2
+  %3 = load <2 x i16>, <2 x i16>* %2, align 2
   ret <2 x i16> %3
 }
 
 define <4 x i8> @__rsAllocationVLoadXImpl_char4(%struct.rs_allocation* nocapture readonly %a, i32 %x, i32 %y, i32 %z) #0 {
   %1 = tail call i8* @rsOffsetNs(%struct.rs_allocation* %a, i32 %x, i32 %y, i32 %z) #2
   %2 = bitcast i8* %1 to <4 x i8>*
-  %3 = load <4 x i8>* %2, align 1
+  %3 = load <4 x i8>, <4 x i8>* %2, align 1
   ret <4 x i8> %3
 }
 define <3 x i8> @__rsAllocationVLoadXImpl_char3(%struct.rs_allocation* nocapture readonly %a, i32 %x, i32 %y, i32 %z) #0 {
   %1 = tail call i8* @rsOffsetNs(%struct.rs_allocation* %a, i32 %x, i32 %y, i32 %z) #2
   %2 = bitcast i8* %1 to <3 x i8>*
-  %3 = load <3 x i8>* %2, align 1
+  %3 = load <3 x i8>, <3 x i8>* %2, align 1
   ret <3 x i8> %3
 }
 define <2 x i8> @__rsAllocationVLoadXImpl_char2(%struct.rs_allocation* nocapture readonly %a, i32 %x, i32 %y, i32 %z) #0 {
   %1 = tail call i8* @rsOffsetNs(%struct.rs_allocation* %a, i32 %x, i32 %y, i32 %z) #2
   %2 = bitcast i8* %1 to <2 x i8>*
-  %3 = load <2 x i8>* %2, align 1
+  %3 = load <2 x i8>, <2 x i8>* %2, align 1
   ret <2 x i8> %3
 }
 
 define <4 x i8> @__rsAllocationVLoadXImpl_uchar4(%struct.rs_allocation* nocapture readonly %a, i32 %x, i32 %y, i32 %z) #0 {
   %1 = tail call i8* @rsOffsetNs(%struct.rs_allocation* %a, i32 %x, i32 %y, i32 %z) #2
   %2 = bitcast i8* %1 to <4 x i8>*
-  %3 = load <4 x i8>* %2, align 1
+  %3 = load <4 x i8>, <4 x i8>* %2, align 1
   ret <4 x i8> %3
 }
 define <3 x i8> @__rsAllocationVLoadXImpl_uchar3(%struct.rs_allocation* nocapture readonly %a, i32 %x, i32 %y, i32 %z) #0 {
   %1 = tail call i8* @rsOffsetNs(%struct.rs_allocation* %a, i32 %x, i32 %y, i32 %z) #2
   %2 = bitcast i8* %1 to <3 x i8>*
-  %3 = load <3 x i8>* %2, align 1
+  %3 = load <3 x i8>, <3 x i8>* %2, align 1
   ret <3 x i8> %3
 }
 define <2 x i8> @__rsAllocationVLoadXImpl_uchar2(%struct.rs_allocation* nocapture readonly %a, i32 %x, i32 %y, i32 %z) #0 {
   %1 = tail call i8* @rsOffsetNs(%struct.rs_allocation* %a, i32 %x, i32 %y, i32 %z) #2
   %2 = bitcast i8* %1 to <2 x i8>*
-  %3 = load <2 x i8>* %2, align 1
+  %3 = load <2 x i8>, <2 x i8>* %2, align 1
   ret <2 x i8> %3
 }
 
 define <4 x float> @__rsAllocationVLoadXImpl_float4(%struct.rs_allocation* nocapture readonly %a, i32 %x, i32 %y, i32 %z) #0 {
   %1 = tail call i8* @rsOffsetNs(%struct.rs_allocation* %a, i32 %x, i32 %y, i32 %z) #2
   %2 = bitcast i8* %1 to <4 x float>*
-  %3 = load <4 x float>* %2, align 4
+  %3 = load <4 x float>, <4 x float>* %2, align 4
   ret <4 x float> %3
 }
 define <3 x float> @__rsAllocationVLoadXImpl_float3(%struct.rs_allocation* nocapture readonly %a, i32 %x, i32 %y, i32 %z) #0 {
   %1 = tail call i8* @rsOffsetNs(%struct.rs_allocation* %a, i32 %x, i32 %y, i32 %z) #2
   %2 = bitcast i8* %1 to <3 x float>*
-  %3 = load <3 x float>* %2, align 4
+  %3 = load <3 x float>, <3 x float>* %2, align 4
   ret <3 x float> %3
 }
 define <2 x float> @__rsAllocationVLoadXImpl_float2(%struct.rs_allocation* nocapture readonly %a, i32 %x, i32 %y, i32 %z) #0 {
   %1 = tail call i8* @rsOffsetNs(%struct.rs_allocation* %a, i32 %x, i32 %y, i32 %z) #2
   %2 = bitcast i8* %1 to <2 x float>*
-  %3 = load <2 x float>* %2, align 4
+  %3 = load <2 x float>, <2 x float>* %2, align 4
   ret <2 x float> %3
 }
 
 define void @__rsAllocationVLoadXImpl_double4(<4 x double>* noalias nocapture sret %agg.result, %struct.rs_allocation* nocapture readonly %a, i32 %x, i32 %y, i32 %z) #1 {
   %1 = tail call i8* @rsOffsetNs(%struct.rs_allocation* %a, i32 %x, i32 %y, i32 %z) #2
   %2 = bitcast i8* %1 to <4 x double>*
-  %3 = load <4 x double>* %2, align 8
+  %3 = load <4 x double>, <4 x double>* %2, align 8
   store <4 x double> %3, <4 x double>* %agg.result
   ret void
 }
 define void @__rsAllocationVLoadXImpl_double3(<3 x double>* noalias nocapture sret %agg.result, %struct.rs_allocation* nocapture readonly %a, i32 %x, i32 %y, i32 %z) #1 {
   %1 = tail call i8* @rsOffsetNs(%struct.rs_allocation* %a, i32 %x, i32 %y, i32 %z) #2
   %2 = bitcast i8* %1 to <3 x double>*
-  %3 = load <3 x double>* %2, align 8
+  %3 = load <3 x double>, <3 x double>* %2, align 8
   store <3 x double> %3, <3 x double>* %agg.result
   ret void
 }
 define <2 x double> @__rsAllocationVLoadXImpl_double2(%struct.rs_allocation* nocapture readonly %a, i32 %x, i32 %y, i32 %z) #0 {
   %1 = tail call i8* @rsOffsetNs(%struct.rs_allocation* %a, i32 %x, i32 %y, i32 %z) #2
   %2 = bitcast i8* %1 to <2 x double>*
-  %3 = load <2 x double>* %2, align 8
+  %3 = load <2 x double>, <2 x double>* %2, align 8
   ret <2 x double> %3
 }
 
 
 define void @__rsAllocationVStoreXImpl_long4(%struct.rs_allocation* nocapture readonly %a, <4 x i64>* %val, i32 %x, i32 %y, i32 %z) #1 {
   %1 = tail call i8* @rsOffsetNs(%struct.rs_allocation* %a, i32 %x, i32 %y, i32 %z) #2
-  %2 = load <4 x i64>* %val
+  %2 = load <4 x i64>, <4 x i64>* %val
   %3 = bitcast i8* %1 to <4 x i64>*
   store <4 x i64> %2, <4 x i64>* %3, align 8
   ret void
 }
 define void @__rsAllocationVStoreXImpl_long3(%struct.rs_allocation* nocapture readonly %a, <3 x i64>* %val, i32 %x, i32 %y, i32 %z) #1 {
   %1 = tail call i8* @rsOffsetNs(%struct.rs_allocation* %a, i32 %x, i32 %y, i32 %z) #2
-  %2 = load <3 x i64>* %val
+  %2 = load <3 x i64>, <3 x i64>* %val
   %3 = bitcast i8* %1 to <3 x i64>*
   store <3 x i64> %2, <3 x i64>* %3, align 8
   ret void
@@ -885,14 +885,14 @@
 
 define void @__rsAllocationVStoreXImpl_ulong4(%struct.rs_allocation* nocapture readonly %a, <4 x i64>* %val, i32 %x, i32 %y, i32 %z) #1 {
   %1 = tail call i8* @rsOffsetNs(%struct.rs_allocation* %a, i32 %x, i32 %y, i32 %z) #2
-  %2 = load <4 x i64>* %val
+  %2 = load <4 x i64>, <4 x i64>* %val
   %3 = bitcast i8* %1 to <4 x i64>*
   store <4 x i64> %2, <4 x i64>* %3, align 8
   ret void
 }
 define void @__rsAllocationVStoreXImpl_ulong3(%struct.rs_allocation* nocapture readonly %a, <3 x i64>* %val, i32 %x, i32 %y, i32 %z) #1 {
   %1 = tail call i8* @rsOffsetNs(%struct.rs_allocation* %a, i32 %x, i32 %y, i32 %z) #2
-  %2 = load <3 x i64>* %val
+  %2 = load <3 x i64>, <3 x i64>* %val
   %3 = bitcast i8* %1 to <3 x i64>*
   store <3 x i64> %2, <3 x i64>* %3, align 8
   ret void
@@ -1053,14 +1053,14 @@
 
 define void @__rsAllocationVStoreXImpl_double4(%struct.rs_allocation* nocapture readonly %a, <4 x double>* %val, i32 %x, i32 %y, i32 %z) #1 {
   %1 = tail call i8* @rsOffsetNs(%struct.rs_allocation* %a, i32 %x, i32 %y, i32 %z) #2
-  %2 = load <4 x double>* %val
+  %2 = load <4 x double>, <4 x double>* %val
   %3 = bitcast i8* %1 to <4 x double>*
   store <4 x double> %2, <4 x double>* %3, align 8
   ret void
 }
 define void @__rsAllocationVStoreXImpl_double3(%struct.rs_allocation* nocapture readonly %a, <3 x double>* %val, i32 %x, i32 %y, i32 %z) #1 {
   %1 = tail call i8* @rsOffsetNs(%struct.rs_allocation* %a, i32 %x, i32 %y, i32 %z) #2
-  %2 = load <3 x double>* %val
+  %2 = load <3 x double>, <3 x double>* %val
   %3 = bitcast i8* %1 to <3 x double>*
   store <3 x double> %2, <3 x double>* %3, align 8
   ret void