diff --git a/driver/runtime/build_bc_lib_internal.mk b/driver/runtime/build_bc_lib_internal.mk
index a44461b..b024ecd 100644
--- a/driver/runtime/build_bc_lib_internal.mk
+++ b/driver/runtime/build_bc_lib_internal.mk
@@ -31,7 +31,7 @@
 bc_clang := $(RS_DRIVER_CLANG_EXE)
 endif
 
-bc_clang_cc1_cflags :=
+bc_clang_cc1_cflags := -fnative-half-type -fallow-half-arguments-and-returns
 ifeq ($(BCC_RS_TRIPLE),armv7-none-linux-gnueabi)
 # We need to pass the +long64 flag to the underlying version of Clang, since
 # we are generating a library for use with Renderscript (64-bit long type,
diff --git a/driver/runtime/ll32/allocation.ll b/driver/runtime/ll32/allocation.ll
index 1ba8222..bab40c8 100644
--- a/driver/runtime/ll32/allocation.ll
+++ b/driver/runtime/ll32/allocation.ll
@@ -654,6 +654,68 @@
   ret void
 }
 
+!61 = !{!"half", !15}  ; TBAA tag attached to the scalar half load/store in the half accessors
+define void @rsSetElementAtImpl_half([1 x i32] %a.coerce, half %val, i32 %x, i32 %y, i32 %z) #1 {  ; Writes %val to the address rsOffset yields for (x, y, z).
+  %cell = tail call i8* @rsOffset([1 x i32] %a.coerce, i32 2, i32 %x, i32 %y, i32 %z) #2  ; the 2 is presumably the element size in bytes -- confirm against rsOffset
+  %dst = bitcast i8* %cell to half*  ; reinterpret the raw byte pointer as a half slot
+  store half %val, half* %dst, align 2, !tbaa !61
+  ret void
+}
+
+define half @rsGetElementAtImpl_half([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #0 {  ; Returns the half found at the address rsOffset yields for (x, y, z).
+  %cell = tail call i8* @rsOffset([1 x i32] %a.coerce, i32 2, i32 %x, i32 %y, i32 %z) #2  ; the 2 is presumably the element size in bytes -- confirm against rsOffset
+  %src = bitcast i8* %cell to half*  ; reinterpret the raw byte pointer as a half slot
+  %elem = load half, half* %src, align 2, !tbaa !61
+  ret half %elem
+}
+
+!62 = !{!"half2", !15}  ; TBAA tag attached to the <2 x half> load/store in the half2 accessors
+define void @rsSetElementAtImpl_half2([1 x i32] %a.coerce, <2 x half> %val, i32 %x, i32 %y, i32 %z) #1 {  ; Writes the 2-lane vector %val to the address rsOffset yields for (x, y, z).
+  %cell = tail call i8* @rsOffset([1 x i32] %a.coerce, i32 4, i32 %x, i32 %y, i32 %z) #2  ; the 4 is presumably the element size in bytes -- confirm against rsOffset
+  %dst = bitcast i8* %cell to <2 x half>*  ; reinterpret the raw byte pointer as a <2 x half> slot
+  store <2 x half> %val, <2 x half>* %dst, align 4, !tbaa !62
+  ret void
+}
+
+define <2 x half> @rsGetElementAtImpl_half2([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #0 {  ; Returns the <2 x half> found at the address rsOffset yields for (x, y, z).
+  %cell = tail call i8* @rsOffset([1 x i32] %a.coerce, i32 4, i32 %x, i32 %y, i32 %z) #2  ; the 4 is presumably the element size in bytes -- confirm against rsOffset
+  %src = bitcast i8* %cell to <2 x half>*  ; reinterpret the raw byte pointer as a <2 x half> slot
+  %elem = load <2 x half>, <2 x half>* %src, align 4, !tbaa !62
+  ret <2 x half> %elem
+}
+
+!63 = !{!"half3", !15}  ; TBAA tag attached to the 4-lane stores made by the half3 accessors
+define void @rsSetElementAtImpl_half3([1 x i32] %a.coerce, <3 x half> %val, i32 %x, i32 %y, i32 %z) #1 {  ; Writes %val, widened to four lanes, to the address rsOffset yields for (x, y, z).
+  %cell = tail call i8* @rsOffset([1 x i32] %a.coerce, i32 8, i32 %x, i32 %y, i32 %z) #2  ; the 8 is presumably the element size in bytes -- confirm against rsOffset
+  %dst = bitcast i8* %cell to <4 x half>*  ; the destination is addressed as a 4-lane vector
+  %wide = shufflevector <3 x half> %val, <3 x half> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 undef>  ; lanes 0-2 copied, lane 3 left undef
+  store <4 x half> %wide, <4 x half>* %dst, align 8, !tbaa !63
+  ret void
+}
+
+define void @rsGetElementAtImpl_half3(<3 x half>* noalias nocapture sret %agg.result, [1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #1 {  ; Copies the half3 at the address rsOffset yields for (x, y, z) into the sret slot %agg.result.
+  %1 = tail call i8* @rsOffset([1 x i32] %a.coerce, i32 8, i32 %x, i32 %y, i32 %z) #2  ; 8 agrees with rsSetElementAtImpl_half3 and with the 8-byte <4 x half> access below; presumably the element size in bytes -- confirm against rsOffset
+  %2 = bitcast i8* %1 to <4 x half>*  ; the element is read as a 4-lane vector
+  %3 = load <4 x half>, <4 x half>* %2, align 8
+  %4 = bitcast <3 x half>* %agg.result to <4 x half>*  ; the result slot is written as four lanes as well
+  store <4 x half> %3, <4 x half>* %4, align 8, !tbaa !63
+  ret void
+}
+
+!64 = !{!"half4", !15}  ; TBAA tag attached to the <4 x half> load/store in the half4 accessors
+define void @rsSetElementAtImpl_half4([1 x i32] %a.coerce, <4 x half> %val, i32 %x, i32 %y, i32 %z) #1 {  ; Writes the 4-lane vector %val to the address rsOffset yields for (x, y, z).
+  %1 = tail call i8* @rsOffset([1 x i32] %a.coerce, i32 8, i32 %x, i32 %y, i32 %z) #2  ; 8 agrees with rsGetElementAtImpl_half4 and with the 8-byte <4 x half> store below; presumably the element size in bytes -- confirm against rsOffset
+  %2 = bitcast i8* %1 to <4 x half>*  ; reinterpret the raw byte pointer as a <4 x half> slot
+  store <4 x half> %val, <4 x half>* %2, align 8, !tbaa !64
+  ret void
+}
+
+define <4 x half> @rsGetElementAtImpl_half4([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #0 {  ; Returns the <4 x half> found at the address rsOffset yields for (x, y, z).
+  %cell = tail call i8* @rsOffset([1 x i32] %a.coerce, i32 8, i32 %x, i32 %y, i32 %z) #2  ; the 8 is presumably the element size in bytes -- confirm against rsOffset
+  %src = bitcast i8* %cell to <4 x half>*  ; reinterpret the raw byte pointer as a <4 x half> slot
+  %elem = load <4 x half>, <4 x half>* %src, align 8, !tbaa !64
+  ret <4 x half> %elem
+}
 
 define void @__rsAllocationVLoadXImpl_long4(<4 x i64>* noalias nocapture sret %agg.result, [1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #1 {
   %1 = tail call i8* @rsOffsetNs([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #2
diff --git a/driver/runtime/ll64/allocation.ll b/driver/runtime/ll64/allocation.ll
index adb385c..ad18874 100644
--- a/driver/runtime/ll64/allocation.ll
+++ b/driver/runtime/ll64/allocation.ll
@@ -669,6 +669,69 @@
   ret void
 }
 
+!61 = !{!"half", !15}  ; TBAA tag attached to the scalar half load/store in the half accessors
+define void @rsSetElementAtImpl_half(%struct.rs_allocation* nocapture readonly %a.coerce, half %val, i32 %x, i32 %y, i32 %z) #1 {  ; Writes %val to the address rsOffset yields for (x, y, z).
+  %cell = tail call i8* @rsOffset(%struct.rs_allocation* %a.coerce, i32 2, i32 %x, i32 %y, i32 %z) #2  ; the 2 is presumably the element size in bytes -- confirm against rsOffset
+  %dst = bitcast i8* %cell to half*  ; reinterpret the raw byte pointer as a half slot
+  store half %val, half* %dst, align 2, !tbaa !61
+  ret void
+}
+
+define half @rsGetElementAtImpl_half(%struct.rs_allocation* nocapture readonly %a.coerce, i32 %x, i32 %y, i32 %z) #0 {  ; Returns the half found at the address rsOffset yields for (x, y, z).
+  %cell = tail call i8* @rsOffset(%struct.rs_allocation* %a.coerce, i32 2, i32 %x, i32 %y, i32 %z) #2  ; the 2 is presumably the element size in bytes -- confirm against rsOffset
+  %src = bitcast i8* %cell to half*  ; reinterpret the raw byte pointer as a half slot
+  %elem = load half, half* %src, align 2, !tbaa !61
+  ret half %elem
+}
+
+!62 = !{!"half2", !15}  ; TBAA tag attached to the <2 x half> load/store in the half2 accessors
+define void @rsSetElementAtImpl_half2(%struct.rs_allocation* nocapture readonly %a.coerce, <2 x half> %val, i32 %x, i32 %y, i32 %z) #1 {  ; Writes the 2-lane vector %val to the address rsOffset yields for (x, y, z).
+  %cell = tail call i8* @rsOffset(%struct.rs_allocation* %a.coerce, i32 4, i32 %x, i32 %y, i32 %z) #2  ; the 4 is presumably the element size in bytes -- confirm against rsOffset
+  %dst = bitcast i8* %cell to <2 x half>*  ; reinterpret the raw byte pointer as a <2 x half> slot
+  store <2 x half> %val, <2 x half>* %dst, align 4, !tbaa !62
+  ret void
+}
+
+define <2 x half> @rsGetElementAtImpl_half2(%struct.rs_allocation* nocapture readonly %a.coerce, i32 %x, i32 %y, i32 %z) #0 {  ; Returns the <2 x half> found at the address rsOffset yields for (x, y, z).
+  %cell = tail call i8* @rsOffset(%struct.rs_allocation* %a.coerce, i32 4, i32 %x, i32 %y, i32 %z) #2  ; the 4 is presumably the element size in bytes -- confirm against rsOffset
+  %src = bitcast i8* %cell to <2 x half>*  ; reinterpret the raw byte pointer as a <2 x half> slot
+  %elem = load <2 x half>, <2 x half>* %src, align 4, !tbaa !62
+  ret <2 x half> %elem
+}
+
+!63 = !{!"half3", !15}  ; TBAA tag attached to the 4-lane stores made by the half3 accessors
+define void @rsSetElementAtImpl_half3(%struct.rs_allocation* nocapture readonly %a.coerce, <3 x half> %val, i32 %x, i32 %y, i32 %z) #1 {  ; Writes %val, widened to four lanes, to the address rsOffset yields for (x, y, z).
+  %cell = tail call i8* @rsOffset(%struct.rs_allocation* %a.coerce, i32 8, i32 %x, i32 %y, i32 %z) #2  ; the 8 is presumably the element size in bytes -- confirm against rsOffset
+  %dst = bitcast i8* %cell to <4 x half>*  ; the destination is addressed as a 4-lane vector
+  %wide = shufflevector <3 x half> %val, <3 x half> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 undef>  ; lanes 0-2 copied, lane 3 left undef
+  store <4 x half> %wide, <4 x half>* %dst, align 8, !tbaa !63
+  ret void
+}
+
+define void @rsGetElementAtImpl_half3(<3 x half>* noalias nocapture sret %agg.result, %struct.rs_allocation* nocapture readonly %a.coerce, i32 %x, i32 %y, i32 %z) #1 {  ; Copies the half3 at the address rsOffset yields for (x, y, z) into the sret slot %agg.result.
+  %1 = tail call i8* @rsOffset(%struct.rs_allocation* %a.coerce, i32 8, i32 %x, i32 %y, i32 %z) #2  ; 8 agrees with rsSetElementAtImpl_half3 and with the 8-byte <4 x half> access below; presumably the element size in bytes -- confirm against rsOffset
+  %2 = bitcast i8* %1 to <4 x half>*  ; the element is read as a 4-lane vector
+  %3 = load <4 x half>, <4 x half>* %2, align 8
+  %4 = bitcast <3 x half>* %agg.result to <4 x half>*  ; the result slot is written as four lanes as well
+  store <4 x half> %3, <4 x half>* %4, align 8, !tbaa !63
+  ret void
+}
+
+!64 = !{!"half4", !15}  ; TBAA tag attached to the <4 x half> load/store in the half4 accessors
+define void @rsSetElementAtImpl_half4(%struct.rs_allocation* nocapture readonly %a.coerce, <4 x half> %val, i32 %x, i32 %y, i32 %z) #1 {  ; Writes the 4-lane vector %val to the address rsOffset yields for (x, y, z).
+  %1 = tail call i8* @rsOffset(%struct.rs_allocation* %a.coerce, i32 8, i32 %x, i32 %y, i32 %z) #2  ; 8 agrees with rsGetElementAtImpl_half4 and with the 8-byte <4 x half> store below; presumably the element size in bytes -- confirm against rsOffset
+  %2 = bitcast i8* %1 to <4 x half>*  ; reinterpret the raw byte pointer as a <4 x half> slot
+  store <4 x half> %val, <4 x half>* %2, align 8, !tbaa !64
+  ret void
+}
+
+define <4 x half> @rsGetElementAtImpl_half4(%struct.rs_allocation* nocapture readonly %a.coerce, i32 %x, i32 %y, i32 %z) #0 {  ; Returns the <4 x half> found at the address rsOffset yields for (x, y, z).
+  %cell = tail call i8* @rsOffset(%struct.rs_allocation* %a.coerce, i32 8, i32 %x, i32 %y, i32 %z) #2  ; the 8 is presumably the element size in bytes -- confirm against rsOffset
+  %src = bitcast i8* %cell to <4 x half>*  ; reinterpret the raw byte pointer as a <4 x half> slot
+  %elem = load <4 x half>, <4 x half>* %src, align 8, !tbaa !64
+  ret <4 x half> %elem
+}
+
 
 define void @__rsAllocationVLoadXImpl_long4(<4 x i64>* noalias nocapture sret %agg.result, %struct.rs_allocation* nocapture readonly %a, i32 %x, i32 %y, i32 %z) #1 {
   %1 = tail call i8* @rsOffsetNs(%struct.rs_allocation* %a, i32 %x, i32 %y, i32 %z) #2
diff --git a/driver/runtime/rs_allocation.c b/driver/runtime/rs_allocation.c
index 7755e97..8c8d1ba 100644
--- a/driver/runtime/rs_allocation.c
+++ b/driver/runtime/rs_allocation.c
@@ -272,6 +272,10 @@
 ELEMENT_AT(ulong2)
 ELEMENT_AT(ulong3)
 ELEMENT_AT(ulong4)
+ELEMENT_AT(half)  /* half variants, scalar through 4-wide. NOTE(review): ELEMENT_AT is defined outside this hunk; presumably it pairs with rs{Set,Get}ElementAtImpl_half* in ll32/ll64 allocation.ll -- confirm */
+ELEMENT_AT(half2)
+ELEMENT_AT(half3)
+ELEMENT_AT(half4)
 ELEMENT_AT(float)
 ELEMENT_AT(float2)
 ELEMENT_AT(float3)
