mirror of
				https://git.eden-emu.dev/eden-emu/eden.git
				synced 2025-10-25 19:03:17 +00:00 
			
		
		
		
	
		
			
				
	
	
		
			303 lines
		
	
	
	
		
			5.2 KiB
		
	
	
	
		
			ArmAsm
		
	
	
	
	
	
			
		
		
	
	
			303 lines
		
	
	
	
		
			5.2 KiB
		
	
	
	
		
			ArmAsm
		
	
	
	
	
	
| .global _rsqrt_inaccurate
 | |
| .global rsqrt_inaccurate
 | |
| .global _rsqrt_full
 | |
| .global rsqrt_full
 | |
| .global _rsqrt_full_gpr
 | |
| .global rsqrt_full_gpr
 | |
| .global _rsqrt_full_nb
 | |
| .global rsqrt_full_nb
 | |
| .global _rsqrt_full_nb2
 | |
| .global rsqrt_full_nb2
 | |
| .global _rsqrt_full_nb_gpr
 | |
| .global rsqrt_full_nb_gpr
 | |
| .global _rsqrt_newton
 | |
| .global rsqrt_newton
 | |
| .global _rsqrt_hack
 | |
| .global rsqrt_hack
 | |
| .global _rsqrt_fallback
 | |
| 
 | |
| .text
 | |
| .intel_syntax noprefix
 | |
| 
 | |
| # Constant table. Each entry is 4 x .long = 16 bytes with the value in lane 0
 | |
| # and zeros above it, so every label stays 16-byte aligned. That alignment is
 | |
| # needed because the routines in this file use these labels as 128-bit memory
 | |
| # operands of legacy SSE instructions (pand/por/paddd/pcmpgtd/movaps).
 | |
| # .p2align 4 is used instead of .align 16 because the meaning of .align's
 | |
| # argument (byte count vs. power of two) depends on the assembler target.
 | |
| .p2align 4
 | |
| # NOTE(review): 0x00800000 is the smallest positive *normal* single (2^-126),
 | |
| # despite the label name; inputs must compare greater than it to stay on the
 | |
| # fast paths.
 | |
| min_pos_denorm:
 | |
| .long 0x00800000,0,0,0
 | |
| # Bit 15: the lowest bit kept by top_mask; OR-ed into the masked input.
 | |
| penultimate_bit:
 | |
| .long 0x00008000,0,0,0
 | |
| # Bit 14: added to a result before masking, i.e. round-to-nearest at bit 15.
 | |
| ultimate_bit:
 | |
| .long 0x00004000,0,0,0
 | |
| # Keeps bits 31..15 (sign, 8 exponent bits, top 8 fraction bits).
 | |
| top_mask:
 | |
| .long 0xFFFF8000,0,0,0
 | |
| # 1.0f
 | |
| one:
 | |
| .long 0x3f800000,0,0,0
 | |
| # 0.5f
 | |
| half:
 | |
| .long 0x3f000000,0,0,0
 | |
| # 1.5f
 | |
| one_point_five:
 | |
| .long 0x3fc00000,0,0,0
 | |
| # magic1..magic3 are only used by rsqrt_hack's integer correction step.
 | |
| # NOTE(review): their derivation is not documented in this file.
 | |
| magic1:
 | |
| .long 0x60000000,0,0,0
 | |
| magic2:
 | |
| .long 0x3c000000,0,0,0
 | |
| magic3:
 | |
| .long 0x000047ff,0,0,0
 | |
| 
 | |
| _rsqrt_inaccurate: # underscore-prefixed alias for the same entry point
 | |
| rsqrt_inaccurate: # in: edi = single-precision bit pattern; out: eax = bit pattern of the raw RSQRTSS estimate
 | |
|     movd xmm0, edi # place the 32 input bits in lane 0 of xmm0
 | |
| 
 | |
|     rsqrtss xmm0, xmm0 # hardware approximate 1/sqrt(x); this variant does no masking, rounding or special-case handling
 | |
| 
 | |
|     movd eax, xmm0 # return the estimate's bits in eax
 | |
|     ret
 | |
| 
 | |
| _rsqrt_full: # underscore-prefixed alias for the same entry point
 | |
| rsqrt_full: # in: edi = single-precision bit pattern; out: eax = 1/sqrt(x) bits rounded to the top 17 bits; special inputs are handled by rsqrt_full_bad
 | |
|     movd xmm0, edi # lane 0 = input bits, upper lanes zeroed
 | |
| 
 | |
|     pand xmm0, [rip + top_mask] # keep bits 31..15 of the input
 | |
|     por xmm0, [rip + penultimate_bit] # force bit 15 to 1; this also turns +inf (0x7F800000) into a NaN pattern so the check below rejects it
 | |
| 
 | |
|     vcmpngt_uqss xmm1, xmm0, [rip + min_pos_denorm] # lane 0 = all-ones unless xmm0 > 0x00800000 as a float; unordered (NaN) also yields all-ones
 | |
|     ptest xmm1, xmm1 # ZF=1 only if all 128 bits are zero; lanes 1..3 are copied from xmm0 and are zero here
 | |
|     jnz rsqrt_full_bad # mask set: take the special-case path, which re-reads the original bits from edi
 | |
| 
 | |
|     sqrtss xmm0, xmm0 # xmm0 = sqrt(masked input)
 | |
| 
 | |
|     movd xmm1, [rip + one] # xmm1 = 1.0f
 | |
|     divss xmm1, xmm0 # xmm1 = 1.0 / sqrt(masked input)
 | |
| 
 | |
|     paddd xmm1, [rip + ultimate_bit] # integer add of bit 14 to the result bits: round to nearest at bit 15
 | |
|     pand xmm1, [rip + top_mask] # drop everything below bit 15
 | |
| 
 | |
|     movd eax, xmm1 # return the result bits in eax
 | |
|     ret
 | |
| 
 | |
| _rsqrt_full_gpr: # underscore-prefixed alias for the same entry point
 | |
| rsqrt_full_gpr: # same computation as rsqrt_full, but masking and rounding use general-purpose registers; in: edi = input bits; out: result bits in eax (also copied to xmm0)
 | |
|     movd eax, xmm0 # Emulate regalloc mov (the value is overwritten by the next instruction)
 | |
| 
 | |
|     mov eax, edi # eax = input bits
 | |
|     and eax, 0xFFFF8000 # keep bits 31..15 (same value as top_mask)
 | |
|     or eax, 0x00008000 # force bit 15 to 1 (same value as penultimate_bit); +inf becomes a NaN pattern
 | |
| 
 | |
|     movd xmm0, eax # lane 0 = masked input, upper lanes zeroed
 | |
|     vcmpngt_uqss xmm1, xmm0, [rip + min_pos_denorm] # lane 0 = all-ones unless xmm0 > 0x00800000 as a float; unordered (NaN) also yields all-ones
 | |
|     ptest xmm1, xmm1 # ZF=1 only if all 128 bits are zero; lanes 1..3 come from xmm0 and are zero here
 | |
|     jnz rsqrt_full_bad # mask set: take the special-case path, which re-reads the original bits from edi
 | |
| 
 | |
|     sqrtss xmm0, xmm0 # xmm0 = sqrt(masked input)
 | |
| 
 | |
|     movd xmm1, [rip + one] # xmm1 = 1.0f
 | |
|     divss xmm1, xmm0 # xmm1 = 1.0 / sqrt(masked input)
 | |
|     movd eax, xmm1 # move the result bits to a GPR for the rounding step
 | |
| 
 | |
|     add eax, 0x00004000 # add bit 14: round to nearest at bit 15 (same value as ultimate_bit)
 | |
|     and eax, 0xffff8000 # drop everything below bit 15
 | |
| 
 | |
|     movd xmm0, eax # Emulate regalloc mov (eax already holds the result)
 | |
|     ret
 | |
| 
 | |
| _rsqrt_full_nb2: # underscore-prefixed alias for the same entry point
 | |
| rsqrt_full_nb2: # like rsqrt_full_nb, but the range check uses ucomiss + jna instead of vcmpngt_uqss + ptest; in: edi = input bits; out: eax = result bits
 | |
|     movd xmm0, edi # lane 0 = input bits, upper lanes zeroed
 | |
| 
 | |
|     pand xmm0, [rip + top_mask] # keep bits 31..15 of the input
 | |
|     por xmm0, [rip + penultimate_bit] # force bit 15 to 1; +inf becomes a NaN pattern
 | |
| 
 | |
|     ucomiss xmm0, [rip + min_pos_denorm] # compare with 0x00800000; an unordered result sets ZF, PF and CF
 | |
|     jna rsqrt_full_bad_new1 # taken when CF=1 or ZF=1: below, equal, or unordered (NaN)
 | |
| 
 | |
|     sqrtss xmm0, xmm0 # xmm0 = sqrt(masked input)
 | |
| 
 | |
|     movd xmm1, [rip + one] # xmm1 = 1.0f
 | |
|     divss xmm1, xmm0 # xmm1 = 1.0 / sqrt(masked input)
 | |
| 
 | |
|     paddd xmm1, [rip + ultimate_bit] # integer add of bit 14: round to nearest at bit 15
 | |
|     pand xmm1, [rip + top_mask] # drop everything below bit 15
 | |
| 
 | |
|     movd eax, xmm1 # return the result bits in eax
 | |
|     ret
 | |
| 
 | |
| _rsqrt_full_nb: # underscore-prefixed alias for the same entry point
 | |
| rsqrt_full_nb: # same fast path as rsqrt_full, but special inputs branch to rsqrt_full_bad_new1; in: edi = input bits; out: eax = result bits
 | |
|     movd xmm0, edi # lane 0 = input bits, upper lanes zeroed
 | |
| 
 | |
|     pand xmm0, [rip + top_mask] # keep bits 31..15 of the input
 | |
|     por xmm0, [rip + penultimate_bit] # force bit 15 to 1; +inf becomes a NaN pattern
 | |
| 
 | |
|     vcmpngt_uqss xmm1, xmm0, [rip + min_pos_denorm] # lane 0 = all-ones unless xmm0 > 0x00800000 as a float; unordered (NaN) also yields all-ones
 | |
|     ptest xmm1, xmm1 # ZF=1 only if all 128 bits are zero; lanes 1..3 come from xmm0 and are zero here
 | |
|     jnz rsqrt_full_bad_new1 # mask set: take the special-case path, which re-reads the original bits from edi
 | |
| 
 | |
|     sqrtss xmm0, xmm0 # xmm0 = sqrt(masked input)
 | |
| 
 | |
|     movd xmm1, [rip + one] # xmm1 = 1.0f
 | |
|     divss xmm1, xmm0 # xmm1 = 1.0 / sqrt(masked input)
 | |
| 
 | |
|     paddd xmm1, [rip + ultimate_bit] # integer add of bit 14: round to nearest at bit 15
 | |
|     pand xmm1, [rip + top_mask] # drop everything below bit 15
 | |
| 
 | |
|     movd eax, xmm1 # return the result bits in eax
 | |
|     ret
 | |
| 
 | |
| # Special-case tail shared by rsqrt_full_nb and rsqrt_full_nb2.
 | |
| # In:  edi = original single-precision bit pattern (the callers never modify it).
 | |
| # Out: eax = result bit pattern.
 | |
| # Entered by a jump, so the stack still looks exactly as it did on entry to
 | |
| # the public routine (return address on top).
 | |
| rsqrt_full_bad_new1:
 | |
|     # Unsigned compare: only +0 and positive denormals are below 0x00800000.
 | |
|     cmp edi, 0x00800000
 | |
|     jb rsqrt_full_bad_new_fallback1
 | |
| 
 | |
|     # Let the hardware estimate produce the special result.
 | |
|     movd xmm0, edi
 | |
|     rsqrtss xmm1, xmm0
 | |
| 
 | |
|     # PF=1 after a self-compare means the estimate is NaN.
 | |
|     ucomiss xmm1, xmm1
 | |
|     jp rsqrt_full_bad_new1_nan
 | |
| 
 | |
|     movd eax, xmm1
 | |
|     ret
 | |
| 
 | |
| rsqrt_full_bad_new_fallback1:
 | |
|     # Tail-call instead of call+ret: a call from here would push a second
 | |
|     # return address and leave the stack misaligned for the callee. With jmp
 | |
|     # the fallback sees the caller's original frame and returns to it directly
 | |
|     # with its result in eax. edi still holds the input.
 | |
|     jmp _rsqrt_fallback
 | |
| 
 | |
| rsqrt_full_bad_new1_nan:
 | |
|     # The estimate is NaN: distinguish "input was NaN" from "input was negative".
 | |
|     ucomiss xmm0, xmm0
 | |
|     jp rsqrt_full_bad_new1_nan_ret
 | |
| 
 | |
|     # Non-NaN input that produced NaN: return the positive default NaN.
 | |
|     mov eax, 0x7FC00000
 | |
|     ret
 | |
| 
 | |
| rsqrt_full_bad_new1_nan_ret:
 | |
|     # Input was NaN. rsqrtss returns the input NaN with its quiet bit set,
 | |
|     # which is the same value rsqrt_full_nan computes (edi | 0x00400000).
 | |
|     # Previously this path returned without ever writing eax.
 | |
|     movd eax, xmm1
 | |
|     ret
 | |
| 
 | |
| _rsqrt_full_nb_gpr: # underscore-prefixed alias for the same entry point
 | |
| rsqrt_full_nb_gpr: # same fast path as rsqrt_full_gpr, but special inputs branch to rsqrt_full_bad_new2; in: edi = input bits; out: result bits in eax (also copied to xmm0 on the fast path)
 | |
|     movd eax, xmm0 # Emulate regalloc mov (the value is overwritten by the next instruction)
 | |
| 
 | |
|     mov eax, edi # eax = input bits
 | |
|     and eax, 0xFFFF8000 # keep bits 31..15 (same value as top_mask)
 | |
|     or eax, 0x00008000 # force bit 15 to 1 (same value as penultimate_bit); +inf becomes a NaN pattern
 | |
| 
 | |
|     movd xmm0, eax # lane 0 = masked input, upper lanes zeroed
 | |
|     vcmpngt_uqss xmm1, xmm0, [rip + min_pos_denorm] # lane 0 = all-ones unless xmm0 > 0x00800000 as a float; unordered (NaN) also yields all-ones
 | |
|     ptest xmm1, xmm1 # ZF=1 only if all 128 bits are zero; lanes 1..3 come from xmm0 and are zero here
 | |
|     jnz rsqrt_full_bad_new2 # mask set: take the special-case path, which re-reads the original bits from edi
 | |
| 
 | |
|     sqrtss xmm0, xmm0 # xmm0 = sqrt(masked input)
 | |
| 
 | |
|     movd xmm1, [rip + one] # xmm1 = 1.0f
 | |
|     divss xmm1, xmm0 # xmm1 = 1.0 / sqrt(masked input)
 | |
|     movd eax, xmm1 # move the result bits to a GPR for the rounding step
 | |
| 
 | |
|     add eax, 0x00004000 # add bit 14: round to nearest at bit 15 (same value as ultimate_bit)
 | |
|     and eax, 0xffff8000 # drop everything below bit 15
 | |
| 
 | |
|     movd xmm0, eax # Emulate regalloc mov (eax already holds the result)
 | |
|     ret
 | |
| 
 | |
| # Special-case tail for rsqrt_full_nb_gpr.
 | |
| # In:  edi = original single-precision bit pattern (the caller never modifies it).
 | |
| # Out: eax = result bit pattern.
 | |
| # Entered by a jump, so the stack still looks exactly as it did on entry to
 | |
| # the public routine (return address on top).
 | |
| rsqrt_full_bad_new2:
 | |
|     # Unsigned compare: only +0 and positive denormals are below 0x00800000.
 | |
|     cmp edi, 0x00800000
 | |
|     jb rsqrt_full_bad_new_fallback2
 | |
| 
 | |
|     # Let the hardware estimate produce the special result.
 | |
|     movd xmm0, edi
 | |
|     rsqrtss xmm1, xmm0
 | |
| 
 | |
|     # Any input with the sign bit set yields the positive default NaN.
 | |
|     # NOTE(review): this includes -0 and NaNs whose sign bit is set, for which
 | |
|     # rsqrt_full_bad returns -inf and the quieted input NaN respectively.
 | |
|     # Behaviour is left unchanged here; confirm whether the shortcut is intended.
 | |
|     test edi, edi
 | |
|     js rsqrt_full_bad_new2_nan
 | |
| 
 | |
|     movd eax, xmm1
 | |
|     ret
 | |
| 
 | |
| rsqrt_full_bad_new_fallback2:
 | |
|     # Tail-call instead of call+ret: a call from here would push a second
 | |
|     # return address and leave the stack misaligned for the callee. With jmp
 | |
|     # the fallback sees the caller's original frame and returns to it directly
 | |
|     # with its result in eax. edi still holds the input.
 | |
|     jmp _rsqrt_fallback
 | |
| 
 | |
| rsqrt_full_bad_new2_nan:
 | |
|     mov eax, 0x7FC00000
 | |
|     ret
 | |
| 
 | |
| # Special-case tail shared by rsqrt_full, rsqrt_full_gpr, rsqrt_newton and
 | |
| # rsqrt_hack.
 | |
| # In:  edi = original single-precision bit pattern (the callers never modify it).
 | |
| # Out: eax = result bit pattern.
 | |
| # Entered by a jump, so the stack still looks exactly as it did on entry to
 | |
| # the public routine (return address on top).
 | |
| rsqrt_full_bad:
 | |
|     # Classify the input against 0.0 with a single ucomiss.
 | |
|     xorps xmm1, xmm1
 | |
|     movd xmm0, edi
 | |
|     ucomiss xmm0, xmm1
 | |
|     jp rsqrt_full_nan       # unordered: input is NaN
 | |
|     je rsqrt_full_zero      # compares equal to 0.0 (+0 or -0)
 | |
|     jc rsqrt_full_neg       # below 0.0: negative input
 | |
| 
 | |
|     cmp edi, 0x7F800000     # exact bit pattern of +inf
 | |
|     je rsqrt_full_inf
 | |
| 
 | |
|     # TODO: Full Denormal Implementation
 | |
|     # Tail-call instead of call+ret: a call from here would push a second
 | |
|     # return address and leave the stack misaligned for the callee. With jmp
 | |
|     # the fallback sees the caller's original frame and returns to it directly
 | |
|     # with its result in eax. edi still holds the input.
 | |
|     jmp _rsqrt_fallback
 | |
| 
 | |
| rsqrt_full_neg:
 | |
|     # Negative input: positive default NaN.
 | |
|     mov eax, 0x7FC00000
 | |
|     ret
 | |
| 
 | |
| rsqrt_full_inf:
 | |
|     # +inf: result is +0.
 | |
|     xor eax, eax
 | |
|     ret
 | |
| 
 | |
| rsqrt_full_nan:
 | |
|     # NaN input: return it with the quiet bit (bit 22) set.
 | |
|     mov eax, edi
 | |
|     or eax, 0x00400000
 | |
|     ret
 | |
| 
 | |
| rsqrt_full_zero:
 | |
|     # Zero input: infinity carrying the input's sign bit.
 | |
|     mov eax, edi
 | |
|     or eax, 0x7F800000
 | |
|     ret
 | |
| 
 | |
| _rsqrt_newton: # underscore-prefixed alias for the same entry point
 | |
| rsqrt_newton: # hardware estimate refined by one Newton-Raphson step, y * (1.5 - 0.5 * x * y * y), then rounded to the top 17 bits; in: edi = input bits; out: eax = result bits
 | |
|     movd xmm0, edi # lane 0 = input bits, upper lanes zeroed
 | |
| 
 | |
|     pand xmm0, [rip + top_mask] # keep bits 31..15 of the input
 | |
|     por xmm0, [rip + penultimate_bit] # force bit 15 to 1; +inf becomes a NaN pattern
 | |
| 
 | |
|     vcmpngt_uqss xmm1, xmm0, [rip + min_pos_denorm] # lane 0 = all-ones unless xmm0 > 0x00800000 as a float; unordered (NaN) also yields all-ones
 | |
|     ptest xmm1, xmm1 # ZF=1 only if all 128 bits are zero; lanes 1..3 come from xmm0 and are zero here
 | |
|     jnz rsqrt_full_bad # mask set: take the special-case path, which re-reads the original bits from edi
 | |
| 
 | |
|     rsqrtps xmm1, xmm0 # y = approximate 1/sqrt(x); packed form, only lane 0 is consumed below
 | |
|     mulss xmm0, [rip + half] # xmm0 = 0.5 * x
 | |
|     vmulss xmm2, xmm1, xmm1 # xmm2 = y * y
 | |
|     mulss xmm2, xmm0 # xmm2 = 0.5 * x * y * y
 | |
|     movaps xmm0, [rip + one_point_five] # xmm0 = 1.5 in lane 0, zeros above (aligned 128-bit load)
 | |
|     subss xmm0, xmm2 # xmm0 = 1.5 - 0.5 * x * y * y
 | |
|     mulss xmm0, xmm1 # xmm0 = y * (1.5 - 0.5 * x * y * y)
 | |
| 
 | |
|     paddd xmm0, [rip + ultimate_bit] # integer add of bit 14: round to nearest at bit 15
 | |
|     pand xmm0, [rip + top_mask] # drop everything below bit 15
 | |
| 
 | |
|     movd eax, xmm0 # return the result bits in eax
 | |
|     ret
 | |
| 
 | |
| _rsqrt_hack: # underscore-prefixed alias for the same entry point
 | |
| rsqrt_hack: # hardware estimate plus an input-dependent integer bias (0x47ff or 0x4800), then masked to the top 17 bits; in: edi = input bits; out: eax = result bits
 | |
|     movd xmm9, edi # xmm9 keeps the unmasked input bits for the correction step
 | |
| 
 | |
|     vpand xmm0, xmm9, [rip + top_mask] # xmm0 = input with only bits 31..15 kept
 | |
|     por xmm0, [rip + penultimate_bit] # force bit 15 to 1; +inf becomes a NaN pattern
 | |
| 
 | |
|     # detect NaNs, negatives, zeros, denormals and infinities
 | |
|     vcmpngt_uqss xmm1, xmm0, [rip + min_pos_denorm] # lane 0 = all-ones unless xmm0 > 0x00800000 as a float; unordered (NaN) also yields all-ones
 | |
|     ptest xmm1, xmm1 # ZF=1 only if all 128 bits are zero; lanes 1..3 come from xmm0 and are zero here
 | |
|     jnz rsqrt_full_bad # mask set: take the special-case path, which re-reads the original bits from edi
 | |
| 
 | |
|     # calculate x64 estimate
 | |
|     rsqrtps xmm0, xmm0 # packed form; only lane 0 is returned
 | |
| 
 | |
|     # calculate correction factor
 | |
|     vpslld xmm1, xmm9, 8 # shift out the sign and the upper 7 exponent bits; the exponent's lowest bit lands in bit 31
 | |
|     vpsrad xmm2, xmm1, 31 # xmm2 = all-ones if that exponent bit is set, else zero
 | |
|     paddd xmm1, [rip + magic1] # add 0x60000000 to the shifted bits (wraps modulo 2^32)
 | |
|     pcmpgtd xmm1, [rip + magic2] # xmm1 = all-ones where the sum is greater than 0x3c000000 as a signed integer, else zero
 | |
|     pxor xmm1, xmm2 # flip that mask when the exponent bit was set
 | |
|     movaps xmm2, [rip + magic3] # xmm2 = 0x47ff (aligned 128-bit load)
 | |
|     psubd xmm2, xmm1 # bias = 0x47ff - mask, i.e. 0x47ff or 0x4800; NOTE(review): the derivation of these thresholds is not documented here
 | |
| 
 | |
|     # correct x64 estimate
 | |
|     paddd xmm0, xmm2 # integer add of the bias to the estimate's bits
 | |
|     pand xmm0, [rip + top_mask] # drop everything below bit 15
 | |
| 
 | |
|     movd eax, xmm0 # return the result bits in eax
 | |
|     ret
 |