ARM Memory Copy

  1         MODULE  ARM_MEMORY
  2 
  3         PUBLIC  ARM_MEMCPY
  4         PUBLIC  ARM_MEMSET
  5         PUBLIC  ARM_MEMSET8
  6         PUBLIC  ARM_MEMSET16
  7         PUBLIC  ARM_MEMSET32
  8 
  9         SECTION .text:CODE:NOROOT(2)
 10         CODE32
 11 
 12 ;-------------------------------------------------------------------------------
 13 ; void ARM_MEMCPY(void* pDest, void* pSrc, U32 NumBytes)
 14 ;
 15 ; Function description
 16 ;   Copy NumBytes bytes from pSrc to pDest, working upwards from the lowest address.
 17 ;
 18 ; Register usage:
 19 ;
 20 ;   R0    pDest    (write pointer, advanced by every store)
 21 ;   R1    pSrc     (read pointer, advanced as data is read)
 22 ;   R2    NumBytes (bytes left; biased by -4 / -32 inside the word and block loops)
 23 ;
 24 ;   R3    Used for data transfers
 25 ;   R4    Used for data transfers (pushed/popped around the bulk path)
 26 ;   R12   Used for data transfers
 27 ;   R14   Used for data transfers (pushed/popped around the bulk path)
 28 ;
 29 ;   R13   SP
 30 ;   R14   LR (contains return address)
 31 ;   R15   PC
 32 ;
 33 ;-------------------------------------------------------------------------------
 34 ARM_MEMCPY:
 35 ;-------------------------------------------------------------------------------
 36         cmp         R2, #+3                           ; R2 = NumBytes
 37         bls         ARM_MEMCPY_HandleTrailingBytes    ; Fewer than 4 bytes in total -> single byte transfers only
 38 
 39         ands        R12, R0, #+3                      ; R12 = pDest & 3 (mis-alignment of the destination)
 40         beq         ARM_MEMCPY_DestIsDWordAligned     ; Is destination address already word aligned ?
 41 
 42 ;-------------------------------------------------------------------------------
 43 ; Copy as many bytes as necessary to word align the destination address
 44 ; (4 - R12) bytes are copied: R12 = 1 -> 3 bytes, R12 = 2 -> 2 bytes, R12 = 3 -> 1 byte
 45         ldrb        R3, [R1], #+1                     ; At least one byte is always needed, so load it unconditionally
 46         cmp         R12, #+2                          ; LS: R12 <= 2 (second byte needed), CC: R12 < 2 (third byte needed)
 47         add         R2, R2, R12                       ; NumBytes += mis-alignment (flags untouched) ...
 48         ldrbls      R12, [R1], #+1                    ; Second byte; R12 is free now, its old value lives on in the flags and in R2
 49         strb        R3, [R0], #+1
 50         ldrbcc      R3, [R1], #+1                     ; Third byte
 51         strbls      R12, [R0], #+1
 52         sub         R2, R2, #+4                       ; ... NumBytes -= 4, in total NumBytes -= (4 - mis-alignment)
 53         strbcc      R3, [R0], #+1                     ; Destination address is word aligned from here on
 54 
 55 ;-------------------------------------------------------------------------------
 56 ; Choose best way to transfer data
 57 ;
 58 ARM_MEMCPY_DestIsDWordAligned:
 59         ands        R3, R1, #+3
 60         beq         ARM_MEMCPY_HandleBulkWordData     ; If source and destination are aligned, use bulk word transfer
 61 
 62         subs        R2, R2, #+4
 63         bcc         ARM_MEMCPY_HandleTrailingBytes    ; Less than one complete word left -> byte transfers (the -4 bias keeps bits 1..0 of R2)
 64 
 65         ldr         R12, [R1, -R3]!                   ; Read first mis-aligned data word and word align source address
 66         cmp         R3, #+2
 67         beq         ARM_MEMCPY_Loop16BitShift
 68 
 69         bhi         ARM_MEMCPY_Loop24BitShift
 70 
 71 ;-------------------------------------------------------------------------------
 72 ; Handle data in units of word (R3 = 1 falls through to the 8-bit variant)
 73 ;
 74 ; The source is read as aligned words. Every output word is built from the not
 75 ; yet consumed upper part of the current source word (shifted down) and the
 76 ; lower part of the next source word (shifted up). Whole words are read, so
 77 ; the first/last source word is read in full. NOTE(review): shift directions presumably assume little-endian - confirm
 78 ARM_MEMCPY_Loop8BitShift:
 79         mov         R3, R12, LSR #+8           ; Move the 3 pending bytes to the bottom of the word
 80         ldr         R12, [R1, #+4]!            ; Load next aligned source word (pre-indexed, R1 advances by 4)
 81         subs        R2, R2, #+4                ; Decrement NumBytes; C stays set while another full word is left
 82         orr         R3, R3, R12, LSL #+24      ; Lowest byte of the next word completes the data word
 83         str         R3, [R0], #+4              ; Store complete word
 84         bcs         ARM_MEMCPY_Loop8BitShift
 85 
 86         add         R1, R1, #+1                ; R1 -> first source byte that has not been copied yet
 87         b           ARM_MEMCPY_HandleTrailingBytes         ; Handle trailing bytes
 88 
 89 ARM_MEMCPY_Loop16BitShift:
 90         mov         R3, R12, LSR #+16          ; Move the 2 pending bytes to the bottom of the word
 91         ldr         R12, [R1, #+4]!            ; Load next aligned source word (pre-indexed, R1 advances by 4)
 92         subs        R2, R2, #+4                ; Decrement NumBytes; C stays set while another full word is left
 93         orr         R3, R3, R12, LSL #+16      ; Lower 2 bytes of the next word complete the data word
 94         str         R3, [R0], #+4              ; Store complete word
 95         bcs         ARM_MEMCPY_Loop16BitShift
 96 
 97         add         R1, R1, #+2                ; R1 -> first source byte that has not been copied yet
 98         b           ARM_MEMCPY_HandleTrailingBytes         ; Handle trailing bytes
 99 
100 ARM_MEMCPY_Loop24BitShift:
101         mov         R3, R12, LSR #+24          ; Move the 1 pending byte to the bottom of the word
102         ldr         R12, [R1, #+4]!            ; Load next aligned source word (pre-indexed, R1 advances by 4)
103         subs        R2, R2, #+4                ; Decrement NumBytes; C stays set while another full word is left
104         orr         R3, R3, R12, LSL #+8       ; Lower 3 bytes of the next word complete the data word
105         str         R3, [R0], #+4              ; Store complete word
106         bcs         ARM_MEMCPY_Loop24BitShift
107 
108         add         R1, R1, #+3                ; R1 -> first source byte that has not been copied yet
109         b           ARM_MEMCPY_HandleTrailingBytes         ; Handle trailing bytes
110 
111 ;-------------------------------------------------------------------------------
112 ; Handle large bulk data in blocks of 8 words (32 bytes)
113 ; (source and destination are both word aligned here)
114 ARM_MEMCPY_HandleBulkWordData:
115         subs        R2, R2, #+0x20             ; Bias NumBytes by -32; C clear if less than one block is left
116         stmdb       SP!, {R4, LR}              ; Free R4 and LR for data (does not change the flags)
117         bcc         ARM_MEMCPY_HandleTrailingWords
118 
119 ARM_MEMCPY_LoopHandleBulkWord:
120         ldm         R1!, {R3, R4, R12, LR}     ; Transfer 16 bytes at once
121         stm         R0!, {R3, R4, R12, LR}
122         ldm         R1!, {R3, R4, R12, LR}     ; Transfer 16 bytes at once
123         stm         R0!, {R3, R4, R12, LR}
124         subs        R2, R2, #+0x20             ; C stays set while another full block is left
125         bcs         ARM_MEMCPY_LoopHandleBulkWord
126 
127 ;-------------------------------------------------------------------------------
128 ; Handle trailing 7 words
129 ; (the -32 bias leaves bits 4..0 of NumBytes intact, so they can be tested directly)
130 ARM_MEMCPY_HandleTrailingWords:
131         movs        R12, R2, LSL #28           ; C = bit 4 of NumBytes, N = bit 3 of NumBytes; R12 is only a dummy target
132 
133         ldmcs       R1!, {R3, R4, R12, LR}     ; C flag contain bit 4 of NumBytes (transfer 16 bytes if it is set)
134         stmcs       R0!, {R3, R4, R12, LR}
135         ldmmi       R1!, {R3, R4}              ; N flag contain bit 3 of NumBytes (transfer 8 bytes if it is set)
136         stmmi       R0!, {R3, R4}
137 
138         movs        R12, R2, LSL #+30          ; C = bit 2 of NumBytes, Z = 1 if bits 1..0 are both zero
139 
140         ldmia       SP!, {R4, LR}              ; Restore R4 and the return address (does not change the flags)
141         ldrcs       R3, [R1], #+4              ; C flag contain bit 2 of NumBytes (transfer 4 bytes if it is set)
142         strcs       R3, [R0], #+4
143         bxeq        LR                         ; No trailing bytes left -> done
144 
145 ;-------------------------------------------------------------------------------
146 ; Handle trailing 3 bytes
147 ;
148 ; CPSR layout: N Z C V Q  ***** I F T M4 3 2 1 0
149 ; N = bit[31] of the result
150 ; C = last bit shifted out (shift operations)
151 ; C = 1 after ADD/CMN that produced a carry
152 ; C = 0 after SUB/CMP that needed a borrow (C = 1 means no borrow)
153 ; xxxxxxxxxxxxxxxxxxxx10 << 31 : N=0, C=1
154 ; xxxxxxxxxxxxxxxxxxxx01 << 31 : N=1, C=0
155 ; BMI : N=1
156 ; BCS : C=1
157 ARM_MEMCPY_HandleTrailingBytes:
158         movs        R2, R2, LSL #+31           ; N = bit 0 of NumBytes, C = bit 1 of NumBytes
159 
160         ldrbmi      R2, [R1], #+1              ; N flag contain bit 0 of NumBytes (transfer 1 byte if it is set); R2 is reused for data
161         ldrbcs      R3, [R1], #+1              ; C flag contain bit 1 of NumBytes (transfer 2 bytes if it is set)
162         ldrbcs      R12, [R1], #+1
163         strbmi      R2, [R0], #+1
164         strbcs      R3, [R0], #+1
165         strbcs      R12, [R0], #+1
166         bx          LR
167 
168 
169 ;-------------------------------------------------------------------------------
170 ; void ARM_MEMSET(void* pDest, U32 c, U32 NumBytes)
171 ;
172 ; Function description
173 ;   Fill NumBytes bytes of memory starting at pDest with the byte value c.
174 ;
175 ; Register usage:
176 ;
177 ;   R0    pDest    (write pointer, advanced by every store)
178 ;   R1    c        (replicated into all four byte lanes on entry, not masked)
179 ;   R2    NumBytes
180 ;
181 ;   R3    Used for data transfers
182 ;   R4    Used for data transfers
183 ;   R5    Used for data transfers
184 ;   R6    Used for data transfers (dummy target of the flag-setting shifts)
185 ;   NOTE(review): presumably callers pass c in the range 0..255 - confirm
186 ;   R13   SP
187 ;   R14   LR (contains return address)
188 ;   R15   PC
189 ;
190 ;-------------------------------------------------------------------------------
191 ARM_MEMSET:
192 ;-------------------------------------------------------------------------------
193         orr         R1, R1, R1, LSL #+8
194         orr         R1, R1, R1, LSL #+16
195 
196         cmp         R2, #+3                           ; R2 = NumBytes
197         bls         ARM_MEMSET_HandleTrailingBytes    ; If we have less than one complete word, use single byte transfer
198 
199         ands        R3, R0, #+3                       ; R3 = pDest & 3 (mis-alignment of the destination)
200         beq         ARM_MEMSET_DestIsAligned          ; Is destination address already word aligned ?
201 
202 ; Store as many bytes as necessary to word align the destination address
203 ; (4 - R3) bytes are stored: R3 = 1 -> 3 bytes, R3 = 2 -> 2 bytes, R3 = 3 -> 1 byte
204         strb        R1, [R0], #+1              ; At least one byte is always needed, so store it unconditionally
205         cmp         R3, #+2                    ; LS: R3 <= 2 (second byte needed), CC: R3 < 2 (third byte needed)
206         add         R2, R2, R3                 ; NumBytes += mis-alignment (flags untouched) ...
207         strbls      R1, [R0], #+1              ; Lower or same (LS)? -> second byte
208         sub         R2, R2, #+4                ; ... NumBytes -= 4, in total NumBytes -= (4 - mis-alignment)
209         strbcc      R1, [R0], #+1              ; Carry clear (CC)? -> We need one more byte
210 
211 ; Choose best way to transfer data
212 
213 ARM_MEMSET_DestIsAligned:                      ; destination is aligned, use bulk word transfer
214 
215 ; Handle large bulk data in blocks of 8 words (32 bytes)
216 
217 ARM_MEMSET_HandleBulkWordData:
218         stmdb       SP!, {R4, R5, R6}          ; Save the registers used in addition to R1/R3
219 
220         mov         R3, R1, LSL #+0           ; R3, R4, R5 = copies of the fill word, so one STM writes 16 bytes
221         mov         R4, R1, LSL #+0
222         mov         R5, R1, LSL #+0
223 
224         subs        R2, R2, #+0x20             ; 32 Bytes = 8 DWords; C clear if less than one block is left
225         bcc         ARM_MEMSET_HandleTrailingWords
226 
227 ARM_MEMSET_LoopHandleBulkWord:
228         stm         R0!, {R1, R3, R4, R5}      ; 16 bytes
229         stm         R0!, {R1, R3, R4, R5}      ; 16 bytes
230         subs        R2, R2, #+0x20             ; C stays set while another full block is left
231         bcs         ARM_MEMSET_LoopHandleBulkWord
232 
233 
234 ; Handle trailing 7 words (the -32 bias leaves bits 4..0 of NumBytes intact)
235 
236 ARM_MEMSET_HandleTrailingWords:
237         movs        R6, R2, LSL #28            ; C = bit 4 of NumBytes, N = bit 3 of NumBytes; R6 is only a dummy target
238         stmcs       R0!, {R1, R3, R4, R5}      ; C flag contain bit 4 of NumBytes (transfer 16 bytes if it is set)
239         stmmi       R0!, {R1, R3}              ; N flag contain bit 3 of NumBytes (transfer 8 bytes if it is set)
240 
241         movs        R6, R2, LSL #+30           ; C = bit 2 of NumBytes, Z = 1 if bits 1..0 are both zero
242         strcs       R1, [R0], #+4              ; C flag contain bit 2 of NumBytes (transfer 4 bytes if it is set)
243 
244         ldmia       SP!, {R4, R5, R6}          ; Restore saved registers (does not change the flags)
245         bxeq        LR                         ; Z set: no trailing bytes left -> done
246 
247 
248 ; Handle trailing 3 bytes
249 
250 ARM_MEMSET_HandleTrailingBytes:
251         movs        R2, R2, LSL #+31           ; N = bit 0 of NumBytes, C = bit 1 of NumBytes
252         strbmi      R1, [R0], #+1              ; N flag contain bit 0 of NumBytes (transfer 1 byte if it is set)
253         strbcs      R1, [R0], #+1              ; C flag contain bit 1 of NumBytes (transfer 2 bytes if it is set)
254         strbcs      R1, [R0], #+1
255         bx          LR
256 
257 
258 ; int ARM_MEMSET8(void* pDest, U32 c, U32 NumBytes);
259 ;-------------------------------------------------------------------------------
260 ARM_MEMSET8:
261 ;   In: R0 = pDest, R1 = c, R2 = NumBytes.  Data registers: R1, R3, R4, R5.
262         stmfd       SP!, {R4, R5}               ; R4/R5 are restored before returning
263         cmp         R2, #4
264         blt         ARM_MEMSET8_TailBytes       ; Fewer than 4 bytes (signed compare): byte stores only
265 
266         ; Odd address: one byte store brings pDest to a 16-bit boundary
267         tst         R0, #1
268         subne       R2, R2, #1
269         strbne      R1, [R0], #1
270 
271         ; Widen 'c' to 16 bits; one half-word store brings pDest to a 32-bit boundary
272         tst         R0, #2
273         orr         R1, R1, R1, LSL #8          ; ORR without S keeps the flags of the TST
274         subne       R2, R2, #2
275         strhne      R1, [R0], #2
276 
277         ; pDest is 32-bit aligned: widen 'c' to 32 bits, R3 mirrors it for 2-word stores
278         orr         R1, R1, R1, LSL #16
279         cmp         R2, #16
280         mov         R3, R1                      ; MOV without S keeps the flags of the CMP
281         blt         ARM_MEMSET8_TailWords       ; Fewer than 16 bytes left: skip the 128-bit alignment
282         tst         R0, #4                      ; One word to reach a 64-bit boundary
283         subne       R2, R2, #4
284         strne       R1, [R0], #4
285         tst         R0, #8                      ; Two words to reach a 128-bit boundary
286         subne       R2, R2, #8
287         stmne       R0!, {R1, R3}
288 
289         ; pDest is 128-bit aligned: two more mirrors of the fill word
290         mov         R5, R1
291         mov         R4, R1
292 ARM_MEMSET8_Fill16:
293         ; 16 bytes per pass; store and branch are skipped once the count drops below zero
294         subs        R2, R2, #16
295         stmge       R0!, {R1, R3, R4, R5}
296         bge         ARM_MEMSET8_Fill16
297         add         R2, R2, #16                 ; Take back the final subtraction
298 
299 ARM_MEMSET8_TailWords:
300         ; Bit 3 of the count selects 8 more bytes, bit 2 selects 4 more bytes
301         tst         R2, #8
302         stmne       R0!, {R1, R3}
303         tst         R2, #4
304         strne       R1, [R0], #4
305         and         R2, R2, #3                  ; 0..3 bytes remain
306 
307 ARM_MEMSET8_TailBytes:
308         ; At most 3 single bytes; each store happens only while the count is still >= 0
309         subs        R2, R2, #1
310         strbge      R1, [R0], #1
311         subs        R2, R2, #1
312         strbge      R1, [R0], #1
313         subs        R2, R2, #1
314         strbge      R1, [R0], #1
315         ldmfd       SP!, {R4, R5}
316         bx          LR
317 
318 ; int ARM_MEMSET16(void* pDest, U32 c, U32 NumHalfWords);
319 ;-------------------------------------------------------------------------------
320 ARM_MEMSET16:
321 ;   In: R0 = pDest, R1 = c, R2 = NumHalfWords.  Data registers: R1, R3, R4, R5.
322         stmfd       SP!, {R4, R5}               ; R4/R5 are restored before returning
323 
324         cmp         R2, #2
325         blt         ARM_MEMSET16_LastHalfWord   ; 0 or 1 half-word (signed compare)
326 
327         ; Address bit 1 set: one half-word store brings pDest to a 32-bit boundary
328         tst         R0, #2
329         subne       R2, R2, #1
330         strhne      R1, [R0], #2
331 
332         ; pDest is 32-bit aligned: widen 'c' to 32 bits, R4 mirrors it for 2-word stores
333         orr         R1, R1, R1, LSL #16
334         cmp         R2, #8
335         mov         R4, R1                      ; MOV without S keeps the flags of the CMP
336         blt         ARM_MEMSET16_TailWords      ; 0..7 half-words left: skip the 128-bit alignment
337 
338         ; One word (2 half-words) to reach a 64-bit boundary
339         tst         R0, #4
340         subne       R2, R2, #2
341         strne       R1, [R0], #4
342 
343         ; Two words (4 half-words) to reach a 128-bit boundary
344         tst         R0, #8
345         subne       R2, R2, #4
346         stmne       R0!, {R1, R4}
347 
348         ; pDest is 128-bit aligned: two more mirrors of the fill word
349         mov         R3, R1
350         mov         R5, R1
351 
352 ARM_MEMSET16_Fill8:
353         ; 8 half-words (16 bytes) per pass; store and branch are skipped once the count drops below zero
354         subs        R2, R2, #8
355         stmge       R0!, {R1, R3, R4, R5}
356         bge         ARM_MEMSET16_Fill8
357         add         R2, R2, #8                  ; Take back the final subtraction
358 
359 ARM_MEMSET16_TailWords:
360         ; Bit 2 of the count selects 4 more half-words, bit 1 selects 2 more half-words
361         tst         R2, #4
362         stmne       R0!, {R1, R4}
363 
364         tst         R2, #2
365         strne       R1, [R0], #4
366 
367         and         R2, R2, #1                  ; 0 or 1 half-word remains
368 
369 ARM_MEMSET16_LastHalfWord:
370         ; The store happens only while the count is still >= 0 after the decrement
371         subs        R2, R2, #1
372         strhge      R1, [R0], #2
373 
374         ; Restore the caller's R4/R5 and return
375         ldmfd       SP!, {R4, R5}
376         bx          LR
377 
378 
379 ; int ARM_MEMSET32(void* pDest, U32 c, U32 NumWords);
380 ;-------------------------------------------------------------------------------
381 ARM_MEMSET32:
382 ;   In: R0 = pDest, R1 = c, R2 = NumWords.  Data registers: R1, R3, R4, R5.
383         stmfd       SP!, {R4, R5}               ; R4/R5 are restored before returning
384 
385         cmp         R2, #4
386         blt         ARM_MEMSET32_TailWords      ; 0..3 words (signed compare): single word stores only
387 
388         ; pDest is expected to be 32-bit aligned; R3 mirrors the fill word for 2-word stores
389         mov         R3, R1
390 
391         tst         R0, #4                      ; One word to reach a 64-bit boundary
392         subne       R2, R2, #1
393         strne       R1, [R0], #4
394 
395         ; Two words to reach a 128-bit boundary
396         tst         R0, #8
397         subne       R2, R2, #2
398         stmne       R0!, {R1, R3}
399 
400         ; pDest is 128-bit aligned: two more mirrors of the fill word
401         mov         R5, R1
402         mov         R4, R1
403 ARM_MEMSET32_Fill4:
404         ; 4 words per pass; store and branch are skipped once the count drops below zero
405         subs        R2, R2, #4
406         stmge       R0!, {R1, R3, R4, R5}
407         bge         ARM_MEMSET32_Fill4
408         add         R2, R2, #4                  ; Take back the final subtraction
409 
410 ARM_MEMSET32_TailWords:
411         ; At most 3 single words; each store happens only while the count is still >= 0
412         subs        R2, R2, #1
413         strge       R1, [R0], #4
414         subs        R2, R2, #1
415         strge       R1, [R0], #4
416         subs        R2, R2, #1
417         strge       R1, [R0], #4
418 
419         ldmfd       SP!, {R4, R5}
420         bx          LR
421 
422 ;-__arm void arm_memxor(void* pDest, U32 c, U32 NumBytes);
423 ;                           r0         r1     r2
424 ;-------------------------------------------------------------------------------
425 arm_memxor:
426 ; XOR each of the NumBytes bytes at pDest with c (c is replicated to 32 bit, not masked)
427         orr         R1, R1, R1, LSL #+8
428         orr         R1, R1, R1, LSL #+16        ; R1 = c in all four byte lanes
429 
430         cmp         R2, #+3                     ; R2 = NumBytes
431         bls         arm_memxor_HandleTrailingBytes        ; If we have less than one complete word, use single byte transfer
432 
433         ands        R3, R0, #+3                 ; R3 = pDest & 3 (mis-alignment of the destination)
434         beq         arm_memxor_DestIsAligned              ; Is destination address already word aligned ?
435 
436 ;-
437 ; XOR (4 - R3) bytes so that the destination address becomes word aligned
438 ;-
439         ldrb        R12, [R0]                   ; At least one byte is always needed, so handle it unconditionally
440         eor         R12, R12, r1
441         strb        R12, [R0], #+1
442 
443         cmp         R3, #+2                    ; LS: R3 <= 2 (second byte needed), CC: R3 < 2 (third byte needed)
444         add         R2, R2, R3                 ; NumBytes += mis-alignment (flags untouched) ...
445 
446         ldrbls      R3, [R0]                   ; Second byte; R3 is free now, its old value lives on in the flags and in R2
447         eorls       R3, R3, r1
448         strbls      R3, [R0], #+1             ; Lower or same (LS)? -> We need one or two bytes to the next word aligned address
449 
450         sub         R2, R2, #+4                ; ... NumBytes -= 4, in total NumBytes -= (4 - mis-alignment)
451 
452         ldrbcc      R3, [R0]                   ; Third byte
453         eorcc       R3, R3, r1
454         strbcc      R3, [R0], #+1              ; Carry clear (CC)? -> We need one more byte
455 
456 ;-
457 ; Choose best way to transfer data
458 ;-
459 arm_memxor_DestIsAligned:                                  ; destination is aligned, use bulk word transfer
460 ;-
461 ; Handle large bulk data in blocks of 8 words (32 bytes)
462 ;-
463 arm_memxor_HandleBulkWordData:
464         stmdb       SP!, {R4, R5, R6, R7}      ; Save the registers used in addition to R1/R3
465 
466         subs        R2, R2, #+0x20             ; 32 Bytes = 8 DWords; C clear if less than one block is left
467         bcc         arm_memxor_HandleTrailingWords
468 
469 arm_memxor_LoopHandleBulkWord:
470         ldm         R0,  {R3, R4, R5, R6}      ; Read 16 bytes (no write-back, the STM below advances R0)
471         eor         r3, r3, r1
472         eor         r4, r4, r1
473         eor         r5, r5, r1
474         eor         r6, r6, r1
475         stm         R0!, {R3, R4, R5, R6}
476 
477         ldm         R0,  {R3, R4, R5, R6}      ; Next 16 bytes
478         eor         r3, r3, r1
479         eor         r4, r4, r1
480         eor         r5, r5, r1
481         eor         r6, r6, r1
482         stm         R0!, {R3, R4, R5, R6}
483 
484         subs        R2, R2, #+0x20             ; C stays set while another full block is left
485         bcs         arm_memxor_LoopHandleBulkWord
486 
487 ;-
488 ; Handle trailing 7 words (the -32 bias leaves bits 4..0 of NumBytes intact)
489 ;-
490 arm_memxor_HandleTrailingWords:
491         movs        R7, R2, LSL #28             ; C = bit 4 of NumBytes, N = bit 3 of NumBytes; R7 is only a dummy target
492 
493         ldmcs       R0,  {R3, R4, R5, R6}
494         eorcs       r3, r3, r1
495         eorcs       r4, r4, r1
496         eorcs       r5, r5, r1
497         eorcs       r6, r6, r1
498         stmcs       R0!, {R3, R4, R5, R6}       ; C flag contain bit 4 of NumBytes (XOR 16 bytes if it is set)
499 
500         ldmmi       R0,  {R3, R4}
501         eormi       r3, r3, r1
502         eormi       r4, r4, r1
503         stmmi       R0!, {R3, R4}                ; N flag contain bit 3 of NumBytes (XOR 8 bytes if it is set)
504 
505         movs        R7, R2, LSL #+30            ; C = bit 2 of NumBytes, Z = 1 if bits 1..0 are both zero
506 
507         ldrcs       R3, [R0]
508         eorcs       r3, r3, r1
509         strcs       R3, [R0], #+4              ; C flag contain bit 2 of NumBytes (XOR 4 bytes if it is set)
510 
511         ldmia       SP!, {R4, R5, R6, R7}      ; Restore saved registers (does not change the flags)
512         bxeq        LR                          ; Z set: no trailing bytes left -> done
513 
514 ;-
515 ; Handle trailing 3 bytes (byte loads only: R0 may be unaligned here and must not be read past the buffer)
516 ;-
517 arm_memxor_HandleTrailingBytes:
518         movs        R2, R2, LSL #+31           ; N = bit 0 of NumBytes, C = bit 1 of NumBytes; R2 is reused for data below
519 
520         ldrbmi      R2, [R0]
521         eormi       R2, R2, r1
522         strbmi      R2, [R0], #+1              ; N flag contain bit 0 of NumBytes (XOR 1 byte if it is set)
523 
524         ldrbcs      R2, [R0]
525         eorcs       R2, R2, r1
526         strbcs      R2, [R0], #+1              ; C flag contain bit 1 of NumBytes (XOR 2 bytes if it is set)
527 
528         ldrbcs      R2, [R0]
529         eorcs       R2, R2, r1
530         strbcs      R2, [R0], #+1              ; Second byte of the pair selected by bit 1
531 
532         bx          LR
533 
534 ;-__arm int arm_memxor8(void* pDest, U32 c, U32 NumBytes);
535 ;                           r0         r1     r2
536 ;-------------------------------------------------------------------------------
537 arm_memxor8:
538 ; XOR each of the NumBytes bytes at pDest with c (c is replicated to 32 bit, not masked)
539         stmdb       SP!, {R4, R5, R6}          ; R4..R6 are used as data registers
540 
541         orr         R1, R1, R1, LSL #+8
542         orr         R1, R1, R1, LSL #+16       ; R1 = c in all four byte lanes
543 
544         cmp         R2, #4
545         blt         arm_memxor8_loop3          ; Fewer than 4 bytes (signed compare): byte accesses only
546 
547         ; Alignment is unknown: an odd address needs one byte to become 16-bit aligned
548         tst         R0, #1
549 
550         ldrneb      R6, [R0]
551         eorne       R6, r6, R1
552         strneb      R6, [R0], #1
553 
554         subne       R2, R2, #1
555 
556         ; Now we are 16-bit aligned: one half-word makes us 32-bit aligned
557         tst         R0, #2
558 
559         ldrneh      R6, [R0]
560         eorne       R6, r6, R1
561         strneh      R6, [R0], #2               ; STRH stores the low 16 bits only
562 
563         subne       R2, R2, #2
564 
565         ; Now we are 32-bit aligned
566         cmp         R2, #16
567         blt         arm_memxor8_loop2          ; Fewer than 16 bytes left: skip the 128-bit alignment
568         tst         R0, #4
569 
570         ldrne       R6, [R0]
571         eorne       R6, r6, R1
572         strne       R6, [R0], #4
573         ; Now we are 64-bit aligned
574         subne       R2, R2, #4
575         tst         R0, #8
576 
577         ldmneia     R0, {R3, R6}
578         eorne       R3, r3, R1
579         eorne       R6, r6, R1
580         stmneia     R0!, {R3, R6}
581 
582         subne       R2, R2, #8
583 
584         ; Now we are 128-bit aligned
585         ; (R3..R6 need no pre-load: they are always read from memory
586         ;  before they are used)
587 arm_memxor8_loop1:
588         ; XOR 4 32-bit values per loop iteration
589         subs        R2, R2, #16
590 
591         ldmgeia     R0,  {R3, R4, R5, R6}
592         eorge       r3, r3, r1
593         eorge       r4, r4, r1
594         eorge       r5, r5, r1
595         eorge       r6, r6, r1
596         stmgeia     R0!, {R3, R4, R5, R6}
597 
598         bge         arm_memxor8_loop1
599         add         R2, R2, #16                ; Take back the final subtraction
600 
601 arm_memxor8_loop2:
602         ; XOR up to 3 remaining 32-bit values (bit 3 of the count: 2 words, bit 2: 1 word)
603         tst         R2, #8
604 
605         ldmneia     R0, {R3, R4}
606         eorne       r3, r3, r1
607         eorne       r4, r4, r1
608         stmneia     R0!, {R3, R4}
609 
610         tst         R2, #4
611 
612         ldrne       R3, [R0]
613         eorne       r3, r3, r1
614         strne       R3, [R0], #4
615 
616         and         R2, R2, #3                 ; 0..3 bytes remain
617 
618 arm_memxor8_loop3:
619         ; XOR up to 3 remaining bytes; every store writes the XOR result held in R3
620         subs        R2, R2, #1
621 
622         ldrgeb      R3, [R0]
623         eorge       r3, r3, r1
624         strgeb      R3, [R0], #1
625 
626         subs        R2, R2, #1
627 
628         ldrgeb      R3, [R0]
629         eorge       r3, r3, r1
630         strgeb      R3, [R0], #1
631 
632         subs        R2, R2, #1
633 
634         ldrgeb      R3, [R0]
635         eorge       r3, r3, r1
636         strgeb      R3, [R0], #1
637 
638         ldmia       SP!, {R4, R5, R6}
639         bx          LR
640 
641 ;-__arm int arm_memxor16(void* pDest, U32 c, U32 NumHalfWords);
642 ;                           r0         r1     r2
643 ;-------------------------------------------------------------------------------
644 arm_memxor16:
645 ; XOR each of the NumHalfWords half-words at pDest with c (c is replicated to 32 bit, not masked)
646         stmdb       SP!, {R4, R5, R6}         ; R4..R6 are used as data registers
647         orr         R1, R1, R1, LSL #+16      ; R1 = c in both half-word lanes
648 
649         cmp         R2, #2
650         blt         arm_memxor16_HandleTrailingHalfWord    ; 1 or 0
651 
652         ; Alignment is expected to be at least 16-bit
653         tst         R0, #2
654 
655         ldrneh      R6, [R0]
656         eorne       R6, r6, R1
657         strneh      R6, [R0], #2              ; xxxx-xx10 --->
658 
659         subne       R2, R2, #1                ; xxxx-xx00
660 
661         ; Now we are 32-bit aligned
662         cmp         R2, #8
663         blt         arm_memxor16_HandleTrailingWords       ; 7, 6, ... 0
664 
665         tst         R0, #4
666 
667         ldrne       R3, [R0]
668         eorne       r3, r3, r1
669         strne       R3, [R0], #4              ; xxxx-x100 --->
670 
671 
672         subne       R2, R2, #2                ; xxxx-x000 --->
673 
674         ; Now we are 64-bit aligned
675         tst         R0, #8
676 
677         ldmneia     R0, {R3, R4}
678         eorne       r3, r3, r1
679         eorne       r4, r4, r1
680         stmneia     R0!, {R3, R4}             ; xxxx-1000 --->
681 
682         subne       R2, R2, #4                ; xxxx-0000 --->
683 
684 arm_memxor16_HandleBulkWordData:
685         ; Now we are 128-bit aligned
686         ; (R3..R6 need no pre-load: they are always read from memory
687         ;  before they are used)
688 
689 arm_memxor16_LoopHandleBulkWord:
690         ; XOR 4 32-bit values (8 half-words) per loop iteration
691         subs        R2, R2, #8
692 
693         ldmgeia     R0,  {R3, R4, R5, R6}
694         eorge       r3, r3, r1
695         eorge       r4, r4, r1
696         eorge       r5, r5, r1
697         eorge       r6, r6, r1
698         stmgeia     R0!, {R3, R4, R5, R6}
699 
700         bge         arm_memxor16_LoopHandleBulkWord
701         add         R2, R2, #8                ; Take back the final subtraction
702 
703 arm_memxor16_HandleTrailingWords:
704         ; XOR up to 3 remaining 32-bit values (bit 2 of the count: 2 words, bit 1: 1 word)
705         tst         R2, #4
706 
707         ldmneia     R0, {R3, R4}
708         eorne       r3, r3, r1
709         eorne       r4, r4, r1
710         stmneia     R0!, {R3, R4}
711 
712         tst         R2, #2
713 
714         ldrne       R3, [R0]
715         eorne       r3, r3, r1
716         strne       R3, [R0], #4
717 
718         and         R2, R2, #1                ; 0 or 1 half-word remains
719 
720 arm_memxor16_HandleTrailingHalfWord:
721         ; XOR up to 1 remaining 16-bit value
722         subs        R2, R2, #1
723 
724         ldrgeh      R3, [R0]
725         eorge       r3, r3, r1
726         strgeh      R3, [R0], #2              ; STRH stores the low 16 bits only
727 
728         ldmia       SP!, {R4, R5, R6}
729         bx          LR
730 
731 
732 ;-__arm int arm_memxor32(void* pDest, U32 c, U32 NumWords);
733 ;                           r0         r1     r2
734 ;-------------------------------------------------------------------------------
735 arm_memxor32:
736 ; XOR each of the NumWords 32-bit words at pDest with c
737         stmdb       SP!, {R4, R5, R6}         ; R4..R6 are used as data registers
738 
739         cmp         R2, #4
740         blt         arm_memxor32_loop2        ; Fewer than 4 words (signed compare): single word accesses only
741 
742         ; Alignment is expected to be at least 32-bit, is it 64-bit aligned ?
743         tst         R0, #4
744         ; No, it is 32-bit aligned
745         ldrne       R3, [R0]
746         eorne       R3, r3, R1
747         strne       R3, [R0], #4
748         subne       R2, R2, #1
749 
750         ; Now we are 64-bit aligned, is it 128-bit aligned ?
751         tst         R0, #8
752         ; No, it is 64-bit aligned
753         ldmneia     R0, {R3, R4}
754         eorne       r3, r3, r1
755         eorne       r4, r4, r1
756         stmneia     R0!, {R3, R4}             ; xxxx-1000 --->
757         subne       R2, R2, #2
758 
759         ; Now we are 128-bit aligned
760         ; (R3..R6 need no pre-load: they are always read from memory
761         ;  before they are used)
762 arm_memxor32_loop1:
763         ; XOR 4 32-bit values per loop iteration
764         subs        R2, R2, #4
765 
766         ldmgeia     R0,  {R3, R4, R5, R6}
767         eorge       r3, r3, r1
768         eorge       r4, r4, r1
769         eorge       r5, r5, r1
770         eorge       r6, r6, r1
771         stmgeia     R0!, {R3, R4, R5, R6}
772 
773         bge         arm_memxor32_loop1
774         add         R2, R2, #4                ; Take back the final subtraction
775 
776 arm_memxor32_loop2:
777         ; XOR up to 3 remaining 32-bit values; each access happens only while the count is still >= 0
778 
779         subs        R2, R2, #1
780         ldrge       R3, [R0]
781         eorge       r3, r3, r1
782         strge       R3, [R0], #4
783 
784         subs        R2, R2, #1
785         ldrge       R3, [R0]
786         eorge       r3, r3, r1
787         strge       R3, [R0], #4
788 
789         subs        R2, R2, #1
790         ldrge       R3, [R0]
791         eorge       r3, r3, r1
792         strge       R3, [R0], #4
793 
794         ldmia       SP!, {R4, R5, R6}
795         bx          LR
796 
797 
798         END

 

posted @ 2015-07-16 14:55  IAmAProgrammer  阅读(1255)  评论(0编辑  收藏  举报