-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathsdriver.yo
More file actions
277 lines (277 loc) · 15.5 KB
/
sdriver.yo
File metadata and controls
277 lines (277 loc) · 15.5 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
| #######################################################################
| # Test for copying block of size 4;
| #######################################################################
0x000: | .pos 0
0x000: 30f48004000000000000 | main: irmovq Stack, %rsp # Set up stack pointer
|
| # Set up arguments for copy function and then invoke it
0x00a: 30f20400000000000000 | irmovq $4, %rdx # src and dst have 4 elements
0x014: 30f6d803000000000000 | irmovq dest, %rsi # dst array
0x01e: 30f7a803000000000000 | irmovq src, %rdi # src array
0x028: 803200000000000000 | call ncopy
0x031: 00 | halt # should halt with num nonzeros in %rax
0x032: | StartFun:
| # Author:李一凡 ID=2100012520@stu.pku.edu.cn
| # 优化思路1:将原来的“先把常数移到寄存器中再做运算”的方法改成使用iaddq实现,减少冗余;(13.70)
| # 优化思路2:将Always Taken的分支预测方法改成BTFNT, 提高预测准确性。不过由于现在程序中分支并不多,所以这条方法没有很大改进;(13.63)
| # 优化点:Y86-64处理器默认寄存器存储的是0,所以一开始的那个异或是没必要的;(13.56)
| # 优化点:在mrmovq和rmmovq之间插入无关痛痒的%rdi+=8,因为后一个mov在前一个访存之后才能接收到数据转发,而这造成了一个阶段的暂停。把第一遍改漏的subq也换成iaddq;(11.56)
| # 优化点:每次对len减一的时候总是andq判断一下再跳转,实际上跳转之前只要没有其他运算就可以了;(10.56)
| # 优化思路3:按len-=2展开循环。一开始先把len-2,判断有没有资格进入设计出来的Loop_double。每次从Loop_double出来都要先-2,判断有没有再次进入Loop_double的资格。
| # 每次发现没有进入的资格就直接跳到Once 里面。由于%rdx比2小但是每次只是减去2,所以这个部分会判断%rdx是0(直接返回到Done)还是1(按照Once中的语句再ncopy一次);(8.73)
| # 优化思路4:按len-=3展开循环。与思路3类似,只是在最后跳出的判断出有差异。我选择先把由于小于3而不能进入下一次Loop_trible的%rdx先加上2,这样原来的0是负的,1是equal,2是正的。然后跳转到各自的处理模块中即可;(8.12)
| # 优化思路5:按len-=4展开循环。与上述过程类似,在第一个mrmov的地方利用bubble多复制一个,之后滚雪球。最后判断余数的时候,先加上3,再看大于或小于。等于只用执行一次。如果执行3次,先把最后一个复制过去,剩下两个与复制两次的用同一片代码,便于之后展开规模更大时不超字节限制;(7.90)
| # 优化思路6:按len-=6展开循环。处理余数的时候没什么技巧,只是加了一些数然后分成-1,0和从1开始的正数。如果不能完全分开就减一些数,仍然把他们分成-1,0,,一些正数......
| # 目前ncopy.yo的字节数637, 最后处理余数的环节出现了不少bubble;(7.76)
| # 优化思路7:按len-=8展开,与之前相似,不再赘述。由于没有找到合适的方法处理最后余数部分的的bubble,对于前面数据规模较小的测试来说反而是负优化;(7.75)
| # 优化思路8:利用书后题4.57提到的加载转发,合理优化mrmov和rmmov连起来的余数处理的部分;(7.63) (同样处理但是AT分支预测CPE=7.71)
|
|
| #/* $begin ncopy-ys */
| ##################################################################
| # ncopy.ys - Copy a src block of len words to dst.
| # Return the number of positive words (>0) contained in src.
| #
| # Include your name and ID here.
| #
| # Describe how and why you modified the baseline code.
| #
| ##################################################################
| # Do not modify this portion
| # Function prologue.
| # %rdi = src, %rsi = dst, %rdx = len
0x032: | ncopy:
|
| ##################################################################
| # You can modify this portion
| # Loop header
| # xorq %rax,%rax # count = 0;
|
| # 设法判断运行时环境:
0x032: 62cc | andq %r12, %r12
0x034: 734403000000000000 | je naive_ncopy
|
0x03d: c0f2f7ffffffffffffff | iaddq $-9, %rdx
0x047: 729701000000000000 | jl Last
|
|
0x050: | Loop_nine:
0x050: 50a70000000000000000 | mrmovq (%rdi), %r10 # read val from src...
0x05a: 50b70800000000000000 | mrmovq 8(%rdi), %r11 # Destory this bubble.
0x064: 40a60000000000000000 | rmmovq %r10, (%rsi) # ...and store it to dst
0x06e: 62aa | andq %r10, %r10 # val <= 0?
| # jle L1 # if so, goto L1 (Across iaddq):
0x070: c0f00100000000000000 | iaddq $1, %rax # irmovq $1, %r10
| # addq %r10, %rax # count++
0x07a: | L1:
0x07a: 50a71000000000000000 | mrmovq 16(%rdi), %r10
0x084: 40b60800000000000000 | rmmovq %r11, 8(%rsi)
0x08e: 62bb | andq %r11, %r11
| # jle L2
0x090: c0f00100000000000000 | iaddq $1, %rax
|
0x09a: | L2:
0x09a: 50b71800000000000000 | mrmovq 24(%rdi), %r11
0x0a4: 40a61000000000000000 | rmmovq %r10, 16(%rsi)
0x0ae: 62aa | andq %r10, %r10
| # jle L3
0x0b0: c0f00100000000000000 | iaddq $1, %rax
0x0ba: | L3:
0x0ba: 50a72000000000000000 | mrmovq 32(%rdi), %r10
0x0c4: 40b61800000000000000 | rmmovq %r11, 24(%rsi)
0x0ce: 62bb | andq %r11, %r11
| # jle L4
0x0d0: c0f00100000000000000 | iaddq $1, %rax
0x0da: | L4:
0x0da: 50b72800000000000000 | mrmovq 40(%rdi), %r11
0x0e4: 40a62000000000000000 | rmmovq %r10, 32(%rsi)
0x0ee: 62aa | andq %r10, %r10
| # jle L5
0x0f0: c0f00100000000000000 | iaddq $1, %rax
0x0fa: | L5:
0x0fa: 50a73000000000000000 | mrmovq 48(%rdi), %r10
0x104: 40b62800000000000000 | rmmovq %r11, 40(%rsi)
0x10e: 62bb | andq %r11, %r11
| # jle L6
0x110: c0f00100000000000000 | iaddq $1, %rax
0x11a: | L6:
0x11a: 50b73800000000000000 | mrmovq 56(%rdi), %r11
0x124: 40a63000000000000000 | rmmovq %r10, 48(%rsi)
0x12e: 62aa | andq %r10, %r10
| # jle L7
0x130: c0f00100000000000000 | iaddq $1, %rax
0x13a: | L7:
0x13a: 50a74000000000000000 | mrmovq 64(%rdi), %r10
0x144: 40b63800000000000000 | rmmovq %r11, 56(%rsi)
0x14e: 62bb | andq %r11, %r11
| # jle L8
0x150: c0f00100000000000000 | iaddq $1, %rax
0x15a: | L8:
0x15a: 40a64000000000000000 | rmmovq %r10, 64(%rsi)
0x164: 62aa | andq %r10, %r10
| # jle L9
0x166: c0f00100000000000000 | iaddq $1, %rax
|
0x170: | L9:
0x170: c0f74800000000000000 | iaddq $72, %rdi # src += 9
0x17a: c0f64800000000000000 | iaddq $72, %rsi # dst += 9
0x184: c0f2f7ffffffffffffff | iaddq $-9, %rdx
0x18e: 755000000000000000 | jge Loop_nine # if %rdx >= 9, goto Loop again:
|
0x197: | Last:
0x197: c0f20800000000000000 | iaddq $8, %rdx
0x1a1: 72a103000000000000 | jl Done
0x1aa: 76dd01000000000000 | jg Judge_remain # 对应着原来的2,3,4,5,6,7,8.
0x1b3: 50a70000000000000000 | mrmovq (%rdi), %r10
0x1bd: 40a60000000000000000 | rmmovq %r10, (%rsi)
0x1c7: 62aa | andq %r10, %r10
0x1c9: 71a103000000000000 | jle Done
0x1d2: c0f00100000000000000 | iaddq $1, %rax
0x1dc: 90 | ret
|
0x1dd: | Judge_remain:
0x1dd: c0f2feffffffffffffff | iaddq $-2, %rdx
0x1e7: 720303000000000000 | jl Loop_twice
0x1f0: 73e302000000000000 | je Loop_three
0x1f9: 760202000000000000 | jg Judge_four_to_eight
|
0x202: | Judge_four_to_eight:
0x202: c0f2feffffffffffffff | iaddq $-2, %rdx
0x20c: 72c302000000000000 | jl Loop_four
0x215: 73a302000000000000 | je Loop_five
0x21e: 762702000000000000 | jg Judge_six_eight
|
0x227: | Judge_six_eight:
0x227: c0f2feffffffffffffff | iaddq $-2, %rdx
0x231: 728302000000000000 | jl Loop_six
0x23a: 736302000000000000 | je Loop_seven
|
0x243: | Loop_eight:
0x243: 50a73800000000000000 | mrmovq 56(%rdi), %r10
0x24d: 40a63800000000000000 | rmmovq %r10, 56(%rsi)
0x257: 62aa | andq %r10, %r10
| # jle Loop_seven
0x259: c0f00100000000000000 | iaddq $1, %rax
|
0x263: | Loop_seven:
0x263: 50a73000000000000000 | mrmovq 48(%rdi), %r10
0x26d: 40a63000000000000000 | rmmovq %r10, 48(%rsi)
0x277: 62aa | andq %r10, %r10
| # jle Loop_six
0x279: c0f00100000000000000 | iaddq $1, %rax
|
0x283: | Loop_six:
0x283: 50a72800000000000000 | mrmovq 40(%rdi), %r10
0x28d: 40a62800000000000000 | rmmovq %r10, 40(%rsi)
0x297: 62aa | andq %r10, %r10
| # jle Loop_five
0x299: c0f00100000000000000 | iaddq $1, %rax
|
0x2a3: | Loop_five:
0x2a3: 50a72000000000000000 | mrmovq 32(%rdi), %r10
0x2ad: 40a62000000000000000 | rmmovq %r10, 32(%rsi)
0x2b7: 62aa | andq %r10, %r10
| # jle Loop_four
0x2b9: c0f00100000000000000 | iaddq $1, %rax
|
0x2c3: | Loop_four:
0x2c3: 50a71800000000000000 | mrmovq 24(%rdi), %r10
0x2cd: 40a61800000000000000 | rmmovq %r10, 24(%rsi)
0x2d7: 62aa | andq %r10, %r10
| # jle Loop_three
0x2d9: c0f00100000000000000 | iaddq $1, %rax
|
0x2e3: | Loop_three:
0x2e3: 50a71000000000000000 | mrmovq 16(%rdi), %r10
0x2ed: 40a61000000000000000 | rmmovq %r10, 16(%rsi)
0x2f7: 62aa | andq %r10, %r10
| # jle Loop_twice
0x2f9: c0f00100000000000000 | iaddq $1, %rax
|
|
0x303: | Loop_twice:
0x303: 50a70000000000000000 | mrmovq (%rdi), %r10
0x30d: 50b70800000000000000 | mrmovq 8(%rdi), %r11 # Destory the bubble.
0x317: 40a60000000000000000 | rmmovq %r10, (%rsi)
0x321: 62aa | andq %r10, %r10
| # jle Second
0x323: c0f00100000000000000 | iaddq $1, %rax
0x32d: | Second:
0x32d: 40b60800000000000000 | rmmovq %r11, 8(%rsi)
0x337: 62bb | andq %r11, %r11
| # jle Done
0x339: c0f00100000000000000 | iaddq $1, %rax
0x343: 90 | ret
|
0x344: | naive_ncopy:
0x344: 6222 | andq %rdx,%rdx # len <= 0?
0x346: 71a103000000000000 | jle Done # if so, goto Done:
|
0x34f: | Loop:
0x34f: 50a70000000000000000 | mrmovq (%rdi), %r10 # read val from src...
0x359: c0f70800000000000000 | iaddq $8, %rdi # src++
0x363: 40a60000000000000000 | rmmovq %r10, (%rsi) # ...and store it to dst
0x36d: 62aa | andq %r10, %r10 # val <= 0?
0x36f: 718203000000000000 | jle Npos # if so, goto Npos:
0x378: c0f00100000000000000 | iaddq $1, %rax # irmovq $1, %r10
| # addq %r10, %rax # count++
0x382: | Npos:
0x382: c0f2ffffffffffffffff | iaddq $-1, %rdx
0x38c: c0f60800000000000000 | iaddq $8, %rsi # dst++
0x396: 6222 | andq %rdx,%rdx # len > 0?
0x398: 764f03000000000000 | jg Loop # if so, goto Loop:
|
|
| ##################################################################
| # Do not modify the following section of code
| # Function epilogue.
0x3a1: | Done:
0x3a1: 90 | ret
| ##################################################################
| # Keep the following label at the end of your function (AND MAKE SURE THERE IS A BLANKE LINE IN THE END!!!)
0x3a2: | End:
| #/* $end ncopy-ys */
0x3a2: | EndFun:
|
| ###############################
| # Source and destination blocks
| ###############################
0x3a8: | .align 8
0x3a8: | src:
0x3a8: ffffffffffffffff | .quad -1
0x3b0: 0200000000000000 | .quad 2
0x3b8: 0300000000000000 | .quad 3
0x3c0: fcffffffffffffff | .quad -4
0x3c8: fadebc0000000000 | .quad 0xbcdefa # This shouldn't get moved
|
0x3d0: | .align 16
0x3d0: | Predest:
0x3d0: fadebc0000000000 | .quad 0xbcdefa
0x3d8: | dest:
0x3d8: abefcd0000000000 | .quad 0xcdefab
0x3e0: abefcd0000000000 | .quad 0xcdefab
0x3e8: abefcd0000000000 | .quad 0xcdefab
0x3f0: abefcd0000000000 | .quad 0xcdefab
0x3f8: | Postdest:
0x3f8: bcfade0000000000 | .quad 0xdefabc
|
0x400: | .align 8
| # Run time stack
0x400: 0000000000000000 | .quad 0
0x408: 0000000000000000 | .quad 0
0x410: 0000000000000000 | .quad 0
0x418: 0000000000000000 | .quad 0
0x420: 0000000000000000 | .quad 0
0x428: 0000000000000000 | .quad 0
0x430: 0000000000000000 | .quad 0
0x438: 0000000000000000 | .quad 0
0x440: 0000000000000000 | .quad 0
0x448: 0000000000000000 | .quad 0
0x450: 0000000000000000 | .quad 0
0x458: 0000000000000000 | .quad 0
0x460: 0000000000000000 | .quad 0
0x468: 0000000000000000 | .quad 0
0x470: 0000000000000000 | .quad 0
0x478: 0000000000000000 | .quad 0
|
0x480: | Stack: