CSAPP 之 Arch Lab

《深入理解计算机系统》之Architecture Lab。


前言

本次实验主要是对第四章处理器体系结构的测验,还有一部分第五章的内容。

Music

CSAPP 实验记录

本系列文章主要记录 CSAPP 3.0 的实验过程,所有实验记录文章请查看这儿

快速开始请访问 CSAPP Lab 官网,本次实验记录是基于 CSAPP 3.0,实验日期始于:2019-4-1

实验目标

实验分为三部分,第一部分很简单,就是简单地考察一下汇编;第二部分是在hcl文件中添加iaddq指令的逻辑;第三部分是修改hcl和ncopy汇编文件使内存元素复制的速度尽可能达到最快。

Part A

目标:使用Y86-64汇编程序实现以下三个函数:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
/* 
* Architecture Lab: Part A
*
* High level specs for the functions that the students will rewrite
* in Y86-64 assembly language
*/

/* $begin examples */
/* linked list element */
typedef struct ELE {
long val;
struct ELE *next;
} *list_ptr;

/* sum_list - Sum the elements of a linked list */
long sum_list(list_ptr ls)
{
long val = 0;
while (ls) {
val += ls->val;
ls = ls->next;
}
return val;
}

/* rsum_list - Recursive version of sum_list */
long rsum_list(list_ptr ls)
{
if (!ls)
return 0;
else {
long val = ls->val;
long rest = rsum_list(ls->next);
return val + rest;
}
}

/* copy_block - Copy src to dest and return xor checksum of src */
long copy_block(long *src, long *dest, long len)
{
long result = 0;
while (len > 0) {
long val = *src++;
*dest++ = val;
result ^= val;
len--;
}
return result;
}
/* $end examples */

Part B

在hcl文件中添加iaddq指令的逻辑。

Part C

修改hcl和ncopy汇编文件使内存元素复制的速度尽可能达到最快,即CPE(cycles per element)越来越小。

  • 修改hcl内容首先要添加iaddq的实现
  • 降低CPI,即处理ret、jmp预测、加载使用冒险。ret没必要处理。
  • 程序性能优化(3e第五章)

实验前的归纳

概括一下这次实验用到的知识点。

首先是汇编相关的内容,通过第三章的学习,part A直接上手完成是没有问题的。
其次是hcl逻辑块的实现,这就需要对SEQ以及PIPE的实现及ISA有一定的了解。

Arch Lab

正式开始记录实验,Part C由于第五章内容还没有了解完全,所以准备后期二刷Part C。

Part A

这一部分凭借汇编基础即可完成。不过还需要注意一下汇编的一些伪指令和格式问题。

  • sum_list
文件名:archlab/archlab-handout/sim/misc/sum.ys
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
# sum_list function coded by scarborough_coral

.pos 0
irmovq stack,%rsp
call main
halt

# Sample linked list
.align 8
ele1:
.quad 0x00a
.quad ele2
ele2:
.quad 0x0b0
.quad ele3
ele3:
.quad 0xc00
.quad 0

sum_list:
xorq %rax,%rax

loop_start:
andq %rdi,%rdi
je loop_end

mrmovq 0(%rdi),%rsi
addq %rsi,%rax
mrmovq 8(%rdi),%rsi
rrmovq %rsi,%rdi

jmp loop_start

loop_end:
ret

main:
irmovq ele1,%rdi
call sum_list
ret


.pos 1024
stack:

  • rsum_list
文件名:archlab/archlab-handout/sim/misc/rsum.ys
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
.pos 0
irmovq stack,%rsp
call main
halt

.align 8
ele1:
.quad 0x00a
.quad ele2
ele2:
.quad 0x0b0
.quad ele3
ele3:
.quad 0xc00
.quad 0

rsum_list:
xorq %rax,%rax
andq %rdi,%rdi
jne not_null
ret
not_null:
mrmovq 0(%rdi),%rsi
pushq %rsi
mrmovq 8(%rdi),%rdi
call rsum_list
popq %rsi
addq %rsi,%rax
ret



main:
irmovq ele1,%rdi
call rsum_list
ret


.pos 1024
stack:

  • copy_block
文件名:archlab/archlab-handout/sim/misc/copy.ys
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
.pos 0
irmovq stack,%rsp
call main
halt



.align 8
src:
.quad 0x00a
.quad 0x0b0
.quad 0xc00

dest:
.quad 0x111
.quad 0x222
.quad 0x333

copy_block:
xorq %rax,%rax
pushq %r12
pushq %r13
irmovq $1,%r13
irmovq $8,%r12
loop_start:
andq %rdx,%rdx
je loop_end
mrmovq (%rdi),%rcx
rmmovq %rcx,(%rsi)
addq %r12,%rdi
addq %r12,%rsi
xorq %rcx,%rax
subq %r13,%rdx
jmp loop_start
loop_end:
popq %r12
popq %r13
ret

main:
irmovq src,%rdi
irmovq dest,%rsi
irmovq $3,%rdx
call copy_block
ret


.pos 1024
stack:

Part B

Part B 只需要按照writeup查看3e课本iaddq指令的阶段实现即可实现。

  1. 可能会遇到找不到依赖库的问题,安装就行了
  2. 中途遇见了一个链接问题,是因为glibc版本太新,实验依赖的版本太过老旧导致一个matherr找不到问题,观察代码其他地方并没有用到,直接omit注释掉了。

修改内容如下图:

hcl代码如下:

文件名:archlab/archlab-handout/sim/seq/seq-full.hcl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
#/* $begin seq-all-hcl */
####################################################################
# HCL Description of Control for Single Cycle Y86-64 Processor SEQ #
# Copyright (C) Randal E. Bryant, David R. O'Hallaron, 2010 #
####################################################################

## Your task is to implement the iaddq instruction
## The file contains a declaration of the icodes
## for iaddq (IIADDQ)
## Your job is to add the rest of the logic to make it work

####################################################################
# C Include's. Don't alter these #
####################################################################

quote '#include <stdio.h>'
quote '#include "isa.h"'
quote '#include "sim.h"'
quote 'int sim_main(int argc, char *argv[]);'
quote 'word_t gen_pc(){return 0;}'
quote 'int main(int argc, char *argv[])'
quote ' {plusmode=0;return sim_main(argc,argv);}'

####################################################################
# Declarations. Do not change/remove/delete any of these #
####################################################################

##### Symbolic representation of Y86-64 Instruction Codes #############
wordsig INOP 'I_NOP'
wordsig IHALT 'I_HALT'
wordsig IRRMOVQ 'I_RRMOVQ'
wordsig IIRMOVQ 'I_IRMOVQ'
wordsig IRMMOVQ 'I_RMMOVQ'
wordsig IMRMOVQ 'I_MRMOVQ'
wordsig IOPQ 'I_ALU'
wordsig IJXX 'I_JMP'
wordsig ICALL 'I_CALL'
wordsig IRET 'I_RET'
wordsig IPUSHQ 'I_PUSHQ'
wordsig IPOPQ 'I_POPQ'
# Instruction code for iaddq instruction
wordsig IIADDQ 'I_IADDQ'

##### Symbolic represenations of Y86-64 function codes #####
wordsig FNONE 'F_NONE' # Default function code

##### Symbolic representation of Y86-64 Registers referenced explicitly #####
wordsig RRSP 'REG_RSP' # Stack Pointer
wordsig RNONE 'REG_NONE' # Special value indicating "no register"

##### ALU Functions referenced explicitly #####
wordsig ALUADD 'A_ADD' # ALU should add its arguments

##### Possible instruction status values #####
wordsig SAOK 'STAT_AOK' # Normal execution
wordsig SADR 'STAT_ADR' # Invalid memory address
wordsig SINS 'STAT_INS' # Invalid instruction
wordsig SHLT 'STAT_HLT' # Halt instruction encountered

##### Signals that can be referenced by control logic ####################

##### Fetch stage inputs #####
wordsig pc 'pc' # Program counter
##### Fetch stage computations #####
wordsig imem_icode 'imem_icode' # icode field from instruction memory
wordsig imem_ifun 'imem_ifun' # ifun field from instruction memory
wordsig icode 'icode' # Instruction control code
wordsig ifun 'ifun' # Instruction function
wordsig rA 'ra' # rA field from instruction
wordsig rB 'rb' # rB field from instruction
wordsig valC 'valc' # Constant from instruction
wordsig valP 'valp' # Address of following instruction
boolsig imem_error 'imem_error' # Error signal from instruction memory
boolsig instr_valid 'instr_valid' # Is fetched instruction valid?

##### Decode stage computations #####
wordsig valA 'vala' # Value from register A port
wordsig valB 'valb' # Value from register B port

##### Execute stage computations #####
wordsig valE 'vale' # Value computed by ALU
boolsig Cnd 'cond' # Branch test

##### Memory stage computations #####
wordsig valM 'valm' # Value read from memory
boolsig dmem_error 'dmem_error' # Error signal from data memory


####################################################################
# Control Signal Definitions. #
####################################################################

################ Fetch Stage ###################################

# Determine instruction code
word icode = [
imem_error: INOP;
1: imem_icode; # Default: get from instruction memory
];

# Determine instruction function
word ifun = [
imem_error: FNONE;
1: imem_ifun; # Default: get from instruction memory
];

bool instr_valid = icode in
{ INOP, IHALT, IRRMOVQ, IIRMOVQ, IRMMOVQ, IMRMOVQ,
IOPQ, IIADDQ, IJXX, ICALL, IRET, IPUSHQ, IPOPQ };

# Does fetched instruction require a regid byte?
bool need_regids =
icode in { IRRMOVQ, IOPQ, IIADDQ, IPUSHQ, IPOPQ,
IIRMOVQ, IRMMOVQ, IMRMOVQ };

# Does fetched instruction require a constant word?
bool need_valC =
icode in { IIRMOVQ, IRMMOVQ, IMRMOVQ, IJXX, ICALL, IIADDQ };

################ Decode Stage ###################################

## What register should be used as the A source?
word srcA = [
icode in { IRRMOVQ, IRMMOVQ, IOPQ, IPUSHQ } : rA;
icode in { IPOPQ, IRET } : RRSP;
1 : RNONE; # Don't need register
];

## What register should be used as the B source?
word srcB = [
icode in { IOPQ, IRMMOVQ, IMRMOVQ, IIADDQ } : rB;
icode in { IPUSHQ, IPOPQ, ICALL, IRET } : RRSP;
1 : RNONE; # Don't need register
];

## What register should be used as the E destination?
word dstE = [
icode in { IRRMOVQ } && Cnd : rB;
icode in { IIRMOVQ, IOPQ, IIADDQ } : rB;
icode in { IPUSHQ, IPOPQ, ICALL, IRET } : RRSP;
1 : RNONE; # Don't write any register
];

## What register should be used as the M destination?
word dstM = [
icode in { IMRMOVQ, IPOPQ } : rA;
1 : RNONE; # Don't write any register
];

################ Execute Stage ###################################

## Select input A to ALU
word aluA = [
icode in { IRRMOVQ, IOPQ } : valA;
icode in { IIRMOVQ, IRMMOVQ, IMRMOVQ, IIADDQ } : valC;
icode in { ICALL, IPUSHQ } : -8;
icode in { IRET, IPOPQ } : 8;
# Other instructions don't need ALU
];

## Select input B to ALU
word aluB = [
icode in { IRMMOVQ, IMRMOVQ, IOPQ, IIADDQ, ICALL,
IPUSHQ, IRET, IPOPQ } : valB;
icode in { IRRMOVQ, IIRMOVQ } : 0;
# Other instructions don't need ALU
];

## Set the ALU function
word alufun = [
icode == IOPQ : ifun;
1 : ALUADD;
];

## Should the condition codes be updated?
bool set_cc = icode in { IOPQ, IIADDQ };

################ Memory Stage ###################################

## Set read control signal
bool mem_read = icode in { IMRMOVQ, IPOPQ, IRET };

## Set write control signal
bool mem_write = icode in { IRMMOVQ, IPUSHQ, ICALL };

## Select memory address
word mem_addr = [
icode in { IRMMOVQ, IPUSHQ, ICALL, IMRMOVQ } : valE;
icode in { IPOPQ, IRET } : valA;
# Other instructions don't need address
];

## Select memory input data
word mem_data = [
# Value from register
icode in { IRMMOVQ, IPUSHQ } : valA;
# Return PC
icode == ICALL : valP;
# Default: Don't write anything
];

## Determine instruction status
word Stat = [
imem_error || dmem_error : SADR;
!instr_valid: SINS;
icode == IHALT : SHLT;
1 : SAOK;
];

################ Program Counter Update ############################

## What address should instruction be fetched at

word new_pc = [
# Call. Use instruction constant
icode == ICALL : valC;
# Taken branch. Use instruction constant
icode == IJXX && Cnd : valC;
# Completion of RET instruction. Use value from stack
icode == IRET : valM;
# Default: Use incremented PC
1 : valP;
];
#/* $end seq-all-hcl */

然后按照writeup跑测试,通过。

Part C

Part C 的具体内容就是加速内存元素的拷贝问题。修改ncopy.yspipe-full.hcl代码使得CPE(cycles per element)尽可能小,即单个元素拷贝时间越少越好。

首先添加iaddq指令,原来是通过irmovqaddq来实现的,改为iaddq指令后起到很微弱的效果,并不能明显的减少CPE,如果你的irmovq指令在循环内部那就另说了,当然也不是最快。

具体代码如下:

文件名:archlab/archlab-handout/sim/pipe/pipe-full.hcl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
#/* $begin pipe-all-hcl */
####################################################################
# HCL Description of Control for Pipelined Y86-64 Processor #
# Copyright (C) Randal E. Bryant, David R. O'Hallaron, 2014 #
####################################################################

## Your task is to implement the iaddq instruction
## The file contains a declaration of the icodes
## for iaddq (IIADDQ)
## Your job is to add the rest of the logic to make it work

####################################################################
# C Include's. Don't alter these #
####################################################################

quote '#include <stdio.h>'
quote '#include "isa.h"'
quote '#include "pipeline.h"'
quote '#include "stages.h"'
quote '#include "sim.h"'
quote 'int sim_main(int argc, char *argv[]);'
quote 'int main(int argc, char *argv[]){return sim_main(argc,argv);}'

####################################################################
# Declarations. Do not change/remove/delete any of these #
####################################################################

##### Symbolic representation of Y86-64 Instruction Codes #############
wordsig INOP 'I_NOP'
wordsig IHALT 'I_HALT'
wordsig IRRMOVQ 'I_RRMOVQ'
wordsig IIRMOVQ 'I_IRMOVQ'
wordsig IRMMOVQ 'I_RMMOVQ'
wordsig IMRMOVQ 'I_MRMOVQ'
wordsig IOPQ 'I_ALU'
wordsig IJXX 'I_JMP'
wordsig ICALL 'I_CALL'
wordsig IRET 'I_RET'
wordsig IPUSHQ 'I_PUSHQ'
wordsig IPOPQ 'I_POPQ'
# Instruction code for iaddq instruction
wordsig IIADDQ 'I_IADDQ'

##### Symbolic represenations of Y86-64 function codes #####
wordsig FNONE 'F_NONE' # Default function code

##### Symbolic representation of Y86-64 Registers referenced #####
wordsig RRSP 'REG_RSP' # Stack Pointer
wordsig RNONE 'REG_NONE' # Special value indicating "no register"

##### ALU Functions referenced explicitly ##########################
wordsig ALUADD 'A_ADD' # ALU should add its arguments

##### Possible instruction status values #####
wordsig SBUB 'STAT_BUB' # Bubble in stage
wordsig SAOK 'STAT_AOK' # Normal execution
wordsig SADR 'STAT_ADR' # Invalid memory address
wordsig SINS 'STAT_INS' # Invalid instruction
wordsig SHLT 'STAT_HLT' # Halt instruction encountered

##### Signals that can be referenced by control logic ##############

##### Pipeline Register F ##########################################

wordsig F_predPC 'pc_curr->pc' # Predicted value of PC

##### Intermediate Values in Fetch Stage ###########################

wordsig imem_icode 'imem_icode' # icode field from instruction memory
wordsig imem_ifun 'imem_ifun' # ifun field from instruction memory
wordsig f_icode 'if_id_next->icode' # (Possibly modified) instruction code
wordsig f_ifun 'if_id_next->ifun' # Fetched instruction function
wordsig f_valC 'if_id_next->valc' # Constant data of fetched instruction
wordsig f_valP 'if_id_next->valp' # Address of following instruction
boolsig imem_error 'imem_error' # Error signal from instruction memory
boolsig instr_valid 'instr_valid' # Is fetched instruction valid?

##### Pipeline Register D ##########################################
wordsig D_icode 'if_id_curr->icode' # Instruction code
wordsig D_rA 'if_id_curr->ra' # rA field from instruction
wordsig D_rB 'if_id_curr->rb' # rB field from instruction
wordsig D_valP 'if_id_curr->valp' # Incremented PC

##### Intermediate Values in Decode Stage #########################

wordsig d_srcA 'id_ex_next->srca' # srcA from decoded instruction
wordsig d_srcB 'id_ex_next->srcb' # srcB from decoded instruction
wordsig d_rvalA 'd_regvala' # valA read from register file
wordsig d_rvalB 'd_regvalb' # valB read from register file

##### Pipeline Register E ##########################################
wordsig E_icode 'id_ex_curr->icode' # Instruction code
wordsig E_ifun 'id_ex_curr->ifun' # Instruction function
wordsig E_valC 'id_ex_curr->valc' # Constant data
wordsig E_srcA 'id_ex_curr->srca' # Source A register ID
wordsig E_valA 'id_ex_curr->vala' # Source A value
wordsig E_srcB 'id_ex_curr->srcb' # Source B register ID
wordsig E_valB 'id_ex_curr->valb' # Source B value
wordsig E_dstE 'id_ex_curr->deste' # Destination E register ID
wordsig E_dstM 'id_ex_curr->destm' # Destination M register ID

##### Intermediate Values in Execute Stage #########################
wordsig e_valE 'ex_mem_next->vale' # valE generated by ALU
boolsig e_Cnd 'ex_mem_next->takebranch' # Does condition hold?
wordsig e_dstE 'ex_mem_next->deste' # dstE (possibly modified to be RNONE)

##### Pipeline Register M #########################
wordsig M_stat 'ex_mem_curr->status' # Instruction status
wordsig M_icode 'ex_mem_curr->icode' # Instruction code
wordsig M_ifun 'ex_mem_curr->ifun' # Instruction function
wordsig M_valA 'ex_mem_curr->vala' # Source A value
wordsig M_dstE 'ex_mem_curr->deste' # Destination E register ID
wordsig M_valE 'ex_mem_curr->vale' # ALU E value
wordsig M_dstM 'ex_mem_curr->destm' # Destination M register ID
boolsig M_Cnd 'ex_mem_curr->takebranch' # Condition flag
boolsig dmem_error 'dmem_error' # Error signal from instruction memory

##### Intermediate Values in Memory Stage ##########################
wordsig m_valM 'mem_wb_next->valm' # valM generated by memory
wordsig m_stat 'mem_wb_next->status' # stat (possibly modified to be SADR)

##### Pipeline Register W ##########################################
wordsig W_stat 'mem_wb_curr->status' # Instruction status
wordsig W_icode 'mem_wb_curr->icode' # Instruction code
wordsig W_dstE 'mem_wb_curr->deste' # Destination E register ID
wordsig W_valE 'mem_wb_curr->vale' # ALU E value
wordsig W_dstM 'mem_wb_curr->destm' # Destination M register ID
wordsig W_valM 'mem_wb_curr->valm' # Memory M value

####################################################################
# Control Signal Definitions. #
####################################################################

################ Fetch Stage ###################################

## What address should instruction be fetched at
word f_pc = [
# Mispredicted branch. Fetch at incremented PC
M_icode == IJXX && !M_Cnd : M_valA;
# Completion of RET instruction
W_icode == IRET : W_valM;
# Default: Use predicted value of PC
1 : F_predPC;
];

## Determine icode of fetched instruction
word f_icode = [
imem_error : INOP;
1: imem_icode;
];

# Determine ifun
word f_ifun = [
imem_error : FNONE;
1: imem_ifun;
];

# Is instruction valid?
bool instr_valid = f_icode in
{ INOP, IHALT, IRRMOVQ, IIRMOVQ, IRMMOVQ, IMRMOVQ,
IOPQ, IIADDQ, IJXX, ICALL, IRET, IPUSHQ, IPOPQ };

# Determine status code for fetched instruction
word f_stat = [
imem_error: SADR;
!instr_valid : SINS;
f_icode == IHALT : SHLT;
1 : SAOK;
];

# Does fetched instruction require a regid byte?
bool need_regids =
f_icode in { IRRMOVQ, IOPQ, IIADDQ, IPUSHQ, IPOPQ,
IIRMOVQ, IRMMOVQ, IMRMOVQ };

# Does fetched instruction require a constant word?
bool need_valC =
f_icode in { IIRMOVQ, IRMMOVQ, IMRMOVQ, IJXX, ICALL, IIADDQ };

# Predict next value of PC
word f_predPC = [
f_icode in { IJXX, ICALL } : f_valC;
1 : f_valP;
];

################ Decode Stage ######################################


## What register should be used as the A source?
word d_srcA = [
D_icode in { IRRMOVQ, IRMMOVQ, IOPQ, IPUSHQ } : D_rA;
D_icode in { IPOPQ, IRET } : RRSP;
1 : RNONE; # Don't need register
];

## What register should be used as the B source?
word d_srcB = [
D_icode in { IOPQ, IIADDQ, IRMMOVQ, IMRMOVQ } : D_rB;
D_icode in { IPUSHQ, IPOPQ, ICALL, IRET } : RRSP;
1 : RNONE; # Don't need register
];

## What register should be used as the E destination?
word d_dstE = [
D_icode in { IRRMOVQ, IIRMOVQ, IOPQ, IIADDQ } : D_rB;
D_icode in { IPUSHQ, IPOPQ, ICALL, IRET } : RRSP;
1 : RNONE; # Don't write any register
];

## What register should be used as the M destination?
word d_dstM = [
D_icode in { IMRMOVQ, IPOPQ } : D_rA;
1 : RNONE; # Don't write any register
];

## What should be the A value?
## Forward into decode stage for valA
word d_valA = [
D_icode in { ICALL, IJXX } : D_valP; # Use incremented PC
d_srcA == e_dstE : e_valE; # Forward valE from execute
d_srcA == M_dstM : m_valM; # Forward valM from memory
d_srcA == M_dstE : M_valE; # Forward valE from memory
d_srcA == W_dstM : W_valM; # Forward valM from write back
d_srcA == W_dstE : W_valE; # Forward valE from write back
1 : d_rvalA; # Use value read from register file
];

word d_valB = [
d_srcB == e_dstE : e_valE; # Forward valE from execute
d_srcB == M_dstM : m_valM; # Forward valM from memory
d_srcB == M_dstE : M_valE; # Forward valE from memory
d_srcB == W_dstM : W_valM; # Forward valM from write back
d_srcB == W_dstE : W_valE; # Forward valE from write back
1 : d_rvalB; # Use value read from register file
];

################ Execute Stage #####################################

## Select input A to ALU
word aluA = [
E_icode in { IRRMOVQ, IOPQ } : E_valA;
E_icode in { IIRMOVQ, IRMMOVQ, IMRMOVQ, IIADDQ } : E_valC;
E_icode in { ICALL, IPUSHQ } : -8;
E_icode in { IRET, IPOPQ } : 8;
# Other instructions don't need ALU
];

## Select input B to ALU
word aluB = [
E_icode in { IRMMOVQ, IMRMOVQ, IOPQ, IIADDQ, ICALL,
IPUSHQ, IRET, IPOPQ } : E_valB;
E_icode in { IRRMOVQ, IIRMOVQ } : 0;
# Other instructions don't need ALU
];

## Set the ALU function
word alufun = [
E_icode == IOPQ : E_ifun;
1 : ALUADD;
];

## Should the condition codes be updated?
bool set_cc = (E_icode == IOPQ||E_icode == IIADDQ) &&
# State changes only during normal operation
!m_stat in { SADR, SINS, SHLT } && !W_stat in { SADR, SINS, SHLT };

## Generate valA in execute stage
word e_valA = E_valA; # Pass valA through stage

## Set dstE to RNONE in event of not-taken conditional move
word e_dstE = [
E_icode == IRRMOVQ && !e_Cnd : RNONE;
1 : E_dstE;
];

################ Memory Stage ######################################

## Select memory address
word mem_addr = [
M_icode in { IRMMOVQ, IPUSHQ, ICALL, IMRMOVQ } : M_valE;
M_icode in { IPOPQ, IRET } : M_valA;
# Other instructions don't need address
];

## Set read control signal
bool mem_read = M_icode in { IMRMOVQ, IPOPQ, IRET };

## Set write control signal
bool mem_write = M_icode in { IRMMOVQ, IPUSHQ, ICALL };

#/* $begin pipe-m_stat-hcl */
## Update the status
word m_stat = [
dmem_error : SADR;
1 : M_stat;
];
#/* $end pipe-m_stat-hcl */

## Set E port register ID
word w_dstE = W_dstE;

## Set E port value
word w_valE = W_valE;

## Set M port register ID
word w_dstM = W_dstM;

## Set M port value
word w_valM = W_valM;

## Update processor status
word Stat = [
W_stat == SBUB : SAOK;
1 : W_stat;
];

################ Pipeline Register Control #########################

# Should I stall or inject a bubble into Pipeline Register F?
# At most one of these can be true.
bool F_bubble = 0;
bool F_stall =
# Conditions for a load/use hazard
E_icode in { IMRMOVQ, IPOPQ } &&
E_dstM in { d_srcA, d_srcB } ||
# Stalling at fetch while ret passes through pipeline
IRET in { D_icode, E_icode, M_icode };

# Should I stall or inject a bubble into Pipeline Register D?
# At most one of these can be true.
bool D_stall =
# Conditions for a load/use hazard
E_icode in { IMRMOVQ, IPOPQ } &&
E_dstM in { d_srcA, d_srcB };

bool D_bubble =
# Mispredicted branch
(E_icode == IJXX && !e_Cnd) ||
# Stalling at fetch while ret passes through pipeline
# but not condition for a load/use hazard
!(E_icode in { IMRMOVQ, IPOPQ } && E_dstM in { d_srcA, d_srcB }) &&
IRET in { D_icode, E_icode, M_icode };

# Should I stall or inject a bubble into Pipeline Register E?
# At most one of these can be true.
bool E_stall = 0;
bool E_bubble =
# Mispredicted branch
(E_icode == IJXX && !e_Cnd) ||
# Conditions for a load/use hazard
E_icode in { IMRMOVQ, IPOPQ } &&
E_dstM in { d_srcA, d_srcB};

# Should I stall or inject a bubble into Pipeline Register M?
# At most one of these can be true.
bool M_stall = 0;
# Start injecting bubbles as soon as exception passes through memory stage
bool M_bubble = m_stat in { SADR, SINS, SHLT } || W_stat in { SADR, SINS, SHLT };

# Should I stall or inject a bubble into Pipeline Register W?
bool W_stall = W_stat in { SADR, SINS, SHLT };
bool W_bubble = 0;
#/* $end pipe-all-hcl */

然后修改ncopy.ys文件

Version 1.0

文件名:archlab/archlab-handout/sim/pipe/ncopy.ys
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
#/* $begin ncopy-ys */
##################################################################
# ncopy.ys - Copy a src block of len words to dst.
# Return the number of positive words (>0) contained in src.
#
# Include your name and ID here.
#
# Describe how and why you modified the baseline code.
#
##################################################################
# Do not modify this portion
# Function prologue.
# %rdi = src, %rsi = dst, %rdx = len
ncopy:

##################################################################
# You can modify this portion
# Loop header
xorq %rax,%rax # count = 0;
andq %rdx,%rdx # len <= 0?
jle Done # if so, goto Done:

Loop: mrmovq (%rdi), %r10 # read val from src...
rmmovq %r10, (%rsi) # ...and store it to dst
andq %r10, %r10 # val <= 0?
jle Npos # if so, goto Npos:
iaddq $1, %rax # count++
Npos:
iaddq $-1, %rdx # len--
iaddq $8, %rdi # src++
iaddq $8, %rsi # dst++
andq %rdx,%rdx # len > 0?
jg Loop # if so, goto Loop:
##################################################################
# Do not modify the following section of code
# Function epilogue.
Done:
ret
##################################################################
# Keep the following label at the end of your function
End:
#/* $end ncopy-ys */

然而还是0分。

主要考虑还是要减少bubble,还有程序优化问题。

看完第五章再来二刷吧。

Version 2.0

结果:

所作修改:

  • 4x4循环展开,因为C=4,L=1
  • 消除了一些load/use指令组合
文件名:archlab/archlab-handout/sim/pipe/ncopy.ys
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
#/* $begin ncopy-ys */
##################################################################
# ncopy.ys - Copy a src block of len words to dst.
# Return the number of positive words (>0) contained in src.
#
# Include your name and ID here.
#
# Describe how and why you modified the baseline code.
#
##################################################################
# Do not modify this portion
# Function prologue.
# %rdi = src, %rsi = dst, %rdx = len
ncopy:

##################################################################
# You can modify this portion
# Loop header
xorq %rax,%rax # count = 0;
andq %rdx,%rdx # len <= 0?
jle Done # if so, goto Done:

testType:
irmovq $3,%r8
andq %rdx, %r8
je Loop
rrmovq %r8,%r11
irmovq $1, %rcx
originLoop:
mrmovq (%rdi), %r10
iaddq $8, %rdi # src++
andq %r10, %r10
rmmovq %r10, (%rsi)
jle neg
iaddq $1, %rax
neg:
iaddq $8, %rsi # dst++
subq %rcx, %r8
jg originLoop
subq %r11,%rdx # len > 0?
jle Done
Loop:
mrmovq (%rdi), %r10 # read val from src...
mrmovq 8(%rdi), %r11
mrmovq 16(%rdi), %r8
mrmovq 24(%rdi), %r9
rmmovq %r10, (%rsi) # ...and store it to dst
rmmovq %r11, 8(%rsi)
rmmovq %r8, 16(%rsi)
rmmovq %r9, 24(%rsi)
andq %r10, %r10 # val <= 0?
jle second # if so,go next item:
iaddq $1, %rax # count++
second:
andq %r11, %r11
jle third
iaddq $1, %rax
third:
andq %r8, %r8
jle forth
iaddq $1, %rax
forth:
andq %r9, %r9
jle Npos
iaddq $1, %rax
Npos:
iaddq $-4, %rdx # len--
iaddq $32, %rdi # src++
iaddq $32, %rsi # dst++
andq %rdx,%rdx # len > 0?
jg Loop # if so, goto Loop:
##################################################################
# Do not modify the following section of code
# Function epilogue.
Done:
ret
##################################################################
# Keep the following label at the end of your function
End:
#/* $end ncopy-ys */

总结

粗略的总结了一下。有问题请在评论区或者daovoice指正。

所感

  1. 最近做问题不够专注,时间观念不够强,可能是太过松散了
  2. 。。。

所得

  1. 从阅读《编码》这本书中浅浅的了解了处理器体系结构,到csapp3e进一步了解
  2. 熟悉了ISA设计的基本方法,指令分类、分阶段、流水线、冒险问题、异常处理、性能评价等

下一步

  1. 阅读第5章,二刷part C
  2. 使用番茄todo做好时间规划,并执行训练