Skip to content
GitLab
Menu
Projects
Groups
Snippets
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
argo
aml
Commits
b808beb2
Commit
b808beb2
authored
Aug 06, 2018
by
Kamil Iskra
Browse files
[feature/fix] Duplicate dgemm_nofetch.c changes
Duplicate changes to dgemm_nofetch.c from commits
6a0d1cbd
and
9326c388
.
parent
9326c388
Pipeline
#6955
passed with stage
in 4 minutes and 4 seconds
Changes
1
Pipelines
2
Hide whitespace changes
Inline
Side-by-side
benchmarks/dgemm_prefetch.c
View file @
b808beb2
...
@@ -27,12 +27,10 @@ void do_work()
...
@@ -27,12 +27,10 @@ void do_work()
ldb
=
lda
;
ldb
=
lda
;
ldc
=
lda
;
ldc
=
lda
;
size_t
ndims
[
2
];
size_t
ndims
[
2
];
double
*
ap
,
*
bp
,
*
cp
;
double
*
prea
,
*
preb
;
double
*
prea
,
*
preb
;
int
ai
,
bi
,
oldai
,
oldbi
;
int
ai
,
bi
,
oldai
,
oldbi
;
void
*
abaseptr
,
*
bbaseptr
;
void
*
abaseptr
,
*
bbaseptr
;
struct
aml_scratch_request
*
ar
,
*
br
;
struct
aml_scratch_request
*
ar
,
*
br
;
size_t
coff
;
aml_tiling_ndims
(
&
tiling_row
,
&
ndims
[
0
],
&
ndims
[
1
]);
aml_tiling_ndims
(
&
tiling_row
,
&
ndims
[
0
],
&
ndims
[
1
]);
abaseptr
=
aml_scratch_baseptr
(
&
sa
);
abaseptr
=
aml_scratch_baseptr
(
&
sa
);
bbaseptr
=
aml_scratch_baseptr
(
&
sb
);
bbaseptr
=
aml_scratch_baseptr
(
&
sb
);
...
@@ -51,6 +49,8 @@ void do_work()
...
@@ -51,6 +49,8 @@ void do_work()
{
{
for
(
int
j
=
0
;
j
<
ndims
[
1
];
j
++
)
for
(
int
j
=
0
;
j
<
ndims
[
1
];
j
++
)
{
{
size_t
coff
;
double
*
ap
,
*
bp
,
*
cp
;
ap
=
aml_tiling_tilestart
(
&
tiling_row
,
prea
,
i
);
ap
=
aml_tiling_tilestart
(
&
tiling_row
,
prea
,
i
);
bp
=
aml_tiling_tilestart
(
&
tiling_row
,
preb
,
j
);
bp
=
aml_tiling_tilestart
(
&
tiling_row
,
preb
,
j
);
coff
=
i
*
ndims
[
1
]
+
j
;
coff
=
i
*
ndims
[
1
]
+
j
;
...
@@ -110,11 +110,40 @@ int main(int argc, char* argv[])
...
@@ -110,11 +110,40 @@ int main(int argc, char* argv[])
b
=
aml_area_malloc
(
&
slow
,
memsize
);
b
=
aml_area_malloc
(
&
slow
,
memsize
);
c
=
aml_area_malloc
(
&
fast
,
memsize
);
c
=
aml_area_malloc
(
&
fast
,
memsize
);
assert
(
a
!=
NULL
&&
b
!=
NULL
&&
c
!=
NULL
);
assert
(
a
!=
NULL
&&
b
!=
NULL
&&
c
!=
NULL
);
for
(
unsigned
long
i
=
0
;
i
<
N
*
N
;
i
++
)
{
a
[
i
]
=
(
double
)
rand
();
size_t
ntilerows
,
ntilecols
,
tilerowsize
,
tilecolsize
,
rowsize
,
colsize
;
b
[
i
]
=
(
double
)
rand
();
rowsize
=
colsize
=
N
;
c
[
i
]
=
0
.
0
;
tilerowsize
=
tilecolsize
=
T
;
ntilerows
=
ntilecols
=
N
/
T
;
for
(
unsigned
long
i
=
0
;
i
<
N
*
N
;
i
+=
tilerowsize
)
{
size_t
tilerow
,
tilecol
,
row
,
column
;
/* Tile row index (row-major). */
tilerow
=
i
/
(
tilerowsize
*
tilecolsize
*
ntilerows
);
/* Tile column index (row-major). */
tilecol
=
(
i
/
tilerowsize
)
%
ntilerows
;
/* Row index within a tile (row-major). */
row
=
(
i
/
rowsize
)
%
tilecolsize
;
/* Column index within a tile (row-major). */
/* column = i % tilerowsize; */
size_t
a_offset
,
b_offset
;
/* Tiles in A need to be transposed (column-major). */
a_offset
=
(
tilecol
*
ntilecols
+
tilerow
)
*
tilerowsize
*
tilecolsize
+
row
*
tilerowsize
;
/* Tiles in B are in row-major order. */
b_offset
=
(
tilerow
*
ntilerows
+
tilecol
)
*
tilerowsize
*
tilecolsize
+
row
*
tilerowsize
;
for
(
column
=
0
;
column
<
tilerowsize
;
column
++
)
{
a
[
a_offset
+
column
]
=
(
double
)
rand
();
b
[
b_offset
+
column
]
=
(
double
)
rand
();
/* C is tiled as well (row-major) but since it's
all-zeros at this point, we don't bother. */
c
[
i
+
column
]
=
0
.
0
;
}
}
}
clock_gettime
(
CLOCK_REALTIME
,
&
start
);
clock_gettime
(
CLOCK_REALTIME
,
&
start
);
do_work
();
do_work
();
clock_gettime
(
CLOCK_REALTIME
,
&
stop
);
clock_gettime
(
CLOCK_REALTIME
,
&
stop
);
...
@@ -122,6 +151,24 @@ int main(int argc, char* argv[])
...
@@ -122,6 +151,24 @@ int main(int argc, char* argv[])
time
=
(
stop
.
tv_nsec
-
start
.
tv_nsec
)
+
time
=
(
stop
.
tv_nsec
-
start
.
tv_nsec
)
+
1e9
*
(
stop
.
tv_sec
-
start
.
tv_sec
);
1e9
*
(
stop
.
tv_sec
-
start
.
tv_sec
);
double
flops
=
(
2
.
0
*
N
*
N
*
N
)
/
(
time
/
1e9
);
double
flops
=
(
2
.
0
*
N
*
N
*
N
)
/
(
time
/
1e9
);
/* De-tile the result matrix (C). I couldn't figure out how to do
it in-place so we are de-tiling to the A matrix. */
for
(
unsigned
long
i
=
0
;
i
<
N
*
N
;
i
+=
tilerowsize
)
{
size_t
tilerow
,
tilecol
,
row
;
/* Tile row index (row-major). */
tilerow
=
i
/
(
tilerowsize
*
tilecolsize
*
ntilerows
);
/* Tile column index (row-major). */
tilecol
=
(
i
/
tilerowsize
)
%
ntilerows
;
/* Row index within a tile (row-major). */
row
=
(
i
/
rowsize
)
%
tilecolsize
;
/* i converted to tiled. */
unsigned
long
tiledi
=
(
tilerow
*
ntilerows
+
tilecol
)
*
tilerowsize
*
tilecolsize
+
row
*
tilerowsize
;
memcpy
(
&
a
[
i
],
&
c
[
tiledi
],
tilerowsize
*
sizeof
(
double
));
}
/* print the flops in GFLOPS */
/* print the flops in GFLOPS */
printf
(
"dgemm-prefetch: %llu %lld %lld %f
\n
"
,
N
,
memsize
,
time
,
printf
(
"dgemm-prefetch: %llu %lld %lld %f
\n
"
,
N
,
memsize
,
time
,
flops
/
1e9
);
flops
/
1e9
);
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment