1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
|
#include <assert.h>
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
/* Number of bytes handed to each comparison routine per call. */
#define NUM (1<<20)
/* Number of timed calls that are averaged for each reported result. */
#define ITERS 100
/* Read the x86 time-stamp counter (from wikipedia on RDTSC).
 *
 * The RDTSC instruction leaves the low 32 bits of the counter in EAX and
 * the high 32 bits in EDX; the two halves are merged into one 64-bit value.
 *
 * Declared static inline so this translation unit always owns a definition:
 * under C99/C11 rules a plain inline definition emits no external symbol,
 * and any call the compiler chooses not to inline would fail to link. */
static inline uint64_t rdtsc(void) {
  uint32_t lo, hi;
  __asm__ __volatile__ ("rdtsc" : "=a" (lo), "=d" (hi));
  return ((uint64_t)hi << 32) | lo;
}
/* Signature shared by every comparison routine under test: given two
 * buffers and a byte count, return an int that run_test() checks against
 * the expected mismatch position. */
typedef int (*test_func)(const char *s1, const char *s2, int n);

/* Time `func` on (buf1, buf2) and print the mean cycle count per call.
 *
 * buf1, buf2: buffers of at least NUM bytes passed straight to `func`.
 * name:       label printed in front of the result.
 * func:       routine under test; every call must return NUM - 1,
 *             which is enforced with assert().
 *
 * Each of the ITERS calls is bracketed by two rdtsc() reads and the
 * differences are averaged. */
void run_test(const char *buf1, const char *buf2,
              const char *name, test_func func) {
  uint64_t accum = 0;
  int i;
  for (i = 0; i < ITERS; i++) {
    uint64_t start = rdtsc();
    int x = func(buf1, buf2, NUM);
    uint64_t end = rdtsc();
    accum += end - start;
    assert(x == NUM - 1);
    (void)x; /* x is otherwise unused when assert() is compiled out */
  }
  accum /= ITERS;
  /* PRIu64 is the portable conversion for uint64_t. */
  printf("%s : %" PRIu64 " cycles\n", name, accum);
}
/* Adapter that gives memcmp() the same return convention as test1() and
 * test2() so that it can be driven through the same function pointer type.
 *
 * Build w/ -fno-builtin for this to be fast, this assumes that there
 * is a difference at s1[n-1]: memcmp() only reports whether the buffers
 * differ, not where, so the position is taken on trust.
 *
 * Returns n - 1 when the first n bytes differ, n when they are identical,
 * and 0 when n <= 0.  The sign of memcmp()'s result is deliberately
 * ignored: it tells which buffer holds the larger byte, not where the
 * mismatch is. */
int memcmp_fake(const char *s1, const char *s2, int n) {
  if (n <= 0) {
    /* A negative count must never reach memcmp() as a huge size_t. */
    return 0;
  }
  return memcmp(s1, s2, (size_t)n) != 0 ? n - 1 : n;
}
/* Set to 0 to disable the int-at-a-time fast path in test2(). */
#define UNALIGNED_OK 1

/* Return the index of the first byte at which s1c and s2c differ, or n if
 * the first n bytes are identical (0 when n <= 0).
 *
 * With UNALIGNED_OK enabled, and at least eight ints' worth of data, the
 * buffers are first compared one int at a time, eight ints per loop
 * iteration.  That path reads both buffers through `const int *` whatever
 * their alignment, hence the opt-in macro.  The plain byte loop then pins
 * down the exact mismatching byte and handles whatever tail is left. */
static inline int
test2(const char *s1c, const char *s2c, int n)
{
  int i = 0;
#if UNALIGNED_OK
  /* Keep the arithmetic signed: dividing by the unsigned sizeof would turn
   * a negative n into a huge word count and send the byte loop below to
   * negative indices. */
  const int word = (int)sizeof(int);
  int nint = n / word;
  if (nint >= 8)
    {
      /* j always equals i; a second counter lets each comparison
       * post-increment both sides without modifying one variable twice
       * in a single expression. */
      int j = 0;
      const int *s1 = (const int *)s1c;
      const int *s2 = (const int *)s2c;
      int nint_8 = nint - 8;
      while (i <= nint_8 &&
             s1[i++] == s2[j++] &&
             s1[i++] == s2[j++] &&
             s1[i++] == s2[j++] &&
             s1[i++] == s2[j++] &&
             s1[i++] == s2[j++] &&
             s1[i++] == s2[j++] &&
             s1[i++] == s2[j++] &&
             s1[i++] == s2[j++]) { }
      /* i is one past the last int examined.  Stepping back one int lands
       * on the mismatching word if there was one; otherwise it merely
       * re-checks the last matching word byte by byte. */
      i = (i - 1) * word;
    }
#endif
  while (i < n && s1c[i] == s2c[i])
    {
      i++;
    }
  return i;
}
/* Baseline byte-by-byte comparison.
 *
 * Walks both buffers from the start and returns the index of the first
 * byte that differs, or n when the first n bytes all match.  A count of
 * zero or less yields 0. */
static inline int
test1(const char *s1c, const char *s2c, int n)
{
  int pos;
  for (pos = 0; pos < n; pos++)
    {
      if (s1c[pos] != s2c[pos])
        {
          break;
        }
    }
  return pos;
}
/* Benchmark driver.
 *
 * Fills two buffers with identical pseudo-random bytes that differ only in
 * the last compared byte, then times memcmp_fake, test1 and test2 twice:
 * once with both buffers starting at the addresses malloc() returned
 * ("ALIGNED"), and once with the second buffer offset by one byte
 * ("UNALIGNED").
 *
 * Returns 0 on success, EXIT_FAILURE if the buffers cannot be allocated. */
int main(void) {
  char *buf1 = malloc(NUM + 1);
  char *buf2 = malloc(NUM + 1);
  int i;

  if (buf1 == NULL || buf2 == NULL) {
    fprintf(stderr, "failed to allocate benchmark buffers\n");
    free(buf1);
    free(buf2);
    return EXIT_FAILURE;
  }

  for (i = 0; i < NUM; i++) {
    buf1[i] = buf2[i] = (char)rand();
  }
  /* Make the final byte differ with buf1 < buf2.  Incrementing a random
   * byte can wrap 0xFF to 0x00, which reverses the ordering memcmp() sees
   * and trips the assertion in run_test(). */
  buf1[NUM - 1] = 0;
  buf2[NUM - 1] = 1;
  printf ("ALIGNED\n");
  run_test(buf1, buf2, "memcmp", &memcmp_fake);
  run_test(buf1, buf2, "test1", &test1);
  run_test(buf1, buf2, "test2", &test2);

  /* Same data again, but with the second buffer shifted by one byte. */
  for (i = 0; i < NUM; i++) {
    buf1[i] = buf2[i + 1] = (char)rand();
  }
  buf1[NUM - 1] = 0;
  buf2[NUM] = 1;
  printf ("UNALIGNED\n");
  run_test(buf1, buf2+1, "memcmp", &memcmp_fake);
  run_test(buf1, buf2+1, "test1", &test1);
  run_test(buf1, buf2+1, "test2", &test2);

  free(buf1);
  free(buf2);
  return 0;
}
|