1 /******************************************************************************/
2 /* Copyright (c) Tim LaBerge <tim.laberge@quantum.com>, 2009 */
3 /* */
4 /* This program is free software; you can redistribute it and/or modify */
5 /* it under the terms of the GNU General Public License as published by */
6 /* the Free Software Foundation; either version 2 of the License, or */
7 /* (at your option) any later version. */
8 /* */
9 /* This program is distributed in the hope that it will be useful, */
10 /* but WITHOUT ANY WARRANTY; without even the implied warranty of */
11 /* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See */
12 /* the GNU General Public License for more details. */
13 /* */
14 /* You should have received a copy of the GNU General Public License */
15 /* along with this program; if not, write to the Free Software Foundation, */
16 /* Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */
17 /* */
18 /******************************************************************************/
19
20 /******************************************************************************/
21 /* */
22 /* File: dma_thread_diotest7.c */
23 /* */
24 /* Description: The man page for open(2) states the following: */
25 /* O_DIRECT (Since Linux 2.6.10). Try to minimize cache effects of the I/O */
26 /* to and from this file. In general this will degrade performance, but it */
27 /* is useful in special situations, such as when applications do their own */
28 /* caching. File I/O is done directly to/from user space buffers. The I/O is*/
29 /* synchronous, that is, at the completion of a read(2) or write(2), data is*/
30 /* guaranteed to have been transferred. Under Linux 2.4 transfer sizes, and */
31 /* the alignment of user buffer and file offset must all be multiples of */
32 /* the logical block size of the file system. Under Linux 2.6 alignment to */
33 /* 512-byte boundaries suffices. */
34 /* However, it appears that data corruption may occur when a multithreaded */
35 /* process reads into a non-page size aligned user buffer. A test program */
36 /* which reliably reproduces the problem on ext3 and xfs is attached. The */
37 /* program creates, patterns, reads, and verifies a series of files. In the */
38 /* read phase, a file is opened with O_DIRECT n times, where n is the */
39 /* number of cpu's. A single buffer large enough to contain the file is */
40 /* allocated and patterned with data not found in any of the files. The */
41 /* alignment of the buffer is controlled by a command line option. Each file*/
42 /* is read in parallel by n threads, where n is the number of cpu's. Thread */
43 /* 0 reads the first page of data from the file into the first page of the */
44 /* buffer, thread 1 reads the second page of data in to the second page of */
45 /* the buffer, and so on. Thread n - 1 reads the remainder of the file into*/
46 /* the remainder of the buffer. */
47 /* After a thread reads data into the buffer, it immediately verifies that */
48 /* the contents of the buffer are correct. If the buffer contains corrupt */
49 /* data, the thread dumps the data surrounding the corruption and calls */
50 /* abort(). Otherwise, the thread exits. */
51 /* Crucially, before the reader threads are dispatched, another thread is */
52 /* started which calls fork()/msleep() in a loop until all reads are compl- */
53 /* eted. The child created by fork() does nothing but call exit(0). A comm- */
54 /* and line option controls whether the buffer is aligned. In the case wh- */
55 /* ere the buffer is aligned on a page boundary, all is well. In the case */
56 /* where the buffer is aligned on a page + 512 byte offset, corruption is */
57 /* seen frequently. */
58 /* I believe that what is happening is that in the direct IO path, because */
59 /* the user's buffer is not aligned, some user pages are being mapped twice.*/
60 /* When a fork() happens in between the calls to map the page, the page will*/
61 /* be marked as COW. When the second map happens (via get_user_pages()), a */
62 /* new physical page will be allocated and copied. Thus, there is a race */
63 /* between the completion of the first read from disk (and write to the user*/
64 /* page) and get_user_pages() mapping the page for the second time. If the */
65 /* write does not complete before the page is copied, the user will see */
66 /* stale data in the first 512 bytes of this page of their buffer. Indeed, */
67 /* this is corruption most frequently seen. (It's also possible for the race*/
68 /* to be lost the other way, so that the last 3584 bytes of the page are */
69 /* stale.) */
70 /* The attached program (which is a heavily modified version of a program */
71 /* provided by a customer seeing this problem) reliably reproduces the pro- */
72 /* blem on any multicore linux machine on both ext3 and xfs, although any */
73 /* filesystem using the generic blockdev_direct_IO() routine is probably */
74 /* vulnerable. I've seen a few threads that mention the potential for this */
75 /* kind of problem, but no definitive solution or workaround (other than */
76 /* "Don't do that"). */
77 /* http://marc.info/?l=linux-mm&m=122668235304637&w=2 */
78 /* */
79 /* Total Tests: 1 */
80 /* */
81 /* Test Name: dma_thread_diotest7 */
82 /* */
83 /* Author: Tim LaBerge <tim.laberge@quantum.com> */
84 /* */
85 /* History: Reported - Jan 07 2009 - Li Zefan <lizf@cn.fujitsu.com> */
86 /* Ported - Jan 23 2009 - Subrata <subrata@linux.vnet.ibm.com> */
87 /* */
88 /******************************************************************************/
89
90 #define _GNU_SOURCE 1
91
92 #include <stdio.h>
93 #include <stdint.h>
94 #include <stdlib.h>
95 #include <fcntl.h>
96 #include <unistd.h>
97 #include <memory.h>
98 #include <pthread.h>
99 #include <getopt.h>
100 #include <errno.h>
101 #include <sys/types.h>
102 #include <sys/wait.h>
103 #include <sys/mount.h>
104
105 #include "test.h"
106 #include "safe_macros.h"
107
/* Size in bytes of each test file, and of the chunk read into the buffer per pass. */
#define FILESIZE	(12 * 1024 * 1024)
#define READSIZE	(1024 * 1024)

/* Number of test files; file n (1..FILECOUNT) is filled with byte value n. */
#define FILECOUNT	100

/* Fill byte written to the read buffer before each pass. */
#define PATTERN		(0xfa)

/* Directory the test files live in, and the printf template for their names. */
#define MNT_POINT	"mntpoint"
#define FILE_BASEPATH	MNT_POINT "/_dma_thread_test_%.04d.tmp"

/* Permissions used when creating MNT_POINT: rwx for owner, r-x for group and others. */
#define DIR_MODE	(S_IRUSR | S_IWUSR | S_IXUSR | \
			 S_IRGRP | S_IXGRP | \
			 S_IROTH | S_IXOTH)

/* Evaluated at run time on every use. */
#define PAGE_SIZE	getpagesize()

/* Bounds on the number of reader threads; the upper bound is one page per worker. */
#define MIN_WORKERS	2
#define MAX_WORKERS	(READSIZE / PAGE_SIZE)
120
121 char *TCID = "dma_thread_diotest";
122 int TST_TOTAL = 1;
123
124 static void setup(void);
125 static void dma_thread_diotest_verify(void);
126 static void cleanup(void);
127 static void help(void);
128
129 static unsigned char *buffer;
130
131 static char *align_str;
132 static int align;
133 static char *workers_str;
134 static int workers;
135 static char *device;
136 static int mount_flag;
137 static option_t options[] = {
138 {"a:", NULL, &align_str},
139 {"w:", NULL, &workers_str},
140 {NULL, NULL, NULL}
141 };
142
143 static volatile int done;
144 static volatile int tst_result;
145
/* Per-thread description of one slice of the file to read and verify. */
typedef struct worker {
	pthread_t tid;		/* thread handle filled in by pthread_create() */
	int worker_number;	/* index of this worker, used in error messages */
	int fd;			/* descriptor this worker reads from */
	int offset;		/* file offset the slice starts at */
	int length;		/* number of bytes to read */
	int pattern;		/* byte value every byte of the slice must have */
	unsigned char *buffer;	/* where the slice is read to */
} worker_t;

/* Array of worker descriptors, allocated in setup(). */
static worker_t *worker;
156
worker_thread(void * arg)157 static void *worker_thread(void *arg)
158 {
159 int i, k;
160 int nread;
161 worker_t *worker = (worker_t *)arg;
162 int offset = worker->offset;
163 int fd = worker->fd;
164 unsigned char *buffer = worker->buffer;
165 int pattern = worker->pattern;
166 int length = worker->length;
167
168 if (lseek(fd, offset, SEEK_SET) < 0) {
169 fprintf(stderr, "Failed to lseek to %d on fd %d: %s.\n",
170 offset, fd, strerror(errno));
171 return (void *) 1;
172 }
173
174 nread = read(fd, buffer, length);
175 if (nread == -1 || nread != length) {
176 fprintf(stderr, "read failed in worker thread%d: %s",
177 worker->worker_number, strerror(errno));
178 return (void *) 1;
179 }
180
181 /* Corruption check */
182 for (i = 0; i < length; i++) {
183 if (buffer[i] != pattern) {
184 printf("Bad data at 0x%.06x: %p, \n", i, buffer + i);
185 printf("Data dump starting at 0x%.06x:\n", i - 8);
186 printf("Expect 0x%x followed by 0x%x:\n",
187 pattern, PATTERN);
188
189 for (k = 0; k < 16; k++) {
190 printf("%02x ", buffer[i - 8 + k]);
191 if (k == 7) {
192 printf("\n");
193 }
194 }
195
196 printf("\n");
197 tst_result = 1;
198 return NULL;
199 }
200 }
201
202 return NULL;
203 }
204
fork_thread(void * arg)205 static void *fork_thread(void *arg)
206 {
207 pid_t pid;
208
209 (void) arg;
210
211 while (!done) {
212 pid = tst_fork();
213 if (pid == 0) {
214 exit(0);
215 } else if (pid < 0) {
216 fprintf(stderr, "Failed to fork child: %s.\n",
217 strerror(errno));
218 return (void *) 1;
219 }
220 waitpid(pid, NULL, 0);
221 usleep(100);
222 }
223
224 return NULL;
225 }
226
main(int argc,char * argv[])227 int main(int argc, char *argv[])
228 {
229 int i, lc;
230
231 workers = sysconf(_SC_NPROCESSORS_ONLN);
232 if (workers > MAX_WORKERS)
233 workers = MAX_WORKERS;
234 tst_parse_opts(argc, argv, options, help);
235
236 setup();
237
238 for (lc = 0; TEST_LOOPING(lc); lc++) {
239 tst_count = 0;
240
241 for (i = 0; i < TST_TOTAL; i++)
242 dma_thread_diotest_verify();
243 }
244
245 cleanup();
246 tst_exit();
247 }
248
dma_thread_diotest_verify(void)249 static void dma_thread_diotest_verify(void)
250 {
251 int n, j, offset, rc;
252 void *retval;
253 char filename[PATH_MAX];
254 pthread_t fork_tid;
255
256 tst_result = 0;
257
258 for (n = 1; n <= FILECOUNT; n++) {
259 snprintf(filename, sizeof(filename), FILE_BASEPATH, n);
260 for (j = 0; j < workers; j++) {
261 worker[j].fd = SAFE_OPEN(cleanup, filename,
262 O_RDONLY | O_DIRECT);
263 worker[j].pattern = n;
264 }
265
266 tst_resm(TINFO, "Reading file %d.", n);
267
268 for (offset = 0; offset < FILESIZE; offset += READSIZE) {
269 memset(buffer, PATTERN, READSIZE + align);
270 for (j = 0; j < workers; j++) {
271 worker[j].offset = offset + j * PAGE_SIZE;
272 worker[j].buffer =
273 buffer + align + j * PAGE_SIZE;
274 worker[j].length = PAGE_SIZE;
275 }
276 /* The final worker reads whatever is left over. */
277 worker[workers - 1].length =
278 READSIZE - PAGE_SIZE * (workers - 1);
279
280 done = 0;
281
282 rc = pthread_create(&fork_tid, NULL, fork_thread, NULL);
283 if (rc != 0) {
284 tst_brkm(TBROK, cleanup, "pthread_create "
285 "failed: %s", strerror(rc));
286 }
287
288 for (j = 0; j < workers; j++) {
289 rc = pthread_create(&worker[j].tid, NULL,
290 worker_thread, worker + j);
291 if (rc != 0) {
292 tst_brkm(TBROK, cleanup, "Can't create"
293 "worker thread %d: %s",
294 j, strerror(rc));
295 }
296 }
297
298 for (j = 0; j < workers; j++) {
299 rc = pthread_join(worker[j].tid, &retval);
300 if (rc != 0) {
301 tst_brkm(TBROK, cleanup, "Failed to "
302 "join worker thread %d: %s.",
303 j, strerror(rc));
304 }
305 if ((intptr_t)retval != 0) {
306 tst_brkm(TBROK, cleanup, "there is"
307 "some errors in worker[%d],"
308 "return value: %ld",
309 j, (intptr_t)retval);
310 }
311 }
312
313 /* Let the fork thread know it's ok to exit */
314 done = 1;
315
316 rc = pthread_join(fork_tid, &retval);
317 if (rc != 0) {
318 tst_brkm(TBROK, cleanup,
319 "Failed to join fork thread: %s.",
320 strerror(rc));
321 }
322 if ((intptr_t)retval != 0) {
323 tst_brkm(TBROK, cleanup,
324 "fork() failed in fork thread:"
325 "return value: %ld", (intptr_t)retval);
326 }
327 }
328
329 /* Close the fd's for the next file. */
330 for (j = 0; j < workers; j++)
331 SAFE_CLOSE(cleanup, worker[j].fd);
332 if (tst_result)
333 break;
334 }
335
336 if (tst_result)
337 tst_resm(TFAIL, "data corruption is detected");
338 else
339 tst_resm(TPASS, "data corruption is not detected");
340 }
341
setup(void)342 static void setup(void)
343 {
344 char filename[PATH_MAX];
345 int n, j, fd, directflag = 1;
346 long type;
347
348 if (align_str) {
349 align = atoi(align_str);
350 if (align < 0 || align > PAGE_SIZE)
351 tst_brkm(TCONF, NULL, "Bad alignment %d.", align);
352 }
353 tst_resm(TINFO, "using alignment %d", align);
354
355 if (workers_str) {
356 workers = atoi(workers_str);
357 if (workers < MIN_WORKERS || workers > MAX_WORKERS) {
358 tst_brkm(TCONF, NULL, "Worker count %d not between "
359 "%d and %d, inclusive",
360 workers, MIN_WORKERS, MAX_WORKERS);
361 }
362 }
363 tst_resm(TINFO, "using %d workers.", workers);
364
365 tst_sig(FORK, DEF_HANDLER, NULL);
366 tst_require_root();
367
368 TEST_PAUSE;
369
370 tst_tmpdir();
371
372 /*
373 * Some file systems may not implement the O_DIRECT flag and open() will
374 * fail with EINVAL if it is used. So add this check for current
375 * filesystem current directory is in, if not supported, we choose to
376 * have this test in LTP_BIG_DEV and mkfs it as ext3.
377 */
378 fd = open("testfile", O_CREAT | O_DIRECT, 0644);
379 if (fd < 0 && errno == EINVAL) {
380 type = tst_fs_type(NULL, ".");
381 tst_resm(TINFO, "O_DIRECT flag is not supported on %s "
382 "filesystem", tst_fs_type_name(type));
383 directflag = 0;
384 } else if (fd > 0) {
385 SAFE_CLOSE(NULL, fd);
386 }
387
388 SAFE_MKDIR(cleanup, MNT_POINT, DIR_MODE);
389
390 /*
391 * verify whether the current directory has enough free space,
392 * if it is not satisfied, we will use the LTP_BIG_DEV, which
393 * will be exported by runltp with "-z" option.
394 */
395 if (!directflag || !tst_fs_has_free(NULL, ".", 1300, TST_MB)) {
396 device = getenv("LTP_BIG_DEV");
397 if (device == NULL) {
398 tst_brkm(TCONF, NULL,
399 "you must specify a big blockdevice(>1.3G)");
400 } else {
401 tst_mkfs(NULL, device, "ext3", NULL, NULL);
402 }
403
404 if (mount(device, MNT_POINT, "ext3", 0, NULL) < 0) {
405 tst_brkm(TBROK | TERRNO, NULL,
406 "mount device:%s failed", device);
407 }
408 mount_flag = 1;
409 }
410
411 worker = SAFE_MALLOC(cleanup, workers * sizeof(worker_t));
412
413 for (j = 0; j < workers; j++)
414 worker[j].worker_number = j;
415
416 for (n = 1; n <= FILECOUNT; n++) {
417 snprintf(filename, sizeof(filename), FILE_BASEPATH, n);
418
419 if (tst_fill_file(filename, n, FILESIZE, 1)) {
420 tst_brkm(TBROK, cleanup, "failed to create file: %s",
421 filename);
422 }
423 }
424
425 if (posix_memalign((void **)&buffer, PAGE_SIZE, READSIZE + align) != 0)
426 tst_brkm(TBROK, cleanup, "call posix_memalign failed");
427 }
428
cleanup(void)429 static void cleanup(void)
430 {
431 free(buffer);
432
433 if (mount_flag && tst_umount(MNT_POINT) < 0)
434 tst_resm(TWARN | TERRNO, "umount device:%s failed", device);
435
436 free(worker);
437
438 tst_rmdir();
439 }
440
help(void)441 static void help(void)
442 {
443 printf("-a align read buffer to offset <alignment>.\n");
444 printf("-w number of worker threads, 2 (default) to %d,"
445 " defaults to number of cores.\n", MAX_WORKERS);
446 }
447