Monday, January 25, 2010

Zero Copy - Mapping buffer into pages for disk IO

I needed to implement zero-copy for a block device driver. It turned out that a lot of the IO in the driver went through buffers, and previously each IO involved allocating pages and copying data between those pages and the buffer. This naturally ate up a lot of CPU and needed improvement.

While implementing this I did not find good example code in the Linux kernel, so I ended up spending some time on investigation. Some issues to consider:

1) Don't assume that all memory was kmalloc'ed. Use is_vmalloc_addr() to check which type of memory the buffer is.
2) On some architectures, even kmalloc allocations will cross page boundaries

void buffer_disk_io(struct my_req * req)
{
        int is_vmalloc;
        int count;
        unsigned int len;
        struct bio * bio;
        struct page * pg;
        unsigned int offset;
        unsigned int sector;
        void *addr;

        count = (req->num_sectors-1) / (PAGE_SIZE / SECTOR_SIZE) + 1;

        /* The buffer may not start from page boundary in some cases
         * but it can cross page boundaries */
        offset = offset_in_page(req->buffer);
        if (offset && (offset + (req->num_sectors << 9)) > PAGE_SIZE)
                count++;

        bio = bio_alloc(GFP_NOIO, count);
        if (bio == NULL) {
                req->status.syserr = -ENOMEM;
                req->status.code = DS_ERR_UNKNOWN;
                return;
        }

        bio->bi_bdev = req->path->bdev;
        bio->bi_sector = req->start_sector;

        bio->bi_private = req;
        bio->bi_end_io = __end_io_indirect;


        /* Check if memory is vmalloc'ed or kmalloc'ed */
        is_vmalloc = is_vmalloc_addr(req->buffer);

        sector = 0;
        while (sector < req->num_sectors) {
                addr = (req->buffer + (sector << 9));

                if (is_vmalloc) {
                        pg = vmalloc_to_page(addr);
                } else {
                        pg = virt_to_page(addr);
                        get_page(pg);
                }

                offset = offset_in_page(addr);

                /* Consider case when offset in not on page boundary and it may
                 * or may not cross page boundaries */
                if ((req->num_sectors - sector) >= (PAGE_SIZE / DS_SECTOR_SIZE))
                        len = PAGE_SIZE - offset;
                else if (offset + ((req->num_sectors - sector) << 9) < PAGE_SIZE)
                        len = (req->num_sectors - sector) << 9;
                else
                        len = PAGE_SIZE - offset;

                if (!bio_add_page(bio, pg, len, offset))
                        goto failed;

                sector += (len >> 9);
        }

        /* set command */
        if (req->cmd == IO_READ) {
                bio->bi_rw = READ;
        } else if(req->cmd == IO_WRITE) {
                bio->bi_rw = WRITE;
        }
        bio->bi_rw |= (1UL<

        generic_make_request(bio);

        return;


failed:
        if(bio != NULL)
                __end_io_indirect(bio, 0, -ENOMEM);

        return;
}

No comments: