commit 10dce8af34226d90fa56746a934f8da5dcdba3df upstream. Commit 9c225f2655e3 ("vfs: atomic f_pos accesses as per POSIX") added locking for file.f_pos access and in particular made concurrent read and write not possible - now both those functions take f_pos lock for the whole run, and so if e.g. a read is blocked waiting for data, write will deadlock waiting for that read to complete. This caused regression for stream-like files where previously read and write could run simultaneously, but after that patch could not do so anymore. See e.g. commit 581d21a2d02a ("xenbus: fix deadlock on writes to /proc/xen/xenbus") which fixes such regression for particular case of /proc/xen/xenbus. The patch that added f_pos lock in 2014 did so to guarantee POSIX thread safety for read/write/lseek and added the locking to file descriptors of all regular files. In 2014 that thread-safety problem was not new as it was already discussed earlier in 2006. However even though 2006'th version of Linus's patch was adding f_pos locking "only for files that are marked seekable with FMODE_LSEEK (thus avoiding the stream-like objects like pipes and sockets)", the 2014 version - the one that actually made it into the tree as 9c225f2655e3 - is doing so irregardless of whether a file is seekable or not. See https://lore.kernel.org/lkml/53022DB1.4070805@gmail.com/ https://lwn.net/Articles/180387 https://lwn.net/Articles/180396 for historic context. The reason that it did so is, probably, that there are many files that are marked non-seekable, but e.g. their read implementation actually depends on knowing current position to correctly handle the read. Some examples: kernel/power/user.c snapshot_read fs/debugfs/file.c u32_array_read fs/fuse/control.c fuse_conn_waiting_read + ... drivers/hwmon/asus_atk0110.c atk_debugfs_ggrp_read arch/s390/hypfs/inode.c hypfs_read_iter ... Despite that, many nonseekable_open users implement read and write with pure stream semantics - they don't depend on passed ppos at all. And for those cases where read could wait for something inside, it creates a situation similar to xenbus - the write could be never made to go until read is done, and read is waiting for some, potentially external, event, for potentially unbounded time -> deadlock. Besides xenbus, there are 14 such places in the kernel that I've found with semantic patch (see below): drivers/xen/evtchn.c:667:8-24: ERROR: evtchn_fops: .read() can deadlock .write() drivers/isdn/capi/capi.c:963:8-24: ERROR: capi_fops: .read() can deadlock .write() drivers/input/evdev.c:527:1-17: ERROR: evdev_fops: .read() can deadlock .write() drivers/char/pcmcia/cm4000_cs.c:1685:7-23: ERROR: cm4000_fops: .read() can deadlock .write() net/rfkill/core.c:1146:8-24: ERROR: rfkill_fops: .read() can deadlock .write() drivers/s390/char/fs3270.c:488:1-17: ERROR: fs3270_fops: .read() can deadlock .write() drivers/usb/misc/ldusb.c:310:1-17: ERROR: ld_usb_fops: .read() can deadlock .write() drivers/hid/uhid.c:635:1-17: ERROR: uhid_fops: .read() can deadlock .write() net/batman-adv/icmp_socket.c:80:1-17: ERROR: batadv_fops: .read() can deadlock .write() drivers/media/rc/lirc_dev.c:198:1-17: ERROR: lirc_fops: .read() can deadlock .write() drivers/leds/uleds.c:77:1-17: ERROR: uleds_fops: .read() can deadlock .write() drivers/input/misc/uinput.c:400:1-17: ERROR: uinput_fops: .read() can deadlock .write() drivers/infiniband/core/user_mad.c:985:7-23: ERROR: umad_fops: .read() can deadlock .write() drivers/gnss/core.c:45:1-17: ERROR: gnss_fops: .read() can deadlock .write() In addition to the cases above another regression caused by f_pos locking is that now FUSE filesystems that implement open with FOPEN_NONSEEKABLE flag, can no longer implement bidirectional stream-like files - for the same reason as above e.g. read can deadlock write locking on file.f_pos in the kernel. FUSE's FOPEN_NONSEEKABLE was added in 2008 in a7c1b990f715 ("fuse: implement nonseekable open") to support OSSPD. OSSPD implements /dev/dsp in userspace with FOPEN_NONSEEKABLE flag, with corresponding read and write routines not depending on current position at all, and with both read and write being potentially blocking operations: See https://github.com/libfuse/osspd https://lwn.net/Articles/308445 https://github.com/libfuse/osspd/blob/14a9cff0/osspd.c#L1406 https://github.com/libfuse/osspd/blob/14a9cff0/osspd.c#L1438-L1477 https://github.com/libfuse/osspd/blob/14a9cff0/osspd.c#L1479-L1510 Corresponding libfuse example/test also describes FOPEN_NONSEEKABLE as "somewhat pipe-like files ..." with read handler not using offset. However that test implements only read without write and cannot exercise the deadlock scenario: https://github.com/libfuse/libfuse/blob/fuse-3.4.2-3-ga1bff7d/example/poll.c#L124-L131 https://github.com/libfuse/libfuse/blob/fuse-3.4.2-3-ga1bff7d/example/poll.c#L146-L163 https://github.com/libfuse/libfuse/blob/fuse-3.4.2-3-ga1bff7d/example/poll.c#L209-L216 I've actually hit the read vs write deadlock for real while implementing my FUSE filesystem where there is /head/watch file, for which open creates separate bidirectional socket-like stream in between filesystem and its user with both read and write being later performed simultaneously. And there it is semantically not easy to split the stream into two separate read-only and write-only channels: https://lab.nexedi.com/kirr/wendelin.core/blob/f13aa600/wcfs/wcfs.go#L88-169 Let's fix this regression. The plan is: 1. We can't change nonseekable_open to include &~FMODE_ATOMIC_POS - doing so would break many in-kernel nonseekable_open users which actually use ppos in read/write handlers. 2. Add stream_open() to kernel to open stream-like non-seekable file descriptors. Read and write on such file descriptors would never use nor change ppos. And with that property on stream-like files read and write will be running without taking f_pos lock - i.e. read and write could be running simultaneously. 3. With semantic patch search and convert to stream_open all in-kernel nonseekable_open users for which read and write actually do not depend on ppos and where there is no other methods in file_operations which assume @offset access. 4. Add FOPEN_STREAM to fs/fuse/ and open in-kernel file-descriptors via steam_open if that bit is present in filesystem open reply. It was tempting to change fs/fuse/ open handler to use stream_open instead of nonseekable_open on just FOPEN_NONSEEKABLE flags, but grepping through Debian codesearch shows users of FOPEN_NONSEEKABLE, and in particular GVFS which actually uses offset in its read and write handlers https://codesearch.debian.net/search?q=-%3Enonseekable+%3D https://gitlab.gnome.org/GNOME/gvfs/blob/1.40.0-6-gcbc54396/client/gvfsfusedaemon.c#L1080 https://gitlab.gnome.org/GNOME/gvfs/blob/1.40.0-6-gcbc54396/client/gvfsfusedaemon.c#L1247-1346 https://gitlab.gnome.org/GNOME/gvfs/blob/1.40.0-6-gcbc54396/client/gvfsfusedaemon.c#L1399-1481 so if we would do such a change it will break a real user. 5. Add stream_open and FOPEN_STREAM handling to stable kernels starting from v3.14+ (the kernel where 9c225f2655 first appeared). This will allow to patch OSSPD and other FUSE filesystems that provide stream-like files to return FOPEN_STREAM | FOPEN_NONSEEKABLE in their open handler and this way avoid the deadlock on all kernel versions. This should work because fs/fuse/ ignores unknown open flags returned from a filesystem and so passing FOPEN_STREAM to a kernel that is not aware of this flag cannot hurt. In turn the kernel that is not aware of FOPEN_STREAM will be < v3.14 where just FOPEN_NONSEEKABLE is sufficient to implement streams without read vs write deadlock. This patch adds stream_open, converts /proc/xen/xenbus to it and adds semantic patch to automatically locate in-kernel places that are either required to be converted due to read vs write deadlock, or that are just safe to be converted because read and write do not use ppos and there are no other funky methods in file_operations. Regarding semantic patch I've verified each generated change manually - that it is correct to convert - and each other nonseekable_open instance left - that it is either not correct to convert there, or that it is not converted due to current stream_open.cocci limitations. The script also does not convert files that should be valid to convert, but that currently have .llseek = noop_llseek or generic_file_llseek for unknown reason despite file being opened with nonseekable_open (e.g. drivers/input/mousedev.c) Cc: Michael Kerrisk <mtk.manpages@gmail.com> Cc: Yongzhi Pan <panyongzhi@gmail.com> Cc: Jonathan Corbet <corbet@lwn.net> Cc: David Vrabel <david.vrabel@citrix.com> Cc: Juergen Gross <jgross@suse.com> Cc: Miklos Szeredi <miklos@szeredi.hu> Cc: Tejun Heo <tj@kernel.org> Cc: Kirill Tkhai <ktkhai@virtuozzo.com> Cc: Arnd Bergmann <arnd@arndb.de> Cc: Christoph Hellwig <hch@lst.de> Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Cc: Julia Lawall <Julia.Lawall@lip6.fr> Cc: Nikolaus Rath <Nikolaus@rath.org> Cc: Han-Wen Nienhuys <hanwen@google.com> [ backport to 4.4: actually fixed deadlock on /proc/xen/xenbus as 581d21a2d02a was not backported to 4.4 ] Signed-off-by: Kirill Smelkov <kirr@nexedi.com> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
1333 lines
29 KiB
C
1333 lines
29 KiB
C
/*
|
|
* linux/fs/read_write.c
|
|
*
|
|
* Copyright (C) 1991, 1992 Linus Torvalds
|
|
*/
|
|
|
|
#include <linux/slab.h>
|
|
#include <linux/stat.h>
|
|
#include <linux/fcntl.h>
|
|
#include <linux/file.h>
|
|
#include <linux/uio.h>
|
|
#include <linux/fsnotify.h>
|
|
#include <linux/security.h>
|
|
#include <linux/export.h>
|
|
#include <linux/syscalls.h>
|
|
#include <linux/pagemap.h>
|
|
#include <linux/splice.h>
|
|
#include <linux/compat.h>
|
|
#include "internal.h"
|
|
|
|
#include <asm/uaccess.h>
|
|
#include <asm/unistd.h>
|
|
|
|
typedef ssize_t (*io_fn_t)(struct file *, char __user *, size_t, loff_t *);
|
|
typedef ssize_t (*iter_fn_t)(struct kiocb *, struct iov_iter *);
|
|
|
|
const struct file_operations generic_ro_fops = {
|
|
.llseek = generic_file_llseek,
|
|
.read_iter = generic_file_read_iter,
|
|
.mmap = generic_file_readonly_mmap,
|
|
.splice_read = generic_file_splice_read,
|
|
};
|
|
|
|
EXPORT_SYMBOL(generic_ro_fops);
|
|
|
|
static inline int unsigned_offsets(struct file *file)
|
|
{
|
|
return file->f_mode & FMODE_UNSIGNED_OFFSET;
|
|
}
|
|
|
|
/**
|
|
* vfs_setpos - update the file offset for lseek
|
|
* @file: file structure in question
|
|
* @offset: file offset to seek to
|
|
* @maxsize: maximum file size
|
|
*
|
|
* This is a low-level filesystem helper for updating the file offset to
|
|
* the value specified by @offset if the given offset is valid and it is
|
|
* not equal to the current file offset.
|
|
*
|
|
* Return the specified offset on success and -EINVAL on invalid offset.
|
|
*/
|
|
loff_t vfs_setpos(struct file *file, loff_t offset, loff_t maxsize)
|
|
{
|
|
if (offset < 0 && !unsigned_offsets(file))
|
|
return -EINVAL;
|
|
if (offset > maxsize)
|
|
return -EINVAL;
|
|
|
|
if (offset != file->f_pos) {
|
|
file->f_pos = offset;
|
|
file->f_version = 0;
|
|
}
|
|
return offset;
|
|
}
|
|
EXPORT_SYMBOL(vfs_setpos);
|
|
|
|
/**
|
|
* generic_file_llseek_size - generic llseek implementation for regular files
|
|
* @file: file structure to seek on
|
|
* @offset: file offset to seek to
|
|
* @whence: type of seek
|
|
* @size: max size of this file in file system
|
|
* @eof: offset used for SEEK_END position
|
|
*
|
|
* This is a variant of generic_file_llseek that allows passing in a custom
|
|
* maximum file size and a custom EOF position, for e.g. hashed directories
|
|
*
|
|
* Synchronization:
|
|
* SEEK_SET and SEEK_END are unsynchronized (but atomic on 64bit platforms)
|
|
* SEEK_CUR is synchronized against other SEEK_CURs, but not read/writes.
|
|
* read/writes behave like SEEK_SET against seeks.
|
|
*/
|
|
loff_t
|
|
generic_file_llseek_size(struct file *file, loff_t offset, int whence,
|
|
loff_t maxsize, loff_t eof)
|
|
{
|
|
switch (whence) {
|
|
case SEEK_END:
|
|
offset += eof;
|
|
break;
|
|
case SEEK_CUR:
|
|
/*
|
|
* Here we special-case the lseek(fd, 0, SEEK_CUR)
|
|
* position-querying operation. Avoid rewriting the "same"
|
|
* f_pos value back to the file because a concurrent read(),
|
|
* write() or lseek() might have altered it
|
|
*/
|
|
if (offset == 0)
|
|
return file->f_pos;
|
|
/*
|
|
* f_lock protects against read/modify/write race with other
|
|
* SEEK_CURs. Note that parallel writes and reads behave
|
|
* like SEEK_SET.
|
|
*/
|
|
spin_lock(&file->f_lock);
|
|
offset = vfs_setpos(file, file->f_pos + offset, maxsize);
|
|
spin_unlock(&file->f_lock);
|
|
return offset;
|
|
case SEEK_DATA:
|
|
/*
|
|
* In the generic case the entire file is data, so as long as
|
|
* offset isn't at the end of the file then the offset is data.
|
|
*/
|
|
if ((unsigned long long)offset >= eof)
|
|
return -ENXIO;
|
|
break;
|
|
case SEEK_HOLE:
|
|
/*
|
|
* There is a virtual hole at the end of the file, so as long as
|
|
* offset isn't i_size or larger, return i_size.
|
|
*/
|
|
if ((unsigned long long)offset >= eof)
|
|
return -ENXIO;
|
|
offset = eof;
|
|
break;
|
|
}
|
|
|
|
return vfs_setpos(file, offset, maxsize);
|
|
}
|
|
EXPORT_SYMBOL(generic_file_llseek_size);
|
|
|
|
/**
|
|
* generic_file_llseek - generic llseek implementation for regular files
|
|
* @file: file structure to seek on
|
|
* @offset: file offset to seek to
|
|
* @whence: type of seek
|
|
*
|
|
* This is a generic implemenation of ->llseek useable for all normal local
|
|
* filesystems. It just updates the file offset to the value specified by
|
|
* @offset and @whence.
|
|
*/
|
|
loff_t generic_file_llseek(struct file *file, loff_t offset, int whence)
|
|
{
|
|
struct inode *inode = file->f_mapping->host;
|
|
|
|
return generic_file_llseek_size(file, offset, whence,
|
|
inode->i_sb->s_maxbytes,
|
|
i_size_read(inode));
|
|
}
|
|
EXPORT_SYMBOL(generic_file_llseek);
|
|
|
|
/**
|
|
* fixed_size_llseek - llseek implementation for fixed-sized devices
|
|
* @file: file structure to seek on
|
|
* @offset: file offset to seek to
|
|
* @whence: type of seek
|
|
* @size: size of the file
|
|
*
|
|
*/
|
|
loff_t fixed_size_llseek(struct file *file, loff_t offset, int whence, loff_t size)
|
|
{
|
|
switch (whence) {
|
|
case SEEK_SET: case SEEK_CUR: case SEEK_END:
|
|
return generic_file_llseek_size(file, offset, whence,
|
|
size, size);
|
|
default:
|
|
return -EINVAL;
|
|
}
|
|
}
|
|
EXPORT_SYMBOL(fixed_size_llseek);
|
|
|
|
/**
|
|
* noop_llseek - No Operation Performed llseek implementation
|
|
* @file: file structure to seek on
|
|
* @offset: file offset to seek to
|
|
* @whence: type of seek
|
|
*
|
|
* This is an implementation of ->llseek useable for the rare special case when
|
|
* userspace expects the seek to succeed but the (device) file is actually not
|
|
* able to perform the seek. In this case you use noop_llseek() instead of
|
|
* falling back to the default implementation of ->llseek.
|
|
*/
|
|
loff_t noop_llseek(struct file *file, loff_t offset, int whence)
|
|
{
|
|
return file->f_pos;
|
|
}
|
|
EXPORT_SYMBOL(noop_llseek);
|
|
|
|
loff_t no_llseek(struct file *file, loff_t offset, int whence)
|
|
{
|
|
return -ESPIPE;
|
|
}
|
|
EXPORT_SYMBOL(no_llseek);
|
|
|
|
loff_t default_llseek(struct file *file, loff_t offset, int whence)
|
|
{
|
|
struct inode *inode = file_inode(file);
|
|
loff_t retval;
|
|
|
|
mutex_lock(&inode->i_mutex);
|
|
switch (whence) {
|
|
case SEEK_END:
|
|
offset += i_size_read(inode);
|
|
break;
|
|
case SEEK_CUR:
|
|
if (offset == 0) {
|
|
retval = file->f_pos;
|
|
goto out;
|
|
}
|
|
offset += file->f_pos;
|
|
break;
|
|
case SEEK_DATA:
|
|
/*
|
|
* In the generic case the entire file is data, so as
|
|
* long as offset isn't at the end of the file then the
|
|
* offset is data.
|
|
*/
|
|
if (offset >= inode->i_size) {
|
|
retval = -ENXIO;
|
|
goto out;
|
|
}
|
|
break;
|
|
case SEEK_HOLE:
|
|
/*
|
|
* There is a virtual hole at the end of the file, so
|
|
* as long as offset isn't i_size or larger, return
|
|
* i_size.
|
|
*/
|
|
if (offset >= inode->i_size) {
|
|
retval = -ENXIO;
|
|
goto out;
|
|
}
|
|
offset = inode->i_size;
|
|
break;
|
|
}
|
|
retval = -EINVAL;
|
|
if (offset >= 0 || unsigned_offsets(file)) {
|
|
if (offset != file->f_pos) {
|
|
file->f_pos = offset;
|
|
file->f_version = 0;
|
|
}
|
|
retval = offset;
|
|
}
|
|
out:
|
|
mutex_unlock(&inode->i_mutex);
|
|
return retval;
|
|
}
|
|
EXPORT_SYMBOL(default_llseek);
|
|
|
|
loff_t vfs_llseek(struct file *file, loff_t offset, int whence)
|
|
{
|
|
loff_t (*fn)(struct file *, loff_t, int);
|
|
|
|
fn = no_llseek;
|
|
if (file->f_mode & FMODE_LSEEK) {
|
|
if (file->f_op->llseek)
|
|
fn = file->f_op->llseek;
|
|
}
|
|
return fn(file, offset, whence);
|
|
}
|
|
EXPORT_SYMBOL(vfs_llseek);
|
|
|
|
static inline struct fd fdget_pos(int fd)
|
|
{
|
|
return __to_fd(__fdget_pos(fd));
|
|
}
|
|
|
|
static inline void fdput_pos(struct fd f)
|
|
{
|
|
if (f.flags & FDPUT_POS_UNLOCK)
|
|
mutex_unlock(&f.file->f_pos_lock);
|
|
fdput(f);
|
|
}
|
|
|
|
SYSCALL_DEFINE3(lseek, unsigned int, fd, off_t, offset, unsigned int, whence)
|
|
{
|
|
off_t retval;
|
|
struct fd f = fdget_pos(fd);
|
|
if (!f.file)
|
|
return -EBADF;
|
|
|
|
retval = -EINVAL;
|
|
if (whence <= SEEK_MAX) {
|
|
loff_t res = vfs_llseek(f.file, offset, whence);
|
|
retval = res;
|
|
if (res != (loff_t)retval)
|
|
retval = -EOVERFLOW; /* LFS: should only happen on 32 bit platforms */
|
|
}
|
|
fdput_pos(f);
|
|
return retval;
|
|
}
|
|
|
|
#ifdef CONFIG_COMPAT
|
|
COMPAT_SYSCALL_DEFINE3(lseek, unsigned int, fd, compat_off_t, offset, unsigned int, whence)
|
|
{
|
|
return sys_lseek(fd, offset, whence);
|
|
}
|
|
#endif
|
|
|
|
#ifdef __ARCH_WANT_SYS_LLSEEK
|
|
SYSCALL_DEFINE5(llseek, unsigned int, fd, unsigned long, offset_high,
|
|
unsigned long, offset_low, loff_t __user *, result,
|
|
unsigned int, whence)
|
|
{
|
|
int retval;
|
|
struct fd f = fdget_pos(fd);
|
|
loff_t offset;
|
|
|
|
if (!f.file)
|
|
return -EBADF;
|
|
|
|
retval = -EINVAL;
|
|
if (whence > SEEK_MAX)
|
|
goto out_putf;
|
|
|
|
offset = vfs_llseek(f.file, ((loff_t) offset_high << 32) | offset_low,
|
|
whence);
|
|
|
|
retval = (int)offset;
|
|
if (offset >= 0) {
|
|
retval = -EFAULT;
|
|
if (!copy_to_user(result, &offset, sizeof(offset)))
|
|
retval = 0;
|
|
}
|
|
out_putf:
|
|
fdput_pos(f);
|
|
return retval;
|
|
}
|
|
#endif
|
|
|
|
ssize_t vfs_iter_read(struct file *file, struct iov_iter *iter, loff_t *ppos)
|
|
{
|
|
struct kiocb kiocb;
|
|
ssize_t ret;
|
|
|
|
if (!file->f_op->read_iter)
|
|
return -EINVAL;
|
|
|
|
init_sync_kiocb(&kiocb, file);
|
|
kiocb.ki_pos = *ppos;
|
|
|
|
iter->type |= READ;
|
|
ret = file->f_op->read_iter(&kiocb, iter);
|
|
BUG_ON(ret == -EIOCBQUEUED);
|
|
if (ret > 0)
|
|
*ppos = kiocb.ki_pos;
|
|
return ret;
|
|
}
|
|
EXPORT_SYMBOL(vfs_iter_read);
|
|
|
|
ssize_t vfs_iter_write(struct file *file, struct iov_iter *iter, loff_t *ppos)
|
|
{
|
|
struct kiocb kiocb;
|
|
ssize_t ret;
|
|
|
|
if (!file->f_op->write_iter)
|
|
return -EINVAL;
|
|
|
|
init_sync_kiocb(&kiocb, file);
|
|
kiocb.ki_pos = *ppos;
|
|
|
|
iter->type |= WRITE;
|
|
ret = file->f_op->write_iter(&kiocb, iter);
|
|
BUG_ON(ret == -EIOCBQUEUED);
|
|
if (ret > 0) {
|
|
*ppos = kiocb.ki_pos;
|
|
fsnotify_modify(file);
|
|
}
|
|
return ret;
|
|
}
|
|
EXPORT_SYMBOL(vfs_iter_write);
|
|
|
|
/*
|
|
* rw_verify_area doesn't like huge counts. We limit
|
|
* them to something that fits in "int" so that others
|
|
* won't have to do range checks all the time.
|
|
*/
|
|
int rw_verify_area(int read_write, struct file *file, const loff_t *ppos, size_t count)
|
|
{
|
|
struct inode *inode;
|
|
loff_t pos;
|
|
int retval = -EINVAL;
|
|
|
|
inode = file_inode(file);
|
|
if (unlikely((ssize_t) count < 0))
|
|
return retval;
|
|
pos = *ppos;
|
|
if (unlikely(pos < 0)) {
|
|
if (!unsigned_offsets(file))
|
|
return retval;
|
|
if (count >= -pos) /* both values are in 0..LLONG_MAX */
|
|
return -EOVERFLOW;
|
|
} else if (unlikely((loff_t) (pos + count) < 0)) {
|
|
if (!unsigned_offsets(file))
|
|
return retval;
|
|
}
|
|
|
|
if (unlikely(inode->i_flctx && mandatory_lock(inode))) {
|
|
retval = locks_mandatory_area(
|
|
read_write == READ ? FLOCK_VERIFY_READ : FLOCK_VERIFY_WRITE,
|
|
inode, file, pos, count);
|
|
if (retval < 0)
|
|
return retval;
|
|
}
|
|
retval = security_file_permission(file,
|
|
read_write == READ ? MAY_READ : MAY_WRITE);
|
|
if (retval)
|
|
return retval;
|
|
return count > MAX_RW_COUNT ? MAX_RW_COUNT : count;
|
|
}
|
|
|
|
static ssize_t new_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos)
|
|
{
|
|
struct iovec iov = { .iov_base = buf, .iov_len = len };
|
|
struct kiocb kiocb;
|
|
struct iov_iter iter;
|
|
ssize_t ret;
|
|
|
|
init_sync_kiocb(&kiocb, filp);
|
|
kiocb.ki_pos = *ppos;
|
|
iov_iter_init(&iter, READ, &iov, 1, len);
|
|
|
|
ret = filp->f_op->read_iter(&kiocb, &iter);
|
|
BUG_ON(ret == -EIOCBQUEUED);
|
|
*ppos = kiocb.ki_pos;
|
|
return ret;
|
|
}
|
|
|
|
ssize_t __vfs_read(struct file *file, char __user *buf, size_t count,
|
|
loff_t *pos)
|
|
{
|
|
if (file->f_op->read)
|
|
return file->f_op->read(file, buf, count, pos);
|
|
else if (file->f_op->read_iter)
|
|
return new_sync_read(file, buf, count, pos);
|
|
else
|
|
return -EINVAL;
|
|
}
|
|
EXPORT_SYMBOL(__vfs_read);
|
|
|
|
ssize_t vfs_read(struct file *file, char __user *buf, size_t count, loff_t *pos)
|
|
{
|
|
ssize_t ret;
|
|
|
|
if (!(file->f_mode & FMODE_READ))
|
|
return -EBADF;
|
|
if (!(file->f_mode & FMODE_CAN_READ))
|
|
return -EINVAL;
|
|
if (unlikely(!access_ok(VERIFY_WRITE, buf, count)))
|
|
return -EFAULT;
|
|
|
|
ret = rw_verify_area(READ, file, pos, count);
|
|
if (ret >= 0) {
|
|
count = ret;
|
|
ret = __vfs_read(file, buf, count, pos);
|
|
if (ret > 0) {
|
|
fsnotify_access(file);
|
|
add_rchar(current, ret);
|
|
}
|
|
inc_syscr(current);
|
|
}
|
|
|
|
return ret;
|
|
}
|
|
|
|
EXPORT_SYMBOL(vfs_read);
|
|
|
|
static ssize_t new_sync_write(struct file *filp, const char __user *buf, size_t len, loff_t *ppos)
|
|
{
|
|
struct iovec iov = { .iov_base = (void __user *)buf, .iov_len = len };
|
|
struct kiocb kiocb;
|
|
struct iov_iter iter;
|
|
ssize_t ret;
|
|
|
|
init_sync_kiocb(&kiocb, filp);
|
|
kiocb.ki_pos = *ppos;
|
|
iov_iter_init(&iter, WRITE, &iov, 1, len);
|
|
|
|
ret = filp->f_op->write_iter(&kiocb, &iter);
|
|
BUG_ON(ret == -EIOCBQUEUED);
|
|
if (ret > 0)
|
|
*ppos = kiocb.ki_pos;
|
|
return ret;
|
|
}
|
|
|
|
ssize_t __vfs_write(struct file *file, const char __user *p, size_t count,
|
|
loff_t *pos)
|
|
{
|
|
if (file->f_op->write)
|
|
return file->f_op->write(file, p, count, pos);
|
|
else if (file->f_op->write_iter)
|
|
return new_sync_write(file, p, count, pos);
|
|
else
|
|
return -EINVAL;
|
|
}
|
|
EXPORT_SYMBOL(__vfs_write);
|
|
|
|
ssize_t __kernel_write(struct file *file, const char *buf, size_t count, loff_t *pos)
|
|
{
|
|
mm_segment_t old_fs;
|
|
const char __user *p;
|
|
ssize_t ret;
|
|
|
|
if (!(file->f_mode & FMODE_CAN_WRITE))
|
|
return -EINVAL;
|
|
|
|
old_fs = get_fs();
|
|
set_fs(get_ds());
|
|
p = (__force const char __user *)buf;
|
|
if (count > MAX_RW_COUNT)
|
|
count = MAX_RW_COUNT;
|
|
ret = __vfs_write(file, p, count, pos);
|
|
set_fs(old_fs);
|
|
if (ret > 0) {
|
|
fsnotify_modify(file);
|
|
add_wchar(current, ret);
|
|
}
|
|
inc_syscw(current);
|
|
return ret;
|
|
}
|
|
|
|
EXPORT_SYMBOL(__kernel_write);
|
|
|
|
ssize_t vfs_write(struct file *file, const char __user *buf, size_t count, loff_t *pos)
|
|
{
|
|
ssize_t ret;
|
|
|
|
if (!(file->f_mode & FMODE_WRITE))
|
|
return -EBADF;
|
|
if (!(file->f_mode & FMODE_CAN_WRITE))
|
|
return -EINVAL;
|
|
if (unlikely(!access_ok(VERIFY_READ, buf, count)))
|
|
return -EFAULT;
|
|
|
|
ret = rw_verify_area(WRITE, file, pos, count);
|
|
if (ret >= 0) {
|
|
count = ret;
|
|
file_start_write(file);
|
|
ret = __vfs_write(file, buf, count, pos);
|
|
if (ret > 0) {
|
|
fsnotify_modify(file);
|
|
add_wchar(current, ret);
|
|
}
|
|
inc_syscw(current);
|
|
file_end_write(file);
|
|
}
|
|
|
|
return ret;
|
|
}
|
|
|
|
EXPORT_SYMBOL(vfs_write);
|
|
|
|
static inline loff_t file_pos_read(struct file *file)
|
|
{
|
|
return file->f_mode & FMODE_STREAM ? 0 : file->f_pos;
|
|
}
|
|
|
|
static inline void file_pos_write(struct file *file, loff_t pos)
|
|
{
|
|
if ((file->f_mode & FMODE_STREAM) == 0)
|
|
file->f_pos = pos;
|
|
}
|
|
|
|
SYSCALL_DEFINE3(read, unsigned int, fd, char __user *, buf, size_t, count)
|
|
{
|
|
struct fd f = fdget_pos(fd);
|
|
ssize_t ret = -EBADF;
|
|
|
|
if (f.file) {
|
|
loff_t pos = file_pos_read(f.file);
|
|
ret = vfs_read(f.file, buf, count, &pos);
|
|
if (ret >= 0)
|
|
file_pos_write(f.file, pos);
|
|
fdput_pos(f);
|
|
}
|
|
return ret;
|
|
}
|
|
|
|
SYSCALL_DEFINE3(write, unsigned int, fd, const char __user *, buf,
|
|
size_t, count)
|
|
{
|
|
struct fd f = fdget_pos(fd);
|
|
ssize_t ret = -EBADF;
|
|
|
|
if (f.file) {
|
|
loff_t pos = file_pos_read(f.file);
|
|
ret = vfs_write(f.file, buf, count, &pos);
|
|
if (ret >= 0)
|
|
file_pos_write(f.file, pos);
|
|
fdput_pos(f);
|
|
}
|
|
|
|
return ret;
|
|
}
|
|
|
|
SYSCALL_DEFINE4(pread64, unsigned int, fd, char __user *, buf,
|
|
size_t, count, loff_t, pos)
|
|
{
|
|
struct fd f;
|
|
ssize_t ret = -EBADF;
|
|
|
|
if (pos < 0)
|
|
return -EINVAL;
|
|
|
|
f = fdget(fd);
|
|
if (f.file) {
|
|
ret = -ESPIPE;
|
|
if (f.file->f_mode & FMODE_PREAD)
|
|
ret = vfs_read(f.file, buf, count, &pos);
|
|
fdput(f);
|
|
}
|
|
|
|
return ret;
|
|
}
|
|
|
|
SYSCALL_DEFINE4(pwrite64, unsigned int, fd, const char __user *, buf,
|
|
size_t, count, loff_t, pos)
|
|
{
|
|
struct fd f;
|
|
ssize_t ret = -EBADF;
|
|
|
|
if (pos < 0)
|
|
return -EINVAL;
|
|
|
|
f = fdget(fd);
|
|
if (f.file) {
|
|
ret = -ESPIPE;
|
|
if (f.file->f_mode & FMODE_PWRITE)
|
|
ret = vfs_write(f.file, buf, count, &pos);
|
|
fdput(f);
|
|
}
|
|
|
|
return ret;
|
|
}
|
|
|
|
/*
|
|
* Reduce an iovec's length in-place. Return the resulting number of segments
|
|
*/
|
|
unsigned long iov_shorten(struct iovec *iov, unsigned long nr_segs, size_t to)
|
|
{
|
|
unsigned long seg = 0;
|
|
size_t len = 0;
|
|
|
|
while (seg < nr_segs) {
|
|
seg++;
|
|
if (len + iov->iov_len >= to) {
|
|
iov->iov_len = to - len;
|
|
break;
|
|
}
|
|
len += iov->iov_len;
|
|
iov++;
|
|
}
|
|
return seg;
|
|
}
|
|
EXPORT_SYMBOL(iov_shorten);
|
|
|
|
static ssize_t do_iter_readv_writev(struct file *filp, struct iov_iter *iter,
|
|
loff_t *ppos, iter_fn_t fn)
|
|
{
|
|
struct kiocb kiocb;
|
|
ssize_t ret;
|
|
|
|
init_sync_kiocb(&kiocb, filp);
|
|
kiocb.ki_pos = *ppos;
|
|
|
|
ret = fn(&kiocb, iter);
|
|
BUG_ON(ret == -EIOCBQUEUED);
|
|
*ppos = kiocb.ki_pos;
|
|
return ret;
|
|
}
|
|
|
|
/* Do it by hand, with file-ops */
|
|
static ssize_t do_loop_readv_writev(struct file *filp, struct iov_iter *iter,
|
|
loff_t *ppos, io_fn_t fn)
|
|
{
|
|
ssize_t ret = 0;
|
|
|
|
while (iov_iter_count(iter)) {
|
|
struct iovec iovec = iov_iter_iovec(iter);
|
|
ssize_t nr;
|
|
|
|
nr = fn(filp, iovec.iov_base, iovec.iov_len, ppos);
|
|
|
|
if (nr < 0) {
|
|
if (!ret)
|
|
ret = nr;
|
|
break;
|
|
}
|
|
ret += nr;
|
|
if (nr != iovec.iov_len)
|
|
break;
|
|
iov_iter_advance(iter, nr);
|
|
}
|
|
|
|
return ret;
|
|
}
|
|
|
|
/* A write operation does a read from user space and vice versa */
|
|
#define vrfy_dir(type) ((type) == READ ? VERIFY_WRITE : VERIFY_READ)
|
|
|
|
ssize_t rw_copy_check_uvector(int type, const struct iovec __user * uvector,
|
|
unsigned long nr_segs, unsigned long fast_segs,
|
|
struct iovec *fast_pointer,
|
|
struct iovec **ret_pointer)
|
|
{
|
|
unsigned long seg;
|
|
ssize_t ret;
|
|
struct iovec *iov = fast_pointer;
|
|
|
|
/*
|
|
* SuS says "The readv() function *may* fail if the iovcnt argument
|
|
* was less than or equal to 0, or greater than {IOV_MAX}. Linux has
|
|
* traditionally returned zero for zero segments, so...
|
|
*/
|
|
if (nr_segs == 0) {
|
|
ret = 0;
|
|
goto out;
|
|
}
|
|
|
|
/*
|
|
* First get the "struct iovec" from user memory and
|
|
* verify all the pointers
|
|
*/
|
|
if (nr_segs > UIO_MAXIOV) {
|
|
ret = -EINVAL;
|
|
goto out;
|
|
}
|
|
if (nr_segs > fast_segs) {
|
|
iov = kmalloc(nr_segs*sizeof(struct iovec), GFP_KERNEL);
|
|
if (iov == NULL) {
|
|
ret = -ENOMEM;
|
|
goto out;
|
|
}
|
|
}
|
|
if (copy_from_user(iov, uvector, nr_segs*sizeof(*uvector))) {
|
|
ret = -EFAULT;
|
|
goto out;
|
|
}
|
|
|
|
/*
|
|
* According to the Single Unix Specification we should return EINVAL
|
|
* if an element length is < 0 when cast to ssize_t or if the
|
|
* total length would overflow the ssize_t return value of the
|
|
* system call.
|
|
*
|
|
* Linux caps all read/write calls to MAX_RW_COUNT, and avoids the
|
|
* overflow case.
|
|
*/
|
|
ret = 0;
|
|
for (seg = 0; seg < nr_segs; seg++) {
|
|
void __user *buf = iov[seg].iov_base;
|
|
ssize_t len = (ssize_t)iov[seg].iov_len;
|
|
|
|
/* see if we we're about to use an invalid len or if
|
|
* it's about to overflow ssize_t */
|
|
if (len < 0) {
|
|
ret = -EINVAL;
|
|
goto out;
|
|
}
|
|
if (type >= 0
|
|
&& unlikely(!access_ok(vrfy_dir(type), buf, len))) {
|
|
ret = -EFAULT;
|
|
goto out;
|
|
}
|
|
if (len > MAX_RW_COUNT - ret) {
|
|
len = MAX_RW_COUNT - ret;
|
|
iov[seg].iov_len = len;
|
|
}
|
|
ret += len;
|
|
}
|
|
out:
|
|
*ret_pointer = iov;
|
|
return ret;
|
|
}
|
|
|
|
static ssize_t do_readv_writev(int type, struct file *file,
|
|
const struct iovec __user * uvector,
|
|
unsigned long nr_segs, loff_t *pos)
|
|
{
|
|
size_t tot_len;
|
|
struct iovec iovstack[UIO_FASTIOV];
|
|
struct iovec *iov = iovstack;
|
|
struct iov_iter iter;
|
|
ssize_t ret;
|
|
io_fn_t fn;
|
|
iter_fn_t iter_fn;
|
|
|
|
ret = import_iovec(type, uvector, nr_segs,
|
|
ARRAY_SIZE(iovstack), &iov, &iter);
|
|
if (ret < 0)
|
|
return ret;
|
|
|
|
tot_len = iov_iter_count(&iter);
|
|
if (!tot_len)
|
|
goto out;
|
|
ret = rw_verify_area(type, file, pos, tot_len);
|
|
if (ret < 0)
|
|
goto out;
|
|
|
|
if (type == READ) {
|
|
fn = file->f_op->read;
|
|
iter_fn = file->f_op->read_iter;
|
|
} else {
|
|
fn = (io_fn_t)file->f_op->write;
|
|
iter_fn = file->f_op->write_iter;
|
|
file_start_write(file);
|
|
}
|
|
|
|
if (iter_fn)
|
|
ret = do_iter_readv_writev(file, &iter, pos, iter_fn);
|
|
else
|
|
ret = do_loop_readv_writev(file, &iter, pos, fn);
|
|
|
|
if (type != READ)
|
|
file_end_write(file);
|
|
|
|
out:
|
|
kfree(iov);
|
|
if ((ret + (type == READ)) > 0) {
|
|
if (type == READ)
|
|
fsnotify_access(file);
|
|
else
|
|
fsnotify_modify(file);
|
|
}
|
|
return ret;
|
|
}
|
|
|
|
ssize_t vfs_readv(struct file *file, const struct iovec __user *vec,
|
|
unsigned long vlen, loff_t *pos)
|
|
{
|
|
if (!(file->f_mode & FMODE_READ))
|
|
return -EBADF;
|
|
if (!(file->f_mode & FMODE_CAN_READ))
|
|
return -EINVAL;
|
|
|
|
return do_readv_writev(READ, file, vec, vlen, pos);
|
|
}
|
|
|
|
EXPORT_SYMBOL(vfs_readv);
|
|
|
|
ssize_t vfs_writev(struct file *file, const struct iovec __user *vec,
|
|
unsigned long vlen, loff_t *pos)
|
|
{
|
|
if (!(file->f_mode & FMODE_WRITE))
|
|
return -EBADF;
|
|
if (!(file->f_mode & FMODE_CAN_WRITE))
|
|
return -EINVAL;
|
|
|
|
return do_readv_writev(WRITE, file, vec, vlen, pos);
|
|
}
|
|
|
|
EXPORT_SYMBOL(vfs_writev);
|
|
|
|
SYSCALL_DEFINE3(readv, unsigned long, fd, const struct iovec __user *, vec,
|
|
unsigned long, vlen)
|
|
{
|
|
struct fd f = fdget_pos(fd);
|
|
ssize_t ret = -EBADF;
|
|
|
|
if (f.file) {
|
|
loff_t pos = file_pos_read(f.file);
|
|
ret = vfs_readv(f.file, vec, vlen, &pos);
|
|
if (ret >= 0)
|
|
file_pos_write(f.file, pos);
|
|
fdput_pos(f);
|
|
}
|
|
|
|
if (ret > 0)
|
|
add_rchar(current, ret);
|
|
inc_syscr(current);
|
|
return ret;
|
|
}
|
|
|
|
SYSCALL_DEFINE3(writev, unsigned long, fd, const struct iovec __user *, vec,
|
|
unsigned long, vlen)
|
|
{
|
|
struct fd f = fdget_pos(fd);
|
|
ssize_t ret = -EBADF;
|
|
|
|
if (f.file) {
|
|
loff_t pos = file_pos_read(f.file);
|
|
ret = vfs_writev(f.file, vec, vlen, &pos);
|
|
if (ret >= 0)
|
|
file_pos_write(f.file, pos);
|
|
fdput_pos(f);
|
|
}
|
|
|
|
if (ret > 0)
|
|
add_wchar(current, ret);
|
|
inc_syscw(current);
|
|
return ret;
|
|
}
|
|
|
|
static inline loff_t pos_from_hilo(unsigned long high, unsigned long low)
|
|
{
|
|
#define HALF_LONG_BITS (BITS_PER_LONG / 2)
|
|
return (((loff_t)high << HALF_LONG_BITS) << HALF_LONG_BITS) | low;
|
|
}
|
|
|
|
SYSCALL_DEFINE5(preadv, unsigned long, fd, const struct iovec __user *, vec,
|
|
unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h)
|
|
{
|
|
loff_t pos = pos_from_hilo(pos_h, pos_l);
|
|
struct fd f;
|
|
ssize_t ret = -EBADF;
|
|
|
|
if (pos < 0)
|
|
return -EINVAL;
|
|
|
|
f = fdget(fd);
|
|
if (f.file) {
|
|
ret = -ESPIPE;
|
|
if (f.file->f_mode & FMODE_PREAD)
|
|
ret = vfs_readv(f.file, vec, vlen, &pos);
|
|
fdput(f);
|
|
}
|
|
|
|
if (ret > 0)
|
|
add_rchar(current, ret);
|
|
inc_syscr(current);
|
|
return ret;
|
|
}
|
|
|
|
SYSCALL_DEFINE5(pwritev, unsigned long, fd, const struct iovec __user *, vec,
|
|
unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h)
|
|
{
|
|
loff_t pos = pos_from_hilo(pos_h, pos_l);
|
|
struct fd f;
|
|
ssize_t ret = -EBADF;
|
|
|
|
if (pos < 0)
|
|
return -EINVAL;
|
|
|
|
f = fdget(fd);
|
|
if (f.file) {
|
|
ret = -ESPIPE;
|
|
if (f.file->f_mode & FMODE_PWRITE)
|
|
ret = vfs_writev(f.file, vec, vlen, &pos);
|
|
fdput(f);
|
|
}
|
|
|
|
if (ret > 0)
|
|
add_wchar(current, ret);
|
|
inc_syscw(current);
|
|
return ret;
|
|
}
|
|
|
|
#ifdef CONFIG_COMPAT
|
|
|
|
static ssize_t compat_do_readv_writev(int type, struct file *file,
|
|
const struct compat_iovec __user *uvector,
|
|
unsigned long nr_segs, loff_t *pos)
|
|
{
|
|
compat_ssize_t tot_len;
|
|
struct iovec iovstack[UIO_FASTIOV];
|
|
struct iovec *iov = iovstack;
|
|
struct iov_iter iter;
|
|
ssize_t ret;
|
|
io_fn_t fn;
|
|
iter_fn_t iter_fn;
|
|
|
|
ret = compat_import_iovec(type, uvector, nr_segs,
|
|
UIO_FASTIOV, &iov, &iter);
|
|
if (ret < 0)
|
|
return ret;
|
|
|
|
tot_len = iov_iter_count(&iter);
|
|
if (!tot_len)
|
|
goto out;
|
|
ret = rw_verify_area(type, file, pos, tot_len);
|
|
if (ret < 0)
|
|
goto out;
|
|
|
|
if (type == READ) {
|
|
fn = file->f_op->read;
|
|
iter_fn = file->f_op->read_iter;
|
|
} else {
|
|
fn = (io_fn_t)file->f_op->write;
|
|
iter_fn = file->f_op->write_iter;
|
|
file_start_write(file);
|
|
}
|
|
|
|
if (iter_fn)
|
|
ret = do_iter_readv_writev(file, &iter, pos, iter_fn);
|
|
else
|
|
ret = do_loop_readv_writev(file, &iter, pos, fn);
|
|
|
|
if (type != READ)
|
|
file_end_write(file);
|
|
|
|
out:
|
|
kfree(iov);
|
|
if ((ret + (type == READ)) > 0) {
|
|
if (type == READ)
|
|
fsnotify_access(file);
|
|
else
|
|
fsnotify_modify(file);
|
|
}
|
|
return ret;
|
|
}
|
|
|
|
static size_t compat_readv(struct file *file,
|
|
const struct compat_iovec __user *vec,
|
|
unsigned long vlen, loff_t *pos)
|
|
{
|
|
ssize_t ret = -EBADF;
|
|
|
|
if (!(file->f_mode & FMODE_READ))
|
|
goto out;
|
|
|
|
ret = -EINVAL;
|
|
if (!(file->f_mode & FMODE_CAN_READ))
|
|
goto out;
|
|
|
|
ret = compat_do_readv_writev(READ, file, vec, vlen, pos);
|
|
|
|
out:
|
|
if (ret > 0)
|
|
add_rchar(current, ret);
|
|
inc_syscr(current);
|
|
return ret;
|
|
}
|
|
|
|
COMPAT_SYSCALL_DEFINE3(readv, compat_ulong_t, fd,
|
|
const struct compat_iovec __user *,vec,
|
|
compat_ulong_t, vlen)
|
|
{
|
|
struct fd f = fdget_pos(fd);
|
|
ssize_t ret;
|
|
loff_t pos;
|
|
|
|
if (!f.file)
|
|
return -EBADF;
|
|
pos = f.file->f_pos;
|
|
ret = compat_readv(f.file, vec, vlen, &pos);
|
|
if (ret >= 0)
|
|
f.file->f_pos = pos;
|
|
fdput_pos(f);
|
|
return ret;
|
|
}
|
|
|
|
static long __compat_sys_preadv64(unsigned long fd,
|
|
const struct compat_iovec __user *vec,
|
|
unsigned long vlen, loff_t pos)
|
|
{
|
|
struct fd f;
|
|
ssize_t ret;
|
|
|
|
if (pos < 0)
|
|
return -EINVAL;
|
|
f = fdget(fd);
|
|
if (!f.file)
|
|
return -EBADF;
|
|
ret = -ESPIPE;
|
|
if (f.file->f_mode & FMODE_PREAD)
|
|
ret = compat_readv(f.file, vec, vlen, &pos);
|
|
fdput(f);
|
|
return ret;
|
|
}
|
|
|
|
#ifdef __ARCH_WANT_COMPAT_SYS_PREADV64
|
|
COMPAT_SYSCALL_DEFINE4(preadv64, unsigned long, fd,
|
|
const struct compat_iovec __user *,vec,
|
|
unsigned long, vlen, loff_t, pos)
|
|
{
|
|
return __compat_sys_preadv64(fd, vec, vlen, pos);
|
|
}
|
|
#endif
|
|
|
|
COMPAT_SYSCALL_DEFINE5(preadv, compat_ulong_t, fd,
|
|
const struct compat_iovec __user *,vec,
|
|
compat_ulong_t, vlen, u32, pos_low, u32, pos_high)
|
|
{
|
|
loff_t pos = ((loff_t)pos_high << 32) | pos_low;
|
|
|
|
return __compat_sys_preadv64(fd, vec, vlen, pos);
|
|
}
|
|
|
|
static size_t compat_writev(struct file *file,
|
|
const struct compat_iovec __user *vec,
|
|
unsigned long vlen, loff_t *pos)
|
|
{
|
|
ssize_t ret = -EBADF;
|
|
|
|
if (!(file->f_mode & FMODE_WRITE))
|
|
goto out;
|
|
|
|
ret = -EINVAL;
|
|
if (!(file->f_mode & FMODE_CAN_WRITE))
|
|
goto out;
|
|
|
|
ret = compat_do_readv_writev(WRITE, file, vec, vlen, pos);
|
|
|
|
out:
|
|
if (ret > 0)
|
|
add_wchar(current, ret);
|
|
inc_syscw(current);
|
|
return ret;
|
|
}
|
|
|
|
COMPAT_SYSCALL_DEFINE3(writev, compat_ulong_t, fd,
|
|
const struct compat_iovec __user *, vec,
|
|
compat_ulong_t, vlen)
|
|
{
|
|
struct fd f = fdget_pos(fd);
|
|
ssize_t ret;
|
|
loff_t pos;
|
|
|
|
if (!f.file)
|
|
return -EBADF;
|
|
pos = f.file->f_pos;
|
|
ret = compat_writev(f.file, vec, vlen, &pos);
|
|
if (ret >= 0)
|
|
f.file->f_pos = pos;
|
|
fdput_pos(f);
|
|
return ret;
|
|
}
|
|
|
|
static long __compat_sys_pwritev64(unsigned long fd,
|
|
const struct compat_iovec __user *vec,
|
|
unsigned long vlen, loff_t pos)
|
|
{
|
|
struct fd f;
|
|
ssize_t ret;
|
|
|
|
if (pos < 0)
|
|
return -EINVAL;
|
|
f = fdget(fd);
|
|
if (!f.file)
|
|
return -EBADF;
|
|
ret = -ESPIPE;
|
|
if (f.file->f_mode & FMODE_PWRITE)
|
|
ret = compat_writev(f.file, vec, vlen, &pos);
|
|
fdput(f);
|
|
return ret;
|
|
}
|
|
|
|
#ifdef __ARCH_WANT_COMPAT_SYS_PWRITEV64
|
|
COMPAT_SYSCALL_DEFINE4(pwritev64, unsigned long, fd,
|
|
const struct compat_iovec __user *,vec,
|
|
unsigned long, vlen, loff_t, pos)
|
|
{
|
|
return __compat_sys_pwritev64(fd, vec, vlen, pos);
|
|
}
|
|
#endif
|
|
|
|
COMPAT_SYSCALL_DEFINE5(pwritev, compat_ulong_t, fd,
|
|
const struct compat_iovec __user *,vec,
|
|
compat_ulong_t, vlen, u32, pos_low, u32, pos_high)
|
|
{
|
|
loff_t pos = ((loff_t)pos_high << 32) | pos_low;
|
|
|
|
return __compat_sys_pwritev64(fd, vec, vlen, pos);
|
|
}
|
|
#endif
|
|
|
|
static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos,
|
|
size_t count, loff_t max)
|
|
{
|
|
struct fd in, out;
|
|
struct inode *in_inode, *out_inode;
|
|
loff_t pos;
|
|
loff_t out_pos;
|
|
ssize_t retval;
|
|
int fl;
|
|
|
|
/*
|
|
* Get input file, and verify that it is ok..
|
|
*/
|
|
retval = -EBADF;
|
|
in = fdget(in_fd);
|
|
if (!in.file)
|
|
goto out;
|
|
if (!(in.file->f_mode & FMODE_READ))
|
|
goto fput_in;
|
|
retval = -ESPIPE;
|
|
if (!ppos) {
|
|
pos = in.file->f_pos;
|
|
} else {
|
|
pos = *ppos;
|
|
if (!(in.file->f_mode & FMODE_PREAD))
|
|
goto fput_in;
|
|
}
|
|
retval = rw_verify_area(READ, in.file, &pos, count);
|
|
if (retval < 0)
|
|
goto fput_in;
|
|
count = retval;
|
|
|
|
/*
|
|
* Get output file, and verify that it is ok..
|
|
*/
|
|
retval = -EBADF;
|
|
out = fdget(out_fd);
|
|
if (!out.file)
|
|
goto fput_in;
|
|
if (!(out.file->f_mode & FMODE_WRITE))
|
|
goto fput_out;
|
|
retval = -EINVAL;
|
|
in_inode = file_inode(in.file);
|
|
out_inode = file_inode(out.file);
|
|
out_pos = out.file->f_pos;
|
|
retval = rw_verify_area(WRITE, out.file, &out_pos, count);
|
|
if (retval < 0)
|
|
goto fput_out;
|
|
count = retval;
|
|
|
|
if (!max)
|
|
max = min(in_inode->i_sb->s_maxbytes, out_inode->i_sb->s_maxbytes);
|
|
|
|
if (unlikely(pos + count > max)) {
|
|
retval = -EOVERFLOW;
|
|
if (pos >= max)
|
|
goto fput_out;
|
|
count = max - pos;
|
|
}
|
|
|
|
fl = 0;
|
|
#if 0
|
|
/*
|
|
* We need to debate whether we can enable this or not. The
|
|
* man page documents EAGAIN return for the output at least,
|
|
* and the application is arguably buggy if it doesn't expect
|
|
* EAGAIN on a non-blocking file descriptor.
|
|
*/
|
|
if (in.file->f_flags & O_NONBLOCK)
|
|
fl = SPLICE_F_NONBLOCK;
|
|
#endif
|
|
file_start_write(out.file);
|
|
retval = do_splice_direct(in.file, &pos, out.file, &out_pos, count, fl);
|
|
file_end_write(out.file);
|
|
|
|
if (retval > 0) {
|
|
add_rchar(current, retval);
|
|
add_wchar(current, retval);
|
|
fsnotify_access(in.file);
|
|
fsnotify_modify(out.file);
|
|
out.file->f_pos = out_pos;
|
|
if (ppos)
|
|
*ppos = pos;
|
|
else
|
|
in.file->f_pos = pos;
|
|
}
|
|
|
|
inc_syscr(current);
|
|
inc_syscw(current);
|
|
if (pos > max)
|
|
retval = -EOVERFLOW;
|
|
|
|
fput_out:
|
|
fdput(out);
|
|
fput_in:
|
|
fdput(in);
|
|
out:
|
|
return retval;
|
|
}
|
|
|
|
SYSCALL_DEFINE4(sendfile, int, out_fd, int, in_fd, off_t __user *, offset, size_t, count)
|
|
{
|
|
loff_t pos;
|
|
off_t off;
|
|
ssize_t ret;
|
|
|
|
if (offset) {
|
|
if (unlikely(get_user(off, offset)))
|
|
return -EFAULT;
|
|
pos = off;
|
|
ret = do_sendfile(out_fd, in_fd, &pos, count, MAX_NON_LFS);
|
|
if (unlikely(put_user(pos, offset)))
|
|
return -EFAULT;
|
|
return ret;
|
|
}
|
|
|
|
return do_sendfile(out_fd, in_fd, NULL, count, 0);
|
|
}
|
|
|
|
SYSCALL_DEFINE4(sendfile64, int, out_fd, int, in_fd, loff_t __user *, offset, size_t, count)
|
|
{
|
|
loff_t pos;
|
|
ssize_t ret;
|
|
|
|
if (offset) {
|
|
if (unlikely(copy_from_user(&pos, offset, sizeof(loff_t))))
|
|
return -EFAULT;
|
|
ret = do_sendfile(out_fd, in_fd, &pos, count, 0);
|
|
if (unlikely(put_user(pos, offset)))
|
|
return -EFAULT;
|
|
return ret;
|
|
}
|
|
|
|
return do_sendfile(out_fd, in_fd, NULL, count, 0);
|
|
}
|
|
|
|
#ifdef CONFIG_COMPAT
|
|
COMPAT_SYSCALL_DEFINE4(sendfile, int, out_fd, int, in_fd,
|
|
compat_off_t __user *, offset, compat_size_t, count)
|
|
{
|
|
loff_t pos;
|
|
off_t off;
|
|
ssize_t ret;
|
|
|
|
if (offset) {
|
|
if (unlikely(get_user(off, offset)))
|
|
return -EFAULT;
|
|
pos = off;
|
|
ret = do_sendfile(out_fd, in_fd, &pos, count, MAX_NON_LFS);
|
|
if (unlikely(put_user(pos, offset)))
|
|
return -EFAULT;
|
|
return ret;
|
|
}
|
|
|
|
return do_sendfile(out_fd, in_fd, NULL, count, 0);
|
|
}
|
|
|
|
COMPAT_SYSCALL_DEFINE4(sendfile64, int, out_fd, int, in_fd,
|
|
compat_loff_t __user *, offset, compat_size_t, count)
|
|
{
|
|
loff_t pos;
|
|
ssize_t ret;
|
|
|
|
if (offset) {
|
|
if (unlikely(copy_from_user(&pos, offset, sizeof(loff_t))))
|
|
return -EFAULT;
|
|
ret = do_sendfile(out_fd, in_fd, &pos, count, 0);
|
|
if (unlikely(put_user(pos, offset)))
|
|
return -EFAULT;
|
|
return ret;
|
|
}
|
|
|
|
return do_sendfile(out_fd, in_fd, NULL, count, 0);
|
|
}
|
|
#endif
|