Commit 2fb18b5d authored by Alexey Min's avatar Alexey Min 🐧 Committed by Oliver Smith

samsung-klte: backport memfd_create() syscall (!479)

This brings in several patches needed to add support for a
memfd_create() syscall into kernel version 3.4 from kernel
version 3.17. This is required for running lxc >= 3.1.0-r1
with security patch that fixes CVE-2019-5736.

In short, security issue was: in a privileged container root
process could overwrite lxc-start executable by opening its
file descriptor and rewriting executable contents. This is
where memfd comes to help: you can create an in-memory file,
copy your executable there, and place a set of SEALS to protect
it from modifying at a deep level. Then you fexecve() that fd
and you're safe.

For example, pulseaudio also can benefit from having
memfd_create() implemented.

This backports the following commits from upstream linux:
 - dd37978c50bc8b354e5c4633f69387f16572fdac: cache the value
   of file_inode() in struct file

   commit from linux-3.10 to have an f_inode member inside
   struct file and a helper function file_inode() that is
   used in some of the following commits

 - 40e041a2c858b3caefc757e26cb85bfceae5062b shm: add sealing API

   from 3.17: security measure called SEALS, that you can put
   on memfd file to restrict operations on it

 - 9183df25fe7b194563db3fec6dc3202a5855839c shm: add memfd_create()
   syscall

   also from 3.17

 - 503e6636b6f96056210062be703356f4253b6db9 asm-generic: add
   memfd_create system call to unistd.h

 - e57e41931134e09fc6c03c8d4eb19d516cc6e59b ARM: wire up
   memfd_create syscall

The last two are needed to make the syscall visible/usable from
userspace, one in generic context, other for ARM arch.

The test program (https://github.com/minlexx/test_memfd/) was
written to verify that this works.

[ci:skip-build]: already built successfully in CI
parent 17ab67d2
Pipeline #70165131 passed with stages
in 2 minutes and 29 seconds
From 71846fb0e72ac8f7c05ca85f28b50cae4703acfc Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Mon, 1 Jul 2019 23:47:19 +0300
Subject: [PATCH 1/5] cache the value of file_inode() in struct file
Note that this thing does *not* contribute to inode refcount;
it's pinned down by dentry.
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
fs/f2fs/f2fs.h | 3 ++-
fs/file_table.c | 2 ++
fs/open.c | 2 ++
include/linux/fs.h | 7 +++++++
4 files changed, 13 insertions(+), 1 deletion(-)
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index 6ef8c890a57..2b511f8a416 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -1844,10 +1844,11 @@ static inline bool f2fs_cp_error(struct f2fs_sb_info *sbi)
return is_set_ckpt_flags(sbi->ckpt, CP_ERROR_FLAG);
}
+/* Implementation of file_inode added to include/linux/fs.h
static inline struct inode *file_inode(struct file *f)
{
return f->f_path.dentry->d_inode;
-}
+}*/
static inline bool is_dot_dotdot(const struct qstr *str)
{
diff --git a/fs/file_table.c b/fs/file_table.c
index a01710a6ff3..07838af0737 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -172,6 +172,7 @@ struct file *alloc_file(struct path *path, fmode_t mode,
return NULL;
file->f_path = *path;
+ file->f_inode = path->dentry->d_inode;
file->f_mapping = path->dentry->d_inode->i_mapping;
file->f_mode = mode;
file->f_op = fop;
@@ -254,6 +255,7 @@ static void __fput(struct file *file)
drop_file_write_access(file);
file->f_path.dentry = NULL;
file->f_path.mnt = NULL;
+ file->f_inode = NULL;
file_free(file);
dput(dentry);
mntput(mnt);
diff --git a/fs/open.c b/fs/open.c
index 3f716e9b896..2388e788e10 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -673,6 +673,7 @@ static struct file *__dentry_open(struct dentry *dentry, struct vfsmount *mnt,
f->f_mode = FMODE_PATH;
inode = dentry->d_inode;
+ f->f_inode = dentry->d_inode;
if (f->f_mode & FMODE_WRITE) {
error = __get_file_write_access(inode, mnt);
if (error)
@@ -748,6 +749,7 @@ cleanup_all:
}
f->f_path.dentry = NULL;
f->f_path.mnt = NULL;
+ f->f_inode = NULL;
cleanup_file:
put_filp(f);
dput(dentry);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index ef6e82ce1c1..11265501da0 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1029,6 +1029,7 @@ struct file {
struct path f_path;
#define f_dentry f_path.dentry
#define f_vfsmnt f_path.mnt
+ struct inode *f_inode; /* cached value */
const struct file_operations *f_op;
/*
@@ -2304,6 +2305,12 @@ static inline bool execute_ok(struct inode *inode)
return (inode->i_mode & S_IXUGO) || S_ISDIR(inode->i_mode);
}
+static inline struct inode *file_inode(struct file *f)
+{
+ /* return f->f_path.dentry->d_inode; / can also use this */
+ return f->f_inode;
+}
+
/*
* get_write_access() gets write permission for a file.
* put_write_access() releases this write permission.
--
2.20.1
From 893282e935b9b1eb212eaf556843a47069e7d2a6 Mon Sep 17 00:00:00 2001
From: David Herrmann <dh.herrmann@gmail.com>
Date: Mon, 1 Jul 2019 00:54:54 +0300
Subject: [PATCH 2/5] shm: add sealing API
If two processes share a common memory region, they usually want some
guarantees to allow safe access. This often includes:
- one side cannot overwrite data while the other reads it
- one side cannot shrink the buffer while the other accesses it
- one side cannot grow the buffer beyond previously set boundaries
If there is a trust-relationship between both parties, there is no need
for policy enforcement. However, if there's no trust relationship (eg.,
for general-purpose IPC) sharing memory-regions is highly fragile and
often not possible without local copies. Look at the following two
use-cases:
1) A graphics client wants to share its rendering-buffer with a
graphics-server. The memory-region is allocated by the client for
read/write access and a second FD is passed to the server. While
scanning out from the memory region, the server has no guarantee that
the client doesn't shrink the buffer at any time, requiring rather
cumbersome SIGBUS handling.
2) A process wants to perform an RPC on another process. To avoid huge
bandwidth consumption, zero-copy is preferred. After a message is
assembled in-memory and a FD is passed to the remote side, both sides
want to be sure that neither modifies this shared copy, anymore. The
source may have put sensible data into the message without a separate
copy and the target may want to parse the message inline, to avoid a
local copy.
While SIGBUS handling, POSIX mandatory locking and MAP_DENYWRITE provide
ways to achieve most of this, the first one is unproportionally ugly to
use in libraries and the latter two are broken/racy or even disabled due
to denial of service attacks.
This patch introduces the concept of SEALING. If you seal a file, a
specific set of operations is blocked on that file forever. Unlike locks,
seals can only be set, never removed. Hence, once you verified a specific
set of seals is set, you're guaranteed that no-one can perform the blocked
operations on this file, anymore.
An initial set of SEALS is introduced by this patch:
- SHRINK: If SEAL_SHRINK is set, the file in question cannot be reduced
in size. This affects ftruncate() and open(O_TRUNC).
- GROW: If SEAL_GROW is set, the file in question cannot be increased
in size. This affects ftruncate(), fallocate() and write().
- WRITE: If SEAL_WRITE is set, no write operations (besides resizing)
are possible. This affects fallocate(PUNCH_HOLE), mmap() and
write().
- SEAL: If SEAL_SEAL is set, no further seals can be added to a file.
This basically prevents the F_ADD_SEAL operation on a file and
can be set to prevent others from adding further seals that you
don't want.
The described use-cases can easily use these seals to provide safe use
without any trust-relationship:
1) The graphics server can verify that a passed file-descriptor has
SEAL_SHRINK set. This allows safe scanout, while the client is
allowed to increase buffer size for window-resizing on-the-fly.
Concurrent writes are explicitly allowed.
2) For general-purpose IPC, both processes can verify that SEAL_SHRINK,
SEAL_GROW and SEAL_WRITE are set. This guarantees that neither
process can modify the data while the other side parses it.
Furthermore, it guarantees that even with writable FDs passed to the
peer, it cannot increase the size to hit memory-limits of the source
process (in case the file-storage is accounted to the source).
The new API is an extension to fcntl(), adding two new commands:
F_GET_SEALS: Return a bitset describing the seals on the file. This
can be called on any FD if the underlying file supports
sealing.
F_ADD_SEALS: Change the seals of a given file. This requires WRITE
access to the file and F_SEAL_SEAL may not already be set.
Furthermore, the underlying file must support sealing and
there may not be any existing shared mapping of that file.
Otherwise, EBADF/EPERM is returned.
The given seals are _added_ to the existing set of seals
on the file. You cannot remove seals again.
The fcntl() handler is currently specific to shmem and disabled on all
files. A file needs to explicitly support sealing for this interface to
work. A separate syscall is added in a follow-up, which creates files that
support sealing. There is no intention to support this on other
file-systems. Semantics are unclear for non-volatile files and we lack any
use-case right now. Therefore, the implementation is specific to shmem.
Signed-off-by: David Herrmann <dh.herrmann@gmail.com>
Acked-by: Hugh Dickins <hughd@google.com>
Cc: Michael Kerrisk <mtk.manpages@gmail.com>
Cc: Ryan Lortie <desrt@desrt.ca>
Cc: Lennart Poettering <lennart@poettering.net>
Cc: Daniel Mack <zonque@gmail.com>
Cc: Andy Lutomirski <luto@amacapital.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
fs/fcntl.c | 5 ++
include/linux/fcntl.h | 16 +++++
include/linux/shmem_fs.h | 18 ++++++
mm/shmem.c | 122 +++++++++++++++++++++++++++++++++++++++
4 files changed, 161 insertions(+)
diff --git a/fs/fcntl.c b/fs/fcntl.c
index b17b568001e..e660ffc9ac8 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -20,6 +20,7 @@
#include <linux/signal.h>
#include <linux/rcupdate.h>
#include <linux/pid_namespace.h>
+#include <linux/shmem_fs.h>
#include <asm/poll.h>
#include <asm/siginfo.h>
@@ -431,6 +432,10 @@ static long do_fcntl(int fd, unsigned int cmd, unsigned long arg,
case F_GETPIPE_SZ:
err = pipe_fcntl(filp, cmd, arg);
break;
+ case F_ADD_SEALS:
+ case F_GET_SEALS:
+ err = shmem_fcntl(filp, cmd, arg);
+ break;
default:
break;
}
diff --git a/include/linux/fcntl.h b/include/linux/fcntl.h
index f550f894ba1..2c39b178797 100644
--- a/include/linux/fcntl.h
+++ b/include/linux/fcntl.h
@@ -27,6 +27,22 @@
#define F_SETPIPE_SZ (F_LINUX_SPECIFIC_BASE + 7)
#define F_GETPIPE_SZ (F_LINUX_SPECIFIC_BASE + 8)
+/*
+ * Set/Get seals
+ */
+#define F_ADD_SEALS (F_LINUX_SPECIFIC_BASE + 9)
+#define F_GET_SEALS (F_LINUX_SPECIFIC_BASE + 10)
+
+/*
+ * Types of seals
+ */
+#define F_SEAL_SEAL 0x0001 /* prevent further seals from being set */
+#define F_SEAL_SHRINK 0x0002 /* prevent file from shrinking */
+#define F_SEAL_GROW 0x0004 /* prevent file from growing */
+#define F_SEAL_WRITE 0x0008 /* prevent writes */
+/* (1U << 31) is reserved for signed error codes */
+
+
/*
* Types of directory notifications that may be requested.
*/
diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h
index 79ab2555b3b..c08c0907bf5 100644
--- a/include/linux/shmem_fs.h
+++ b/include/linux/shmem_fs.h
@@ -1,6 +1,7 @@
#ifndef __SHMEM_FS_H
#define __SHMEM_FS_H
+#include <linux/file.h>
#include <linux/swap.h>
#include <linux/mempolicy.h>
#include <linux/pagemap.h>
@@ -11,6 +12,7 @@
struct shmem_inode_info {
spinlock_t lock;
unsigned long flags;
+ unsigned int seals; /* shmem seals */
unsigned long alloced; /* data pages alloced to file */
union {
unsigned long swapped; /* subtotal assigned to swap */
@@ -61,4 +63,20 @@ static inline struct page *shmem_read_mapping_page(
mapping_gfp_mask(mapping));
}
+#ifdef CONFIG_TMPFS
+
+extern int shmem_add_seals(struct file *file, unsigned int seals);
+extern int shmem_get_seals(struct file *file);
+extern long shmem_fcntl(struct file *file, unsigned int cmd, unsigned long arg);
+
+#else
+
+static inline long shmem_fcntl(struct file *f, unsigned int c, unsigned long a)
+{
+ return -EINVAL;
+}
+
+#endif
+
+
#endif
diff --git a/mm/shmem.c b/mm/shmem.c
index 788a30082a9..1a232f8d8cb 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -63,6 +63,7 @@ static struct vfsmount *shm_mnt;
#include <linux/highmem.h>
#include <linux/seq_file.h>
#include <linux/magic.h>
+#include <linux/fcntl.h>
#include <asm/uaccess.h>
#include <asm/pgtable.h>
@@ -560,6 +561,7 @@ EXPORT_SYMBOL_GPL(shmem_truncate_range);
static int shmem_setattr(struct dentry *dentry, struct iattr *attr)
{
struct inode *inode = dentry->d_inode;
+ struct shmem_inode_info *info = SHMEM_I(inode);
int error;
error = inode_change_ok(inode, attr);
@@ -570,6 +572,11 @@ static int shmem_setattr(struct dentry *dentry, struct iattr *attr)
loff_t oldsize = inode->i_size;
loff_t newsize = attr->ia_size;
+ /* protected by i_mutex */
+ if ((newsize < oldsize && (info->seals & F_SEAL_SHRINK)) ||
+ (newsize > oldsize && (info->seals & F_SEAL_GROW)))
+ return -EPERM;
+
if (newsize != oldsize) {
i_size_write(inode, newsize);
inode->i_ctime = inode->i_mtime = CURRENT_TIME;
@@ -1279,6 +1286,7 @@ static struct inode *shmem_get_inode(struct super_block *sb, const struct inode
info = SHMEM_I(inode);
memset(info, 0, (char *)inode - (char *)info);
spin_lock_init(&info->lock);
+ info->seals = F_SEAL_SEAL;
info->flags = flags & VM_NORESERVE;
INIT_LIST_HEAD(&info->swaplist);
INIT_LIST_HEAD(&info->xattr_list);
@@ -1332,7 +1340,17 @@ shmem_write_begin(struct file *file, struct address_space *mapping,
struct page **pagep, void **fsdata)
{
struct inode *inode = mapping->host;
+ struct shmem_inode_info *info = SHMEM_I(inode);
pgoff_t index = pos >> PAGE_CACHE_SHIFT;
+
+ /* i_mutex is held by caller */
+ if (unlikely(info->seals)) {
+ if (info->seals & F_SEAL_WRITE)
+ return -EPERM;
+ if ((info->seals & F_SEAL_GROW) && pos + len > inode->i_size)
+ return -EPERM;
+ }
+
return shmem_getpage(inode, index, pagep, SGP_WRITE, NULL);
}
@@ -1605,6 +1623,110 @@ static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos,
return error;
}
+static int shmem_wait_for_pins(struct address_space *mapping)
+{
+ return 0;
+}
+
+#define F_ALL_SEALS (F_SEAL_SEAL | \
+ F_SEAL_SHRINK | \
+ F_SEAL_GROW | \
+ F_SEAL_WRITE)
+
+int shmem_add_seals(struct file *file, unsigned int seals)
+{
+ struct inode *inode = file_inode(file);
+ struct shmem_inode_info *info = SHMEM_I(inode);
+ int error;
+
+ /*
+ * SEALING
+ * Sealing allows multiple parties to share a shmem-file but restrict
+ * access to a specific subset of file operations. Seals can only be
+ * added, but never removed. This way, mutually untrusted parties can
+ * share common memory regions with a well-defined policy. A malicious
+ * peer can thus never perform unwanted operations on a shared object.
+ *
+ * Seals are only supported on special shmem-files and always affect
+ * the whole underlying inode. Once a seal is set, it may prevent some
+ * kinds of access to the file. Currently, the following seals are
+ * defined:
+ * SEAL_SEAL: Prevent further seals from being set on this file
+ * SEAL_SHRINK: Prevent the file from shrinking
+ * SEAL_GROW: Prevent the file from growing
+ * SEAL_WRITE: Prevent write access to the file
+ *
+ * As we don't require any trust relationship between two parties, we
+ * must prevent seals from being removed. Therefore, sealing a file
+ * only adds a given set of seals to the file, it never touches
+ * existing seals. Furthermore, the "setting seals"-operation can be
+ * sealed itself, which basically prevents any further seal from being
+ * added.
+ *
+ * Semantics of sealing are only defined on volatile files. Only
+ * anonymous shmem files support sealing. More importantly, seals are
+ * never written to disk. Therefore, there's no plan to support it on
+ * other file types.
+ */
+
+ if (file->f_op != &shmem_file_operations)
+ return -EINVAL;
+ if (!(file->f_mode & FMODE_WRITE))
+ return -EPERM;
+ if (seals & ~(unsigned int)F_ALL_SEALS)
+ return -EINVAL;
+
+ mutex_lock(&inode->i_mutex);
+
+ if (info->seals & F_SEAL_SEAL) {
+ error = -EPERM;
+ goto unlock;
+ }
+
+ /* TODO: this is the place to actually apply seals to
+ * file->f_mapping, but this was not backported yet */
+
+ info->seals |= seals;
+ error = 0;
+
+unlock:
+ mutex_unlock(&inode->i_mutex);
+ return error;
+}
+EXPORT_SYMBOL_GPL(shmem_add_seals);
+
+int shmem_get_seals(struct file *file)
+{
+ if (file->f_op != &shmem_file_operations)
+ return -EINVAL;
+
+ return SHMEM_I(file_inode(file))->seals;
+}
+EXPORT_SYMBOL_GPL(shmem_get_seals);
+
+long shmem_fcntl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+ long error;
+
+ switch (cmd) {
+ case F_ADD_SEALS:
+ /* disallow upper 32bit */
+ if (arg > UINT_MAX)
+ return -EINVAL;
+
+ error = shmem_add_seals(file, arg);
+ break;
+ case F_GET_SEALS:
+ error = shmem_get_seals(file);
+ break;
+ default:
+ error = -EINVAL;
+ break;
+ }
+
+ return error;
+}
+
static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf)
{
struct shmem_sb_info *sbinfo = SHMEM_SB(dentry->d_sb);
--
2.20.1
From 01fc0d7a8df67bd3a1f2cf7ffd5f74c6fadd7c85 Mon Sep 17 00:00:00 2001
From: David Herrmann <dh.herrmann@gmail.com>
Date: Tue, 2 Jul 2019 02:36:29 +0300
Subject: [PATCH 3/5] shm: add memfd_create() syscall
memfd_create() is similar to mmap(MAP_ANON), but returns a file-descriptor
that you can pass to mmap(). It can support sealing and avoids any
connection to user-visible mount-points. Thus, it's not subject to quotas
on mounted file-systems, but can be used like malloc()'ed memory, but with
a file-descriptor to it.
memfd_create() returns the raw shmem file, so calls like ftruncate() can
be used to modify the underlying inode. Also calls like fstat() will
return proper information and mark the file as regular file. If you want
sealing, you can specify MFD_ALLOW_SEALING. Otherwise, sealing is not
supported (like on all other regular files).
Compared to O_TMPFILE, it does not require a tmpfs mount-point and is not
subject to a filesystem size limit. It is still properly accounted to
memcg limits, though, and to the same overcommit or no-overcommit
accounting as all user memory.
Signed-off-by: David Herrmann <dh.herrmann@gmail.com>
Acked-by: Hugh Dickins <hughd@google.com>
Cc: Michael Kerrisk <mtk.manpages@gmail.com>
Cc: Ryan Lortie <desrt@desrt.ca>
Cc: Lennart Poettering <lennart@poettering.net>
Cc: Daniel Mack <zonque@gmail.com>
Cc: Andy Lutomirski <luto@amacapital.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
arch/x86/syscalls/syscall_32.tbl | 2 +
arch/x86/syscalls/syscall_64.tbl | 2 +
include/linux/syscalls.h | 1 +
include/uapi/linux/memfd.h | 8 ++++
kernel/sys_ni.c | 1 +
mm/shmem.c | 74 ++++++++++++++++++++++++++++++++
6 files changed, 88 insertions(+)
create mode 100644 include/uapi/linux/memfd.h
diff --git a/arch/x86/syscalls/syscall_32.tbl b/arch/x86/syscalls/syscall_32.tbl
index 64e55260fbc..498d12b9af8 100644
--- a/arch/x86/syscalls/syscall_32.tbl
+++ b/arch/x86/syscalls/syscall_32.tbl
@@ -356,3 +356,5 @@
347 i386 process_vm_readv sys_process_vm_readv compat_sys_process_vm_readv
348 i386 process_vm_writev sys_process_vm_writev compat_sys_process_vm_writev
354 i386 seccomp sys_seccomp
+# 355 i386 getrandom sys_getrandom
+356 i386 memfd_create sys_memfd_create
diff --git a/arch/x86/syscalls/syscall_64.tbl b/arch/x86/syscalls/syscall_64.tbl
index 33335d11d7b..4887a0fa042 100644
--- a/arch/x86/syscalls/syscall_64.tbl
+++ b/arch/x86/syscalls/syscall_64.tbl
@@ -319,6 +319,8 @@
310 64 process_vm_readv sys_process_vm_readv
311 64 process_vm_writev sys_process_vm_writev
317 common seccomp sys_seccomp
+# 318 common getrandom sys_getrandom
+319 common memfd_create sys_memfd_create
#
# x32-specific system call numbers start at 512 to avoid cache impact
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index da352d5a271..c1cc0e12cf4 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -813,6 +813,7 @@ asmlinkage long sys_timerfd_settime(int ufd, int flags,
asmlinkage long sys_timerfd_gettime(int ufd, struct itimerspec __user *otmr);
asmlinkage long sys_eventfd(unsigned int count);
asmlinkage long sys_eventfd2(unsigned int count, int flags);
+asmlinkage long sys_memfd_create(const char __user *uname_ptr, unsigned int flags);
asmlinkage long sys_fallocate(int fd, int mode, loff_t offset, loff_t len);
asmlinkage long sys_old_readdir(unsigned int, struct old_linux_dirent __user *, unsigned int);
asmlinkage long sys_pselect6(int, fd_set __user *, fd_set __user *,
diff --git a/include/uapi/linux/memfd.h b/include/uapi/linux/memfd.h
new file mode 100644
index 00000000000..9a772654307
--- /dev/null
+++ b/include/uapi/linux/memfd.h
@@ -0,0 +1,8 @@
+#ifndef _UAPI_LINUX_MEMFD_H
+#define _UAPI_LINUX_MEMFD_H
+
+/* flags for memfd_create(2) (unsigned int) */
+#define MFD_CLOEXEC 0x0001U
+#define MFD_ALLOW_SEALING 0x0002U
+
+#endif /* _UAPI_LINUX_MEMFD_H */
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 026f30a8985..b97ab3a6dc9 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -191,6 +191,7 @@ cond_syscall(compat_sys_timerfd_settime);
cond_syscall(compat_sys_timerfd_gettime);
cond_syscall(sys_eventfd);
cond_syscall(sys_eventfd2);
+cond_syscall(sys_memfd_create);
/* performance counters: */
cond_syscall(sys_perf_event_open);
diff --git a/mm/shmem.c b/mm/shmem.c
index 1a232f8d8cb..3b9580b0808 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -63,7 +63,9 @@ static struct vfsmount *shm_mnt;
#include <linux/highmem.h>
#include <linux/seq_file.h>
#include <linux/magic.h>
+#include <linux/syscalls.h>
#include <linux/fcntl.h>
+#include <uapi/linux/memfd.h>
#include <asm/uaccess.h>
#include <asm/pgtable.h>
@@ -2492,6 +2494,78 @@ static int shmem_show_options(struct seq_file *seq, struct dentry *root)
shmem_show_mpol(seq, sbinfo->mpol);
return 0;
}
+
+
+#define MFD_NAME_PREFIX "memfd:"
+#define MFD_NAME_PREFIX_LEN (sizeof(MFD_NAME_PREFIX) - 1)
+#define MFD_NAME_MAX_LEN (NAME_MAX - MFD_NAME_PREFIX_LEN)
+
+#define MFD_ALL_FLAGS (MFD_CLOEXEC | MFD_ALLOW_SEALING)
+
+SYSCALL_DEFINE2(memfd_create,
+ const char __user *, uname,
+ unsigned int, flags)
+{
+ struct shmem_inode_info *info;
+ struct file *file;
+ int fd, error;
+ char *name;
+ long len;
+
+ if (flags & ~(unsigned int)MFD_ALL_FLAGS)
+ return -EINVAL;
+
+ /* length includes terminating zero */
+ len = strnlen_user(uname, MFD_NAME_MAX_LEN + 1);
+ if (len <= 0)
+ return -EFAULT;
+ if (len > MFD_NAME_MAX_LEN + 1)
+ return -EINVAL;
+
+ name = kmalloc(len + MFD_NAME_PREFIX_LEN, GFP_TEMPORARY);
+ if (!name)
+ return -ENOMEM;
+
+ strcpy(name, MFD_NAME_PREFIX);
+ if (copy_from_user(&name[MFD_NAME_PREFIX_LEN], uname, len)) {
+ error = -EFAULT;
+ goto err_name;
+ }
+
+ /* terminating-zero may have changed after strnlen_user() returned */
+ if (name[len + MFD_NAME_PREFIX_LEN - 1]) {
+ error = -EFAULT;
+ goto err_name;
+ }
+
+ fd = get_unused_fd_flags((flags & MFD_CLOEXEC) ? O_CLOEXEC : 0);
+ if (fd < 0) {
+ error = fd;
+ goto err_name;
+ }
+
+ file = shmem_file_setup(name, 0, VM_NORESERVE);
+ if (IS_ERR(file)) {
+ error = PTR_ERR(file);
+ goto err_fd;
+ }
+ info = SHMEM_I(file_inode(file));
+ file->f_mode |= FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE;
+ file->f_flags |= O_RDWR | O_LARGEFILE;
+ if (flags & MFD_ALLOW_SEALING)
+ info->seals &= ~F_SEAL_SEAL;
+
+ fd_install(fd, file);
+ kfree(name);
+ return fd;
+
+err_fd:
+ put_unused_fd(fd);
+err_name:
+ kfree(name);
+ return error;
+}
+
#endif /* CONFIG_TMPFS */
static void shmem_put_super(struct super_block *sb)
--
2.20.1
From b307cbc85399f0aeee3747e1ace50ca036f77c96 Mon Sep 17 00:00:00 2001
From: Will Deacon <will.deacon@arm.com>
Date: Tue, 2 Jul 2019 02:47:26 +0300
Subject: [PATCH 4/5] asm-generic: add memfd_create system call to unistd.h
Commit 9183df25fe7b ("shm: add memfd_create() syscall") added a new
system call (memfd_create) but didn't update the asm-generic unistd
header.
This patch adds the new system call to the asm-generic version of
unistd.h so that it can be used by architectures such as arm64.
Cc: Arnd Bergmann <arnd@arndb.de>
Reviewed-by: David Herrmann <dh.herrmann@gmail.com>
Signed-off-by: Will Deacon <will.deacon@arm.com>
---
include/asm-generic/unistd.h | 6 +++++-
1 file changed, 5 insertions(+), 1 deletion(-)
diff --git a/include/asm-generic/unistd.h b/include/asm-generic/unistd.h
index ae8513b32af..a4fafc379f5 100644
--- a/include/asm-generic/unistd.h
+++ b/include/asm-generic/unistd.h
@@ -693,9 +693,13 @@ __SC_COMP(__NR_process_vm_writev, sys_process_vm_writev, \
compat_sys_process_vm_writev)
#define __NR_seccomp 277
__SYSCALL(__NR_seccomp, sys_seccomp)
+/*#define __NR_getrandom 278
+__SYSCALL(__NR_getrandom, sys_getrandom) unfortunately getrandom is not backported yet */
+#define __NR_memfd_create 279
+__SYSCALL(__NR_memfd_create, sys_memfd_create)
#undef __NR_syscalls
-#define __NR_syscalls 278
+#define __NR_syscalls 280
/*
* All syscalls below here should go away really,
--
2.20.1
From c4dd412e68ccbe2e759d6517e902a6f3e7d8f700 Mon Sep 17 00:00:00 2001
From: Russell King <rmk+kernel@arm.linux.org.uk>