gentoo-overlay/sys-kernel/hardened-kernel/files/linux-5.10/1190-reiser4-v5.patch

diff -urN --no-dereference linux-5.10.2.orig/Documentation/filesystems/reiser4.txt linux-5.10.2/Documentation/filesystems/reiser4.txt
--- linux-5.10.2.orig/Documentation/filesystems/reiser4.txt 1970-01-01 01:00:00.000000000 +0100
+++ linux-5.10.2/Documentation/filesystems/reiser4.txt 2020-12-23 16:07:46.109812997 +0100
@@ -0,0 +1,75 @@
+Reiser4 filesystem
+==================
+Reiser4 is a file system based on dancing tree algorithms, and is
+described at http://www.namesys.com
+
+
+References
+==========
+web page http://namesys.com/v4/v4.html
+source code ftp://ftp.namesys.com/pub/reiser4-for-2.6/
+userland tools ftp://ftp.namesys.com/pub/reiser4progs/
+install page http://www.namesys.com/install_v4.html
+
+Compile options
+===============
+Enable reiser4 debug mode
+ This checks everything imaginable while reiser4
+ runs
+
+Mount options
+=============
+tmgr.atom_max_size=N
+ Atoms containing more than N blocks will be forced to commit.
+ N is decimal.
+ Default is nr_free_pagecache_pages() / 2 at mount time.
+
+tmgr.atom_max_age=N
+ Atoms older than N seconds will be forced to commit. N is decimal.
+ Default is 600.
+
+tmgr.atom_max_flushers=N
+ Limit of concurrent flushers for one atom. 0 means no limit.
+ Default is 0.
+
+tree.cbk_cache.nr_slots=N
+ Number of slots in the cbk cache.
+
+flush.relocate_threshold=N
+ If flush finds more than N adjacent dirty leaf-level blocks it
+ will force them to be relocated.
+ Default is 64.
+
+flush.relocate_distance=N
+ If flush can find a block allocation within at most N blocks of
+ the preceder, it will relocate to that position.
+ Default is 64.
+
+flush.scan_maxnodes=N
+ The maximum number of nodes to scan left on a level during
+ flush.
+ Default is 10000.
+
+optimal_io_size=N
+ Preferred IO size. This value is used to set st_blksize of
+ struct stat.
+ Default is 65536.
+
+bsdgroups
+ Turn on BSD-style gid assignment.
+
+32bittimes
+ By default, files in reiser4 have 64 bit timestamps. Files
+ created while the filesystem is mounted with the 32bittimes
+ mount option will get 32 bit timestamps.
+
+mtflush
+ Turn off concurrent flushing.
+
+nopseudo
+ Disable pseudo files support. See
+ http://namesys.com/v4/pseudo.html for more about pseudo files.
+
+dont_load_bitmap
+ Don't load all bitmap blocks at mount time; this is useful for
+ machines with tiny RAM and large disks.
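
A minimal usage sketch (not part of the patch): the options above are passed as the
comma-separated data argument of mount(2); the device, mount point and option values
below are chosen purely for illustration.

	#include <stdio.h>
	#include <sys/mount.h>

	int main(void)
	{
		/* any options from the list above may appear in the data string */
		const char *opts = "tmgr.atom_max_age=300,dont_load_bitmap";

		if (mount("/dev/sdb1", "/mnt/reiser4", "reiser4", 0, opts) != 0)
			perror("mount reiser4");
		return 0;
	}
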
diff -urN --no-dereference linux-5.10.2.orig/Documentation/process/changes.rst linux-5.10.2/Documentation/process/changes.rst
--- linux-5.10.2.orig/Documentation/process/changes.rst 2020-12-21 13:30:08.000000000 +0100
+++ linux-5.10.2/Documentation/process/changes.rst 2020-12-23 16:07:46.109812997 +0100
@@ -204,6 +204,13 @@
versions of ``mkreiserfs``, ``resize_reiserfs``, ``debugreiserfs`` and
``reiserfsck``. These utils work on both i386 and alpha platforms.
+Reiser4progs
+------------
+
+The reiser4progs package contains utilities for the reiser4 file system.
+Detailed instructions are provided in the README file located at:
+<https://github.com/edward6/reiser4progs>.
+
Xfsprogs
--------
@@ -411,6 +418,11 @@
- <https://git.kernel.org/pub/scm/linux/kernel/git/jeffm/reiserfsprogs.git/>
+Reiser4progs
+------------
+
+- <http://sourceforge.net/projects/reiser4/>
+
Xfsprogs
--------
diff -urN --no-dereference linux-5.10.2.orig/fs/fs-writeback.c linux-5.10.2/fs/fs-writeback.c
--- linux-5.10.2.orig/fs/fs-writeback.c 2020-12-21 13:30:08.000000000 +0100
+++ linux-5.10.2/fs/fs-writeback.c 2020-12-23 16:08:55.163816600 +0100
@@ -37,25 +37,6 @@
#define MIN_WRITEBACK_PAGES (4096UL >> (PAGE_SHIFT - 10))
/*
- * Passed into wb_writeback(), essentially a subset of writeback_control
- */
-struct wb_writeback_work {
- long nr_pages;
- struct super_block *sb;
- enum writeback_sync_modes sync_mode;
- unsigned int tagged_writepages:1;
- unsigned int for_kupdate:1;
- unsigned int range_cyclic:1;
- unsigned int for_background:1;
- unsigned int for_sync:1; /* sync(2) WB_SYNC_ALL writeback */
- unsigned int auto_free:1; /* free on completion */
- enum wb_reason reason; /* why was writeback initiated? */
-
- struct list_head list; /* pending work list */
- struct wb_completion *done; /* set if the caller waits */
-};
-
-/*
* If an inode is constantly having its pages dirtied, but then the
* updates stop dirtytime_expire_interval seconds in the past, it's
* possible for the worst case time between when an inode has its
@@ -1626,20 +1607,12 @@
* unlock and relock that for each inode it ends up doing
* IO for.
*/
-static long writeback_sb_inodes(struct super_block *sb,
- struct bdi_writeback *wb,
- struct wb_writeback_work *work)
+long generic_writeback_sb_inodes(struct super_block *sb,
+ struct bdi_writeback *wb,
+ struct writeback_control *wbc,
+ struct wb_writeback_work *work,
+ bool flush_all)
{
- struct writeback_control wbc = {
- .sync_mode = work->sync_mode,
- .tagged_writepages = work->tagged_writepages,
- .for_kupdate = work->for_kupdate,
- .for_background = work->for_background,
- .for_sync = work->for_sync,
- .range_cyclic = work->range_cyclic,
- .range_start = 0,
- .range_end = LLONG_MAX,
- };
unsigned long start_time = jiffies;
long write_chunk;
long wrote = 0; /* count both pages and inodes */
@@ -1678,7 +1651,7 @@
spin_unlock(&inode->i_lock);
continue;
}
- if ((inode->i_state & I_SYNC) && wbc.sync_mode != WB_SYNC_ALL) {
+ if ((inode->i_state & I_SYNC) && wbc->sync_mode != WB_SYNC_ALL) {
/*
* If this inode is locked for writeback and we are not
* doing writeback-for-data-integrity, move it to
@@ -1708,21 +1681,21 @@
continue;
}
inode->i_state |= I_SYNC;
- wbc_attach_and_unlock_inode(&wbc, inode);
+ wbc_attach_and_unlock_inode(wbc, inode);
write_chunk = writeback_chunk_size(wb, work);
- wbc.nr_to_write = write_chunk;
- wbc.pages_skipped = 0;
+ wbc->nr_to_write = write_chunk;
+ wbc->pages_skipped = 0;
/*
* We use I_SYNC to pin the inode in memory. While it is set
* evict_inode() will wait so the inode cannot be freed.
*/
- __writeback_single_inode(inode, &wbc);
+ __writeback_single_inode(inode, wbc);
- wbc_detach_inode(&wbc);
- work->nr_pages -= write_chunk - wbc.nr_to_write;
- wrote += write_chunk - wbc.nr_to_write;
+ wbc_detach_inode(wbc);
+ work->nr_pages -= write_chunk - wbc->nr_to_write;
+ wrote += write_chunk - wbc->nr_to_write;
if (need_resched()) {
/*
@@ -1745,7 +1718,7 @@
spin_lock(&inode->i_lock);
if (!(inode->i_state & I_DIRTY_ALL))
wrote++;
- requeue_inode(inode, tmp_wb, &wbc);
+ requeue_inode(inode, tmp_wb, wbc);
inode_sync_complete(inode);
spin_unlock(&inode->i_lock);
@@ -1759,7 +1732,7 @@
* background threshold and other termination conditions.
*/
if (wrote) {
- if (time_is_before_jiffies(start_time + HZ / 10UL))
+ if (!flush_all && time_is_before_jiffies(start_time + HZ / 10UL))
break;
if (work->nr_pages <= 0)
break;
@@ -1767,6 +1740,26 @@
}
return wrote;
}
+EXPORT_SYMBOL(generic_writeback_sb_inodes);
+
+long writeback_sb_inodes(struct super_block *sb,
+ struct bdi_writeback *wb,
+ struct wb_writeback_work *work)
+{
+ struct writeback_control wbc = {
+ .sync_mode = work->sync_mode,
+ .tagged_writepages = work->tagged_writepages,
+ .for_kupdate = work->for_kupdate,
+ .for_background = work->for_background,
+ .range_cyclic = work->range_cyclic,
+ .range_start = 0,
+ .range_end = LLONG_MAX,
+ };
+ if (sb->s_op->writeback_inodes)
+ return sb->s_op->writeback_inodes(sb, wb, &wbc, work, false);
+ else
+ return generic_writeback_sb_inodes(sb, wb, &wbc, work, false);
+}
static long __writeback_inodes_wb(struct bdi_writeback *wb,
struct wb_writeback_work *work)
@@ -2056,6 +2049,31 @@
}
/*
+ * This function is for file systems which have their
+ * own means of periodic write-out of old data.
+ * NOTE: inode_lock should be held.
+ *
+ * Skip a portion of b_io inodes which belong to @sb
+ * and go sequentially in reverse order.
+ */
+void writeback_skip_sb_inodes(struct super_block *sb,
+ struct bdi_writeback *wb)
+{
+ while (1) {
+ struct inode *inode;
+
+ if (list_empty(&wb->b_io))
+ break;
+ inode = wb_inode(wb->b_io.prev);
+ if (sb != inode->i_sb)
+ break;
+ redirty_tail(inode, wb);
+ }
+}
+EXPORT_SYMBOL(writeback_skip_sb_inodes);
+
+
+/*
* Handle writeback of dirty data for the device backed by this bdi. Also
* reschedules periodically and does kupdated style flushing.
*/
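
The hunks above split writeback_sb_inodes() so that a filesystem can supply its own
super_operations ->writeback_inodes hook while reusing the exported helpers. A sketch
of one possible hook, assuming the signature used at the call site above and a
hypothetical example_kick_own_flusher() helper:

	static long example_writeback_inodes(struct super_block *sb,
					     struct bdi_writeback *wb,
					     struct writeback_control *wbc,
					     struct wb_writeback_work *work,
					     bool flush_all)
	{
		if (wbc->sync_mode != WB_SYNC_ALL) {
			/* defer ordinary writeback to the filesystem's own
			 * flusher (hypothetical helper) and skip our inodes
			 * on this bdi pass */
			example_kick_own_flusher(sb);
			writeback_skip_sb_inodes(sb, wb);
			return 0;
		}
		/* data-integrity writeback still uses the generic path */
		return generic_writeback_sb_inodes(sb, wb, wbc, work, flush_all);
	}
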
diff -urN --no-dereference linux-5.10.2.orig/fs/Kconfig linux-5.10.2/fs/Kconfig
--- linux-5.10.2.orig/fs/Kconfig 2020-12-21 13:30:08.000000000 +0100
+++ linux-5.10.2/fs/Kconfig 2020-12-23 16:07:46.112813041 +0100
@@ -31,6 +31,7 @@
default y if EXT4_FS=y
default m if EXT2_FS_XATTR || EXT4_FS
+source "fs/reiser4/Kconfig"
source "fs/reiserfs/Kconfig"
source "fs/jfs/Kconfig"
diff -urN --no-dereference linux-5.10.2.orig/fs/Makefile linux-5.10.2/fs/Makefile
--- linux-5.10.2.orig/fs/Makefile 2020-12-21 13:30:08.000000000 +0100
+++ linux-5.10.2/fs/Makefile 2020-12-23 16:07:46.112813041 +0100
@@ -70,6 +70,7 @@
# Do not add any filesystems before this line
obj-$(CONFIG_FSCACHE) += fscache/
obj-$(CONFIG_REISERFS_FS) += reiserfs/
+obj-$(CONFIG_REISER4_FS) += reiser4/
obj-$(CONFIG_EXT4_FS) += ext4/
# We place ext4 before ext2 so that clean ext3 root fs's do NOT mount using the
# ext2 driver, which doesn't know about journalling! Explicitly request ext2
diff -urN --no-dereference linux-5.10.2.orig/fs/read_write.c linux-5.10.2/fs/read_write.c
--- linux-5.10.2.orig/fs/read_write.c 2020-12-21 13:30:08.000000000 +0100
+++ linux-5.10.2/fs/read_write.c 2020-12-23 16:07:46.113813056 +0100
@@ -233,12 +233,11 @@
}
EXPORT_SYMBOL(no_llseek);
-loff_t default_llseek(struct file *file, loff_t offset, int whence)
+loff_t default_llseek_unlocked(struct file *file, loff_t offset, int whence)
{
struct inode *inode = file_inode(file);
loff_t retval;
- inode_lock(inode);
switch (whence) {
case SEEK_END:
offset += i_size_read(inode);
@@ -283,9 +282,19 @@
retval = offset;
}
out:
- inode_unlock(inode);
return retval;
}
+EXPORT_SYMBOL(default_llseek_unlocked);
+
+loff_t default_llseek(struct file *file, loff_t offset, int origin)
+{
+ loff_t retval;
+
+ inode_lock(file_inode(file));
+ retval = default_llseek_unlocked(file, offset, origin);
+ inode_unlock(file_inode(file));
+ return retval;
+}
EXPORT_SYMBOL(default_llseek);
loff_t vfs_llseek(struct file *file, loff_t offset, int whence)
@@ -401,7 +410,7 @@
read_write == READ ? MAY_READ : MAY_WRITE);
}
-static ssize_t new_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos)
+ssize_t new_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos)
{
struct iovec iov = { .iov_base = buf, .iov_len = len };
struct kiocb kiocb;
@@ -418,6 +427,7 @@
*ppos = kiocb.ki_pos;
return ret;
}
+EXPORT_SYMBOL(new_sync_read);
static int warn_unsupported(struct file *file, const char *op)
{
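
With the hunk above, default_llseek() becomes a thin locking wrapper around the
exported default_llseek_unlocked(). A sketch of the intended use, assuming a
filesystem that already serializes seeks through its own (hypothetical)
example_lock()/example_unlock() helpers:

	static loff_t example_llseek(struct file *file, loff_t offset, int whence)
	{
		loff_t ret;

		/* fs-specific serialization is taken here, so reuse the
		 * unlocked core instead of taking inode_lock() a second time */
		example_lock(file_inode(file));
		ret = default_llseek_unlocked(file, offset, whence);
		example_unlock(file_inode(file));
		return ret;
	}
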
diff -urN --no-dereference linux-5.10.2.orig/fs/reiser4/as_ops.c linux-5.10.2/fs/reiser4/as_ops.c
--- linux-5.10.2.orig/fs/reiser4/as_ops.c 1970-01-01 01:00:00.000000000 +0100
+++ linux-5.10.2/fs/reiser4/as_ops.c 2020-12-23 16:07:46.113813056 +0100
@@ -0,0 +1,348 @@
+/* Copyright 2003 by Hans Reiser, licensing governed by reiser4/README */
+
+/* Interface to VFS. Reiser4 address_space_operations are defined here. */
+
+#include "forward.h"
+#include "debug.h"
+#include "dformat.h"
+#include "coord.h"
+#include "plugin/item/item.h"
+#include "plugin/file/file.h"
+#include "plugin/security/perm.h"
+#include "plugin/disk_format/disk_format.h"
+#include "plugin/plugin.h"
+#include "plugin/plugin_set.h"
+#include "plugin/object.h"
+#include "txnmgr.h"
+#include "jnode.h"
+#include "znode.h"
+#include "block_alloc.h"
+#include "tree.h"
+#include "vfs_ops.h"
+#include "inode.h"
+#include "page_cache.h"
+#include "ktxnmgrd.h"
+#include "super.h"
+#include "reiser4.h"
+#include "entd.h"
+
+#include <linux/profile.h>
+#include <linux/types.h>
+#include <linux/mount.h>
+#include <linux/vfs.h>
+#include <linux/mm.h>
+#include <linux/buffer_head.h>
+#include <linux/dcache.h>
+#include <linux/list.h>
+#include <linux/pagemap.h>
+#include <linux/slab.h>
+#include <linux/seq_file.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/writeback.h>
+#include <linux/backing-dev.h>
+#include <linux/security.h>
+#include <linux/migrate.h>
+
+/* address space operations */
+
+/**
+ * reiser4_set_page_dirty - set dirty bit, tag in page tree, dirty accounting
+ * @page: page to be dirtied
+ *
+ * Operation of struct address_space_operations. This implementation is used by
+ * unix and cryptcompress file plugins.
+ *
+ * This is called when reiser4 page gets dirtied outside of reiser4, for
+ * example, when dirty bit is moved from pte to physical page.
+ *
+ * Tags page in the mapping's page tree with special tag so that it is possible
+ * to do all the reiser4 specific work wrt dirty pages (jnode creation,
+ * capturing by an atom) later because it can not be done in the contexts where
+ * set_page_dirty is called.
+ */
+int reiser4_set_page_dirty(struct page *page)
+{
+ /* this page can be unformatted only */
+ assert("vs-1734", (page->mapping &&
+ page->mapping->host &&
+ reiser4_get_super_fake(page->mapping->host->i_sb) !=
+ page->mapping->host &&
+ reiser4_get_cc_fake(page->mapping->host->i_sb) !=
+ page->mapping->host &&
+ reiser4_get_bitmap_fake(page->mapping->host->i_sb) !=
+ page->mapping->host));
+ return __set_page_dirty_nobuffers(page);
+}
+
+/* ->invalidatepage method for reiser4 */
+
+/*
+ * this is called for each truncated page from
+ * truncate_inode_pages()->truncate_{complete,partial}_page().
+ *
+ * At the moment of call, page is under lock, and outstanding io (if any) has
+ * completed.
+ */
+
+/**
+ * reiser4_invalidatepage
+ * @page: page to invalidate
+ * @offset: starting offset for partial invalidation
+ * @length: length of the range being invalidated
+ */
+void reiser4_invalidatepage(struct page *page, unsigned int offset, unsigned int length)
+{
+ int ret = 0;
+ int partial_page = (offset || length < PAGE_SIZE);
+ reiser4_context *ctx;
+ struct inode *inode;
+ jnode *node;
+
+ /*
+ * This is called to truncate file's page.
+ *
+ * Originally, reiser4 implemented truncate in a standard way
+ * (vmtruncate() calls ->invalidatepage() on all truncated pages
+ * first, then file system ->truncate() call-back is invoked).
+ *
+ * This led to a problem where ->invalidatepage() was called on a
+ * page with jnode that was captured into atom in ASTAGE_PRE_COMMIT
+ * process. That is, truncate was bypassing transactions. To avoid
+ * this, try_capture_page_to_invalidate() call was added here.
+ *
+ * After many troubles with vmtruncate() based truncate (including
+ * races with flush, tail conversion, etc.) it was re-written in the
+ * top-to-bottom style: items are killed in reiser4_cut_tree_object()
+ * and pages belonging to extent are invalidated in kill_hook_extent().
+ * So probably now additional call to capture is not needed here.
+ */
+
+ assert("nikita-3137", PageLocked(page));
+ assert("nikita-3138", !PageWriteback(page));
+ inode = page->mapping->host;
+
+ /*
+ * ->invalidatepage() should only be called for the unformatted
+ * jnodes. Destruction of all other types of jnodes is performed
+ * separately. But, during some corner cases (like handling errors
+ * during mount) it is simpler to let ->invalidatepage be called on
+ * them. Check for this, and do nothing.
+ */
+ if (reiser4_get_super_fake(inode->i_sb) == inode)
+ return;
+ if (reiser4_get_cc_fake(inode->i_sb) == inode)
+ return;
+ if (reiser4_get_bitmap_fake(inode->i_sb) == inode)
+ return;
+ assert("vs-1426", PagePrivate(page));
+ assert("vs-1427",
+ page->mapping == jnode_get_mapping(jnode_by_page(page)));
+ assert("", jprivate(page) != NULL);
+ assert("", ergo(inode_file_plugin(inode) !=
+ file_plugin_by_id(CRYPTCOMPRESS_FILE_PLUGIN_ID),
+ offset == 0));
+
+ ctx = reiser4_init_context(inode->i_sb);
+ if (IS_ERR(ctx))
+ return;
+
+ node = jprivate(page);
+ spin_lock_jnode(node);
+ if (!(node->state & ((1 << JNODE_DIRTY) | (1 << JNODE_FLUSH_QUEUED) |
+ (1 << JNODE_WRITEBACK) | (1 << JNODE_OVRWR)))) {
+ /* there is no need to capture */
+ jref(node);
+ JF_SET(node, JNODE_HEARD_BANSHEE);
+ page_clear_jnode(page, node);
+ reiser4_uncapture_jnode(node);
+ unhash_unformatted_jnode(node);
+ jput(node);
+ reiser4_exit_context(ctx);
+ return;
+ }
+ spin_unlock_jnode(node);
+
+ /* capture page being truncated. */
+ ret = try_capture_page_to_invalidate(page);
+ if (ret != 0)
+ warning("nikita-3141", "Cannot capture: %i", ret);
+
+ if (!partial_page) {
+ /* remove jnode from transaction and detach it from page. */
+ jref(node);
+ JF_SET(node, JNODE_HEARD_BANSHEE);
+ /* page cannot be detached from jnode concurrently, because it
+ * is locked */
+ reiser4_uncapture_page(page);
+
+ /* this detaches page from jnode, so that jdelete will not try
+ * to lock page which is already locked */
+ spin_lock_jnode(node);
+ page_clear_jnode(page, node);
+ spin_unlock_jnode(node);
+ unhash_unformatted_jnode(node);
+
+ jput(node);
+ }
+
+ reiser4_exit_context(ctx);
+}
+
+/* helper function called from reiser4_releasepage(). It returns true if jnode
+ * can be detached from its page and page released. */
+int jnode_is_releasable(jnode * node/* node to check */)
+{
+ assert("nikita-2781", node != NULL);
+ assert_spin_locked(&(node->guard));
+ assert_spin_locked(&(node->load));
+
+ /* if some thread is currently using the jnode page, the latter cannot
+ * be detached */
+ if (atomic_read(&node->d_count) != 0)
+ return 0;
+
+ assert("vs-1214", !jnode_is_loaded(node));
+
+ /*
+ * can only release page if real block number is assigned to it. Simple
+ * check for ->atom wouldn't do, because it is possible for node to be
+ * clean, not in an atom yet, and still have a fake block number. For
+ * example, node just created in jinit_new().
+ */
+ if (reiser4_blocknr_is_fake(jnode_get_block(node)))
+ return 0;
+
+ /*
+ * pages prepared for write can not be released anyway, so avoid
+ * detaching jnode from the page
+ */
+ if (JF_ISSET(node, JNODE_WRITE_PREPARED))
+ return 0;
+
+ /*
+ * dirty jnode cannot be released. It can however be submitted to disk
+ * as part of early flushing, but only after getting flush-prepped.
+ */
+ if (JF_ISSET(node, JNODE_DIRTY))
+ return 0;
+
+ /* overwrite set is only written by log writer. */
+ if (JF_ISSET(node, JNODE_OVRWR))
+ return 0;
+
+ /* jnode is already under writeback */
+ if (JF_ISSET(node, JNODE_WRITEBACK))
+ return 0;
+
+ /* don't flush bitmaps or journal records */
+ if (!jnode_is_znode(node) && !jnode_is_unformatted(node))
+ return 0;
+
+ return 1;
+}
+
+/*
+ * ->releasepage method for reiser4
+ *
+ * This is called by VM scanner when it comes across clean page. What we have
+ * to do here is to check whether page can really be released (freed that is)
+ * and if so, detach jnode from it and remove page from the page cache.
+ *
+ * The check for releasability is done by jnode_is_releasable().
+ */
+int reiser4_releasepage(struct page *page, gfp_t gfp UNUSED_ARG)
+{
+ jnode *node;
+
+ assert("nikita-2257", PagePrivate(page));
+ assert("nikita-2259", PageLocked(page));
+ assert("nikita-2892", !PageWriteback(page));
+ assert("nikita-3019", reiser4_schedulable());
+
+ /* NOTE-NIKITA: this can be called in the context of reiser4 call. It
+ is not clear what to do in this case. A lot of deadlocks seem to be
+ possible. */
+
+ node = jnode_by_page(page);
+ assert("nikita-2258", node != NULL);
+ assert("reiser4-4", page->mapping != NULL);
+ assert("reiser4-5", page->mapping->host != NULL);
+
+ if (PageDirty(page))
+ return 0;
+
+ /* extra page reference is used by reiser4 to protect
+ * jnode<->page link from this ->releasepage(). */
+ if (page_count(page) > 3)
+ return 0;
+
+ /* jnode_is_releasable() needs the jnode lock, because it looks at the jnode
+ * fields and we need jload_lock here to avoid races with jload(). */
+ spin_lock_jnode(node);
+ spin_lock(&(node->load));
+ if (jnode_is_releasable(node)) {
+ struct address_space *mapping;
+
+ mapping = page->mapping;
+ jref(node);
+ /* there is no need to synchronize against
+ * jnode_extent_write() here, because pages seen by
+ * jnode_extent_write() are !releasable(). */
+ page_clear_jnode(page, node);
+ spin_unlock(&(node->load));
+ spin_unlock_jnode(node);
+
+ /* we are under memory pressure so release jnode also. */
+ jput(node);
+
+ return 1;
+ } else {
+ spin_unlock(&(node->load));
+ spin_unlock_jnode(node);
+ assert("nikita-3020", reiser4_schedulable());
+ return 0;
+ }
+}
+
+#ifdef CONFIG_MIGRATION
+int reiser4_migratepage(struct address_space *mapping, struct page *newpage,
+ struct page *page, enum migrate_mode mode)
+{
+ /* TODO: implement movable mapping
+ */
+ return -EIO;
+}
+#endif /* CONFIG_MIGRATION */
+
+int reiser4_readpage_dispatch(struct file *file, struct page *page)
+{
+ assert("edward-1533", PageLocked(page));
+ assert("edward-1534", !PageUptodate(page));
+ assert("edward-1535", page->mapping && page->mapping->host);
+
+ return inode_file_plugin(page->mapping->host)->readpage(file, page);
+}
+
+int reiser4_readpages_dispatch(struct file *file, struct address_space *mapping,
+ struct list_head *pages, unsigned nr_pages)
+{
+ return inode_file_plugin(mapping->host)->readpages(file, mapping,
+ pages, nr_pages);
+}
+
+int reiser4_writepages_dispatch(struct address_space *mapping,
+ struct writeback_control *wbc)
+{
+ return inode_file_plugin(mapping->host)->writepages(mapping, wbc);
+}
+
+/* Make Linus happy.
+ Local variables:
+ c-indentation-style: "K&R"
+ mode-name: "LC"
+ c-basic-offset: 8
+ tab-width: 8
+ fill-column: 120
+ End:
+*/
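
The methods defined above are the building blocks of reiser4's address_space_operations.
One plausible wiring, shown only as a sketch (the actual table lives in a part of the
patch outside this excerpt):

	static const struct address_space_operations example_reiser4_aops = {
		.readpage	= reiser4_readpage_dispatch,
		.readpages	= reiser4_readpages_dispatch,
		.writepages	= reiser4_writepages_dispatch,
		.set_page_dirty	= reiser4_set_page_dirty,
		.invalidatepage	= reiser4_invalidatepage,
		.releasepage	= reiser4_releasepage,
	#ifdef CONFIG_MIGRATION
		.migratepage	= reiser4_migratepage,
	#endif
	};
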
diff -urN --no-dereference linux-5.10.2.orig/fs/reiser4/block_alloc.c linux-5.10.2/fs/reiser4/block_alloc.c
--- linux-5.10.2.orig/fs/reiser4/block_alloc.c 1970-01-01 01:00:00.000000000 +0100
+++ linux-5.10.2/fs/reiser4/block_alloc.c 2020-12-23 16:07:46.113813056 +0100
@@ -0,0 +1,1395 @@
+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
+reiser4/README */
+
+#include "debug.h"
+#include "dformat.h"
+#include "plugin/plugin.h"
+#include "txnmgr.h"
+#include "znode.h"
+#include "block_alloc.h"
+#include "tree.h"
+#include "super.h"
+#include "discard.h"
+
+#include <linux/types.h> /* for __u?? */
+#include <linux/fs.h> /* for struct super_block */
+#include <linux/spinlock.h>
+
+/* THE REISER4 DISK SPACE RESERVATION SCHEME. */
+
+/* We need to be able to reserve enough disk space to ensure that an atomic
+ operation will have enough disk space to flush (see flush.c and
+ http://namesys.com/v4/v4.html) and commit it once it is started.
+
+ In our design a call for reserving disk space may fail but not an actual
+ block allocation.
+
+ All free blocks, already allocated blocks, and all kinds of reserved blocks
+ are counted in different per-fs block counters.
+
+ A reiser4 super block's set of block counters currently is:
+
+ free -- free blocks,
+ used -- already allocated blocks,
+
+ grabbed -- initially reserved for performing an fs operation, those blocks
+ are taken from free blocks, then grabbed disk space leaks from grabbed
+ blocks counter to other counters like "fake allocated", "flush
+ reserved", "used", the rest of not used grabbed space is returned to
+ free space at the end of fs operation;
+
+ fake allocated -- counts all nodes without real disk block numbers assigned,
+ we have separate accounting for formatted and unformatted
+ nodes (for easier debugging);
+
+ flush reserved -- disk space needed for flushing and committing an atom.
+ Each dirty already allocated block could be written as a
+ part of atom's overwrite set or as a part of atom's
+ relocate set. In both cases one additional block is needed;
+ it is used as a wandered block if we do overwrite or as a
+ new location for a relocated block.
+
+ In addition, blocks in some states are counted on per-thread and per-atom
+ basis. A reiser4 context has a counter of blocks grabbed by this transaction
+ and the sb's grabbed blocks counter is a sum of grabbed blocks counter values
+ of each reiser4 context. Each reiser4 atom has a counter of "flush reserved"
+ blocks, which are reserved for flush processing and atom commit, and the sb's
+ counter of flush reserved blocks is a sum of respective counters of each atom
+*/
+
+/* AN EXAMPLE: suppose we insert new item to the reiser4 tree. We estimate
+ number of blocks to grab for most expensive case of balancing when the leaf
+ node we insert new item to gets split and new leaf node is allocated.
+
+ So, we need to grab blocks for
+
+ 1) one block for possibly dirtying the node we insert an item into. That block
+ would be used for node relocation at flush time or for allocating a
+ wandered one, depending on the result (which set, relocate or
+ overwrite, the node gets assigned to) of the node's processing by the flush
+ algorithm.
+
+ 2) one block for either allocating a new node, or dirtying of right or left
+ clean neighbor, only one case may happen.
+
+ VS-FIXME-HANS: why can only one case happen? I would expect to see dirtying
+ of left neighbor, right neighbor, current node, and creation of new node.
+ Have I forgotten something? email me.
+
+ These grabbed blocks are counted in both reiser4 context "grabbed blocks"
+ counter and in the fs-wide one (both ctx->ctx_grabbed_blocks and
+ sbinfo->blocks_grabbed get incremented by 2), sb's free blocks counter is
+ decremented by 2.
+
+ Suppose both blocks were spent for dirtying an already allocated clean
+ node (one block went from "grabbed" to "flush reserved") and for new block
+ allocating (one block went from "grabbed" to "fake allocated formatted").
+
+ Inserting of a child pointer to the parent node caused parent node to be
+ split; the balancing code takes care of this by grabbing the necessary space
+ immediately, calling reiser4_grab with the BA_RESERVED flag set, which means
+ "can use the 5% reserved disk space".
+
+ At this moment insertion completes and grabbed blocks (if they were not used)
+ should be returned to the free space counter.
+
+ However the atom life-cycle is not completed. The atom had one "flush
+ reserved" block added by our insertion and the new fake allocated node is
+ counted as a "fake allocated formatted" one. The atom has to be fully
+ processed by flush before commit. Suppose that the flush moved the first,
+ already allocated node to the atom's overwrite list; the new fake allocated
+ node, obviously, went into the atom's relocate set. The reiser4 flush
+ allocates the new node using one unit from "fake allocated formatted"
+ counter, the log writer uses one from "flush reserved" for wandered block
+ allocation.
+
+ And, it is not the end. When the wandered block is deallocated after the
+ atom gets fully played (see wander.c for term description), the disk space
+ occupied for it is returned to free blocks. */
+
+/* BLOCK NUMBERS */
+
+/* Any reiser4 node has a block number assigned to it. We use these numbers for
+ indexing in hash tables, so if a block has not yet been assigned a location
+ on disk we need to give it a temporary fake block number.
+
+ Current implementation of reiser4 uses 64-bit integers for block numbers. We
+ use highest bit in 64-bit block number to distinguish fake and real block
+ numbers. So, only 63 bits may be used to addressing of real device
+ blocks. That "fake" block numbers space is divided into subspaces of fake
+ block numbers for data blocks and for shadow (working) bitmap blocks.
+
+ Fake block numbers for data blocks are generated by a cyclic counter, which
+ gets incremented after each real block allocation. We assume that it is
+ impossible for this counter to wrap around during the life of one transaction.
+
+/* Initialize a blocknr hint. */
+void reiser4_blocknr_hint_init(reiser4_blocknr_hint * hint)
+{
+ memset(hint, 0, sizeof(reiser4_blocknr_hint));
+}
+
+/* Release any resources of a blocknr hint. */
+void reiser4_blocknr_hint_done(reiser4_blocknr_hint * hint UNUSED_ARG)
+{
+/* No resources should be freed in current blocknr_hint implementation. */
+}
+
+/* see above for explanation of fake block number. */
+/* Audited by: green(2002.06.11) */
+int reiser4_blocknr_is_fake(const reiser4_block_nr * da)
+{
+ /* The reason for not simply returning result of '&' operation is that
+ while return value is (possibly 32bit) int, the reiser4_block_nr is
+ at least 64 bits long, and high bit (which is the only possible
+ non zero bit after the masking) would be stripped off */
+ return (*da & REISER4_FAKE_BLOCKNR_BIT_MASK) ? 1 : 0;
+}
+
+__u64 ctx_subvol_grabbed(reiser4_context *ctx, __u32 subv_id)
+{
+ struct ctx_brick_info *cbi;
+
+ cbi = find_context_brick_info(ctx, subv_id);
+ return cbi != NULL ? cbi->grabbed_blocks : 0;
+}
+
+/*
+ * Static functions for <reiser4 super block>/<reiser4 context>
+ * block counters arithmetic. Mostly, they are isolated to not
+ * to code same assertions in several places
+ */
+
+static inline void sub_from_cbi_grabbed(struct ctx_brick_info *cbi,
+ __u64 count)
+{
+ if (count != 0) {
+ assert("edward-1976", cbi != NULL);
+ assert("zam-527", cbi->grabbed_blocks >= count);
+ BUG_ON(cbi == NULL);
+ BUG_ON(cbi->grabbed_blocks < count);
+
+ cbi->grabbed_blocks -= count;
+ }
+}
+
+static inline void sub_from_ctx_grabbed(reiser4_context *ctx,
+ u64 count, reiser4_subvol *subv)
+{
+ sub_from_cbi_grabbed(find_context_brick_info(ctx, subv->id),
+ count);
+}
+
+static inline void add_to_cbi_grabbed(struct ctx_brick_info *cbi, u64 count)
+{
+ cbi->grabbed_blocks += count;
+}
+
+static void add_to_ctx_grabbed(reiser4_context *ctx,
+ __u64 count, __u32 subv_id)
+{
+ struct ctx_brick_info *cbi;
+
+ cbi = find_context_brick_info(ctx, subv_id);
+ if (cbi == NULL) {
+ assert("edward-1989", reiser4_schedulable());
+
+ cbi = alloc_context_brick_info();
+ BUG_ON(cbi == NULL);
+ init_context_brick_info(cbi, subv_id);
+ insert_context_brick_info(ctx, cbi);
+ }
+ add_to_cbi_grabbed(cbi, count);
+}
+
+static void sub_from_subvol_grabbed(reiser4_subvol *subv, __u64 count)
+{
+ assert("zam-525", subv->blocks_grabbed >= count);
+ subv->blocks_grabbed -= count;
+}
+
+static void sub_from_subvol_flush_reserved(reiser4_subvol *subv,
+ __u64 count)
+{
+ assert("edward-1990", subvol_check_block_counters(subv));
+ assert("vpf-291", subv->blocks_flush_reserved >= count);
+
+ subv->blocks_flush_reserved -= count;
+}
+
+static void sub_from_subvol_fake_allocated(reiser4_subvol *subv,
+ __u64 count, reiser4_ba_flags_t flags)
+{
+ if (flags & BA_FORMATTED) {
+ assert("zam-806", subv->blocks_fake_allocated >= count);
+ subv->blocks_fake_allocated -= count;
+ } else {
+ assert("zam-528",
+ subv->blocks_fake_allocated_unformatted >= count);
+ subv->blocks_fake_allocated_unformatted -= count;
+ }
+}
+
+static void sub_from_subvol_used(reiser4_subvol *subv, __u64 count)
+{
+ assert("zam-530",
+ subv->blocks_used >= count + subv->min_blocks_used);
+
+ subv->blocks_used -= count;
+}
+
+static void sub_from_cluster_reserved(reiser4_subvol *subv, __u64 count)
+{
+ assert("edward-501", subv->blocks_clustered >= count);
+ subv->blocks_clustered -= count;
+}
+
+static inline void add_to_abi_flush_reserved(struct atom_brick_info *abi,
+ u32 count)
+{
+ assert("edward-1991", abi != NULL);
+ BUG_ON(abi == NULL);
+
+ abi->atom_flush_reserved += count;
+}
+
+static void add_to_atom_flush_reserved(txn_atom *atom, __u32 count,
+ __u32 subv_id)
+{
+ assert("zam-772", atom != NULL);
+ assert_spin_locked(&(atom->alock));
+
+ add_to_abi_flush_reserved(find_atom_brick_info(&atom->bricks_info,
+ subv_id), count);
+}
+
+static void sub_from_abi_flush_reserved(struct atom_brick_info *abi,
+ u32 count)
+{
+ assert("edward-1992", abi != NULL);
+ assert("nikita-2790", abi->atom_flush_reserved >= count);
+
+ abi->atom_flush_reserved -= count;
+}
+
+/*
+ * subvolume has 7 counters: free, used, grabbed, fake allocated
+ * (formatted and unformatted), clustered and flush reserved.
+ * Their sum must equal the number of blocks on the device.
+ */
+int subvol_check_block_counters(const reiser4_subvol *subv)
+{
+ __u64 sum;
+
+ sum =
+ reiser4_subvol_grabbed_blocks(subv) +
+ reiser4_subvol_free_blocks(subv) +
+ reiser4_subvol_used_blocks(subv) +
+ reiser4_subvol_fake_allocated_fmt(subv) +
+ reiser4_subvol_fake_allocated_unf(subv) +
+ reiser4_subvol_flush_reserved(subv) +
+ reiser4_subvol_clustered_blocks(subv);
+
+ if (reiser4_subvol_block_count(subv) != sum) {
+ printk("subvol (%s) block counters: "
+ "used %llu, free %llu, "
+ "grabbed %llu, "
+ "fake allocated (formatetd %llu, unformatted %llu), "
+ "reserved %llu, clustered %llu, "
+ "sum %llu, must be (block count) %llu\n",
+ subv->name,
+ (unsigned long long)reiser4_subvol_used_blocks(subv),
+ (unsigned long long)reiser4_subvol_free_blocks(subv),
+ (unsigned long long)reiser4_subvol_grabbed_blocks(subv),
+ (unsigned long long)reiser4_subvol_fake_allocated_fmt(subv),
+ (unsigned long long)reiser4_subvol_fake_allocated_unf(subv),
+ (unsigned long long)reiser4_subvol_flush_reserved(subv),
+ (unsigned long long)reiser4_subvol_clustered_blocks(subv),
+ (unsigned long long)sum,
+ (unsigned long long)reiser4_subvol_block_count(subv));
+ return 0;
+ }
+ return 1;
+}
+
+/* Adjust "working" free blocks counter for number of blocks we are going to
+ allocate. Record number of grabbed blocks in fs-wide and per-thread
+ counters. This function should be called before bitmap scanning or
+ allocating fake block numbers
+
+ @super -- pointer to reiser4 super block;
+ @count -- number of blocks we reserve;
+
+ @return -- 0 on success, -ENOSPC if all
+ free blocks are reserved or already allocated.
+*/
+
+static int reiser4_grab(reiser4_context *ctx, __u64 count,
+ reiser4_ba_flags_t flags, reiser4_subvol *subv)
+{
+ __u64 free_blocks;
+ int use_reserved = flags & BA_RESERVED;
+ reiser4_super_info_data *sbinfo;
+
+ assert("vs-1276", ctx == get_current_context());
+ /*
+ * Do not grab anything on ro-mounted fs
+ */
+ if (sb_rdonly(ctx->super)) {
+ ctx->grab_enabled = 0;
+ ctx->ro = 1;
+ return 0;
+ }
+ sbinfo = get_super_private(ctx->super);
+
+ spin_lock_reiser4_super(sbinfo);
+
+ free_blocks = subv->blocks_free;
+
+ if ((use_reserved && free_blocks < count) ||
+ (!use_reserved && free_blocks < count + subv->blocks_reserved)) {
+
+ spin_unlock_reiser4_super(sbinfo);
+ return RETERR(-ENOSPC);
+ }
+ subv->blocks_grabbed += count;
+ subv->blocks_free -= count;
+
+ assert("nikita-2986", subvol_check_block_counters(subv));
+
+ spin_unlock_reiser4_super(sbinfo);
+
+ add_to_ctx_grabbed(ctx, count, subv->id);
+ /*
+ * disable grab space in current context
+ */
+ ctx->grab_enabled = 0;
+ return 0;
+}
+
+int reiser4_grab_space(__u64 count, reiser4_ba_flags_t flags,
+ reiser4_subvol *subv)
+{
+ int ret;
+ reiser4_context *ctx;
+
+ assert("nikita-2964",
+ ergo(flags & BA_CAN_COMMIT,
+ lock_stack_isclean(get_current_lock_stack())));
+ ctx = get_current_context();
+
+ if (!(flags & BA_FORCE) && !is_grab_enabled(ctx))
+ return 0;
+
+ ret = reiser4_grab(ctx, count, flags, subv);
+ if (ret == -ENOSPC) {
+ /*
+ * Try to commit all transactions
+ * if the BA_CAN_COMMIT flag is present
+ */
+ if (flags & BA_CAN_COMMIT) {
+ txnmgr_force_commit_all(ctx->super, 0);
+ ctx->grab_enabled = 1;
+ ret = reiser4_grab(ctx, count, flags, subv);
+ }
+ }
+ /*
+ * allocation from the reserved pool cannot fail; failure here is a severe error.
+ */
+ assert("nikita-3005", ergo(flags & BA_RESERVED, ret == 0));
+ return ret;
+}
+
+/**
+ * SPACE RESERVATION FOR UNLINK/TRUNCATE
+ *
+ * Unlink and truncate require space in a transaction (to update stat data,
+ * at least). But we don't want rm(1) to fail with a "No space on device" error.
+ *
+ * Solution is to reserve 5% of disk space for truncates and unlinks.
+ * Specifically, normal space grabbing requests don't grab space from
+ * reserved area. Only requests with BA_RESERVED bit in flags are allowed
+ * to drain it. Per super block delete mutex is used to allow only one
+ * thread at a time to grab from reserved area.
+ *
+ * Grabbing from reserved area should always be performed with BA_CAN_COMMIT
+ * flag.
+ *
+ * FIXME-EDWARD: Make delete mutex per-subvolume (not per-superblock).
+ */
+int reiser4_grab_reserved(struct super_block *super, __u64 count,
+ reiser4_ba_flags_t flags, reiser4_subvol *subv)
+{
+ int ret;
+ reiser4_super_info_data *sbinfo = get_super_private(super);
+
+ assert("nikita-3175", flags & BA_CAN_COMMIT);
+ /*
+ * Check whether the delete mutex is already taken by us.
+ * We assume that reading a machine word is atomic
+ */
+ if (sbinfo->delete_mutex_owner == current) {
+ ret = reiser4_grab_space(count,
+ (flags | BA_RESERVED) & ~BA_CAN_COMMIT,
+ subv);
+ if (ret) {
+ warning("zam-1003",
+ "nested call of grab_reserved fails count=(%llu)",
+ (unsigned long long)count);
+ reiser4_release_reserved(super);
+ return RETERR(-ENOSPC);
+ }
+ return 0;
+ }
+ /*
+ * first, try to grab space without reservation flag
+ */
+ ret = reiser4_grab_space(count, flags, subv);
+ if (ret) {
+ /*
+ * normal grab failed, so try to grab from reserved area
+ */
+ mutex_lock(&sbinfo->delete_mutex);
+ assert("nikita-2929", sbinfo->delete_mutex_owner == NULL);
+ sbinfo->delete_mutex_owner = current;
+
+ ret = reiser4_grab_space(count, flags | BA_RESERVED, subv);
+ if (ret){
+ warning("zam-833",
+ "reserved space is not enough (%llu)",
+ (unsigned long long)count);
+ reiser4_release_reserved(super);
+ return RETERR(-ENOSPC);
+ }
+ }
+ return 0;
+}
+
+void reiser4_release_reserved(struct super_block *super)
+{
+ reiser4_super_info_data *info;
+
+ info = get_super_private(super);
+ if (info->delete_mutex_owner == current) {
+ info->delete_mutex_owner = NULL;
+ mutex_unlock(&info->delete_mutex);
+ }
+}
+
+static reiser4_super_info_data *grabbed2fake_allocated_head(int count,
+ reiser4_subvol *subv)
+{
+ reiser4_context *ctx;
+ reiser4_super_info_data *sbinfo;
+
+ ctx = get_current_context();
+
+ sub_from_ctx_grabbed(ctx, count, subv);
+
+ sbinfo = get_super_private(ctx->super);
+ spin_lock_reiser4_super(sbinfo);
+
+ sub_from_subvol_grabbed(subv, count);
+ /* return sbinfo locked */
+ return sbinfo;
+}
+
+/* is called after @count fake block numbers are allocated and pointers to
+ those blocks are inserted into the tree. */
+static void grabbed2fake_allocated_formatted(reiser4_subvol *subv)
+{
+ reiser4_super_info_data *sbinfo;
+
+ sbinfo = grabbed2fake_allocated_head(1, subv);
+ subv->blocks_fake_allocated++;
+
+ assert("vs-922", subvol_check_block_counters(subv));
+
+ spin_unlock_reiser4_super(sbinfo);
+}
+
+/**
+ * grabbed2fake_allocated_unformatted
+ * @count: number of blocks to move from grabbed to fake allocated unformatted
+ *
+ */
+static void grabbed2fake_allocated_unformatted(int count, reiser4_subvol *subv)
+{
+ reiser4_super_info_data *sbinfo;
+
+ sbinfo = grabbed2fake_allocated_head(count, subv);
+ subv->blocks_fake_allocated_unformatted += count;
+
+ assert("vs-9221", subvol_check_block_counters(subv));
+
+ spin_unlock_reiser4_super(sbinfo);
+}
+
+void grabbed2cluster_reserved(int count, reiser4_subvol *subv)
+{
+ reiser4_context *ctx;
+ reiser4_super_info_data *sbinfo;
+
+ ctx = get_current_context();
+
+ sub_from_ctx_grabbed(ctx, count, subv);
+
+ sbinfo = get_super_private(ctx->super);
+ spin_lock_reiser4_super(sbinfo);
+
+ sub_from_subvol_grabbed(subv, count);
+ subv->blocks_clustered += count;
+
+ assert("edward-504", subvol_check_block_counters(subv));
+
+ spin_unlock_reiser4_super(sbinfo);
+}
+
+void cluster_reserved2grabbed(int count, reiser4_subvol *subv)
+{
+ reiser4_context *ctx;
+ reiser4_super_info_data *sbinfo;
+
+ ctx = get_current_context();
+
+ sbinfo = get_super_private(ctx->super);
+ spin_lock_reiser4_super(sbinfo);
+
+ sub_from_cluster_reserved(subv, count);
+ subv->blocks_grabbed += count;
+
+ assert("edward-505", subvol_check_block_counters(subv));
+
+ spin_unlock_reiser4_super(sbinfo);
+ add_to_ctx_grabbed(ctx, count, subv->id);
+}
+
+void cluster_reserved2free(int count, reiser4_subvol *subv)
+{
+ reiser4_context *ctx;
+ reiser4_super_info_data *sbinfo;
+
+ ctx = get_current_context();
+ sbinfo = get_super_private(ctx->super);
+
+ cluster_reserved2grabbed(count, subv);
+ grabbed2free(ctx, sbinfo, count, subv);
+}
+
+/*
+ * FIXME-EDWARD: This is per-subvolume thing
+ */
+static DEFINE_SPINLOCK(fake_lock);
+static reiser4_block_nr fake_gen = 0;
+
+/**
+ * assign_fake_blocknr
+ * @blocknr: where to store the first fake block number obtained
+ * @count: number of fake block numbers to reserve
+ *
+ * Obtain a fake block number for new node which will be used to refer to
+ * this newly allocated node until real allocation is done.
+ */
+static void assign_fake_blocknr(reiser4_block_nr *blocknr, int count,
+ reiser4_subvol *subv)
+{
+ spin_lock(&fake_lock);
+ *blocknr = fake_gen;
+ fake_gen += count;
+ spin_unlock(&fake_lock);
+
+ BUG_ON(*blocknr & REISER4_BLOCKNR_STATUS_BIT_MASK);
+ /**blocknr &= ~REISER4_BLOCKNR_STATUS_BIT_MASK;*/
+ *blocknr |= REISER4_UNALLOCATED_STATUS_VALUE;
+ assert("zam-394", zlook(&subv->tree, blocknr) == NULL);
+}
+
+int assign_fake_blocknr_formatted(reiser4_block_nr *blocknr,
+ reiser4_subvol *subv)
+{
+ assign_fake_blocknr(blocknr, 1, subv);
+ grabbed2fake_allocated_formatted(subv);
+ return 0;
+}
+
+/**
+ * fake_blocknr_unformatted
+ * @count: number of fake numbers to get
+ *
+ * Allocates @count fake block numbers which will be assigned to jnodes
+ */
+reiser4_block_nr fake_blocknr_unformatted(int count, reiser4_subvol *subv)
+{
+ reiser4_block_nr blocknr;
+
+ assign_fake_blocknr(&blocknr, count, subv);
+ grabbed2fake_allocated_unformatted(count, subv);
+
+ return blocknr;
+}
+
+/*
+ * adjust sb block counters, if real (on-disk) block allocation immediately
+ * follows grabbing of free disk space
+ */
+static void grabbed2used(reiser4_context *ctx, reiser4_super_info_data *sbinfo,
+ __u64 count, reiser4_subvol *subv)
+{
+ sub_from_ctx_grabbed(ctx, count, subv);
+
+ spin_lock_reiser4_super(sbinfo);
+
+ sub_from_subvol_grabbed(subv, count);
+ subv->blocks_used += count;
+
+ assert("nikita-2679", subvol_check_block_counters(subv));
+
+ spin_unlock_reiser4_super(sbinfo);
+}
+
+/*
+ * adjust sb block counters when @count unallocated blocks get mapped to disk
+ */
+static void fake_allocated2used(reiser4_super_info_data *sbinfo, __u64 count,
+ reiser4_ba_flags_t flags, reiser4_subvol *subv)
+{
+ spin_lock_reiser4_super(sbinfo);
+
+ sub_from_subvol_fake_allocated(subv, count, flags);
+ subv->blocks_used += count;
+
+ assert("nikita-2680", subvol_check_block_counters(subv));
+
+ spin_unlock_reiser4_super(sbinfo);
+}
+
+static void flush_reserved2used(txn_atom *atom, u64 count, reiser4_subvol *subv)
+{
+ reiser4_super_info_data *sbinfo;
+ struct atom_brick_info *abi;
+
+ assert("zam-787", atom != NULL);
+ assert_spin_locked(&(atom->alock));
+
+ abi = find_atom_brick_info(&atom->bricks_info, subv->id);
+
+ assert("edward-1993", abi != NULL);
+ assert("edward-1994", abi->brick_id == subv->id);
+ assert("edward-1995",
+ abi->atom_flush_reserved <= subv->blocks_flush_reserved);
+
+ sub_from_abi_flush_reserved(abi, count);
+ sbinfo = get_current_super_private();
+ spin_lock_reiser4_super(sbinfo);
+
+ sub_from_subvol_flush_reserved(subv, count);
+
+ assert("edward-1996",
+ abi->atom_flush_reserved <= subv->blocks_flush_reserved);
+
+ subv->blocks_used += count;
+
+ assert("zam-789", subvol_check_block_counters(subv));
+
+ spin_unlock_reiser4_super(sbinfo);
+}
+
+/* update the per fs blocknr hint default value. */
+void update_blocknr_hint_default(const struct super_block *s,
+ struct reiser4_subvol *subv,
+ const reiser4_block_nr *block)
+{
+ reiser4_super_info_data *sbinfo = get_super_private(s);
+
+ assert("nikita-3342", !reiser4_blocknr_is_fake(block));
+
+ spin_lock_reiser4_super(sbinfo);
+ if (*block < subv->block_count) {
+ subv->blocknr_hint_default = *block;
+ } else {
+ warning("zam-676",
+ "block number %llu is too large to be used in a blocknr hint\n",
+ (unsigned long long)*block);
+ dump_stack();
+ DEBUGON(1);
+ }
+ spin_unlock_reiser4_super(sbinfo);
+}
+
+/* get current value of the default blocknr hint. */
+void get_blocknr_hint_default(reiser4_block_nr *result, reiser4_subvol *subv)
+{
+ reiser4_super_info_data *sbinfo = get_current_super_private();
+
+ spin_lock_reiser4_super(sbinfo);
+ *result = subv->blocknr_hint_default;
+ assert("zam-677", *result < subv->block_count);
+ spin_unlock_reiser4_super(sbinfo);
+}
+
+/* Allocate "real" disk blocks by calling a proper space allocation plugin
+ * method. Blocks are allocated in one contiguous disk region. The plugin
+ * independent part accounts blocks by subtracting allocated amount from grabbed
+ * or fake block counter and adds the same amount to the counter of allocated
+ * blocks.
+ *
+ * @hint -- a reiser4 blocknr hint object which contains further block
+ * allocation hints and parameters (search start, a stage of block
+ * which will be mapped to disk, etc.),
+ * @blk -- an out parameter for the beginning of the allocated region,
+ * @len -- in/out parameter; on entry it should contain the maximum number of
+ * blocks to allocate, and after block allocation completes it contains the
+ * length of the allocated disk region.
+ * @flags -- see reiser4_ba_flags_t description.
+ *
+ * @return -- 0 if success, error code otherwise.
+ */
+int reiser4_alloc_blocks(reiser4_blocknr_hint *hint, reiser4_block_nr *blk,
+ reiser4_block_nr *len, reiser4_ba_flags_t flags,
+ reiser4_subvol *subv)
+{
+ __u64 needed = *len;
+ reiser4_context *ctx;
+ reiser4_super_info_data *sbinfo;
+ int ret;
+
+ assert("zam-986", hint != NULL);
+ assert("edward-1776", subv != NULL);
+
+ ctx = get_current_context();
+ sbinfo = get_super_private(ctx->super);
+ /*
+ * For write-optimized data we use default search start value, which is
+ * close to last write location
+ */
+ if (flags & BA_USE_DEFAULT_SEARCH_START)
+ get_blocknr_hint_default(&hint->blk, subv);
+ /*
+ * VITALY: allocator should grab this for internal/tx-lists/similar only
+ */
+ if (hint->block_stage == BLOCK_NOT_COUNTED) {
+ ret = reiser4_grab_space_force(*len, flags, subv);
+ if (ret != 0)
+ return ret;
+ }
+ ret = sa_alloc_blocks(reiser4_get_space_allocator(subv), hint,
+ (int)needed, blk, len, subv);
+ if (!ret) {
+ assert("zam-680",
+ *blk < reiser4_subvol_block_count(subv));
+ assert("zam-681",
+ *blk + *len <= reiser4_subvol_block_count(subv));
+
+ if (flags & BA_PERMANENT) {
+ /*
+ * we assume that current atom exists at this moment
+ */
+ struct atom_brick_info *abi;
+ txn_atom *atom = get_current_atom_locked();
+
+ ret = __check_insert_atom_brick_info(&atom,
+ subv->id, &abi);
+ if (ret)
+ return ret;
+ abi->nr_blocks_allocated += *len;
+ spin_unlock_atom(atom);
+ }
+ switch (hint->block_stage) {
+ case BLOCK_NOT_COUNTED:
+ case BLOCK_GRABBED:
+ grabbed2used(ctx, sbinfo, *len, subv);
+ break;
+ case BLOCK_UNALLOCATED:
+ fake_allocated2used(sbinfo, *len, flags, subv);
+ break;
+ case BLOCK_FLUSH_RESERVED:
+ {
+ txn_atom *atom = get_current_atom_locked();
+ flush_reserved2used(atom, *len, subv);
+ spin_unlock_atom(atom);
+ }
+ break;
+ default:
+ impossible("zam-531", "wrong block stage");
+ }
+ } else {
+ assert("zam-821",
+ ergo(hint->max_dist == 0
+ && !hint->backward, ret != -ENOSPC));
+ if (hint->block_stage == BLOCK_NOT_COUNTED)
+ grabbed2free(ctx, sbinfo, needed, subv);
+ }
+
+ return ret;
+}
+
+/**
+ * ask block allocator for some unformatted blocks
+ */
+void allocate_blocks_unformatted(reiser4_blocknr_hint *preceder,
+ reiser4_block_nr wanted_count,
+ reiser4_block_nr *first_allocated,
+ reiser4_block_nr *allocated,
+ block_stage_t block_stage,
+ reiser4_subvol *subv)
+{
+ *allocated = wanted_count;
+ preceder->max_dist = 0; /* scan whole disk, if needed */
+
+ /* that number of blocks (wanted_count) is either in UNALLOCATED or in GRABBED */
+ preceder->block_stage = block_stage;
+
+ /* FIXME: we do not handle errors here now */
+ check_me("vs-420", reiser4_alloc_blocks(preceder, first_allocated,
+ allocated, BA_PERMANENT,
+ subv) == 0);
+ /* update flush_pos's preceder to last allocated block number */
+ preceder->blk = *first_allocated + *allocated - 1;
+}
+
+/* used -> fake_allocated -> grabbed -> free */
+
+/*
+ * adjust block counters when @count unallocated blocks get unmapped from disk
+ */
+static void used2fake_allocated(reiser4_super_info_data *sbinfo, __u64 count,
+ int formatted, reiser4_subvol *subv)
+{
+ spin_lock_reiser4_super(sbinfo);
+
+ if (formatted)
+ subv->blocks_fake_allocated += count;
+ else
+ subv->blocks_fake_allocated_unformatted += count;
+
+ sub_from_subvol_used(subv, count);
+
+ assert("nikita-2681", subvol_check_block_counters(subv));
+
+ spin_unlock_reiser4_super(sbinfo);
+}
+
+static void used2flush_reserved(reiser4_super_info_data *sbinfo, txn_atom *atom,
+ __u64 count, reiser4_ba_flags_t flags,
+ reiser4_subvol *subv)
+{
+ assert("nikita-2791", atom != NULL);
+ assert_spin_locked(&(atom->alock));
+
+ add_to_atom_flush_reserved(atom, (__u32)count, subv->id);
+ spin_lock_reiser4_super(sbinfo);
+
+ subv->blocks_flush_reserved += count;
+ sub_from_subvol_used(subv, count);
+
+ assert("nikita-2681", subvol_check_block_counters(subv));
+
+ spin_unlock_reiser4_super(sbinfo);
+}
+
+/*
+ * disk space virtually used by fake block numbers is counted as "grabbed" again
+ */
+static void fake_allocated2grabbed(reiser4_context *ctx,
+ reiser4_super_info_data *sbinfo, __u64 count,
+ reiser4_ba_flags_t flags,
+ reiser4_subvol *subv)
+{
+ add_to_ctx_grabbed(ctx, count, subv->id);
+
+ spin_lock_reiser4_super(sbinfo);
+
+ assert("nikita-2682", subvol_check_block_counters(subv));
+
+ subv->blocks_grabbed += count;
+ sub_from_subvol_fake_allocated(subv, count, flags & BA_FORMATTED);
+
+ assert("nikita-2683", subvol_check_block_counters(subv));
+
+ spin_unlock_reiser4_super(sbinfo);
+}
+
+void fake_allocated2free(__u64 count, reiser4_ba_flags_t flags,
+ reiser4_subvol *subv)
+{
+ reiser4_context *ctx;
+ reiser4_super_info_data *sbinfo;
+
+ ctx = get_current_context();
+ sbinfo = get_super_private(ctx->super);
+
+ fake_allocated2grabbed(ctx, sbinfo, count, flags, subv);
+ grabbed2free(ctx, sbinfo, count, subv);
+}
+
+void grabbed2free_mark(__u64 mark, reiser4_subvol *subv)
+{
+ reiser4_context *ctx;
+ reiser4_super_info_data *sbinfo;
+ u64 ctx_grabbed;
+
+ ctx = get_current_context();
+ sbinfo = get_super_private(ctx->super);
+ ctx_grabbed = ctx_subvol_grabbed(ctx, subv->id);
+
+ assert("nikita-3007", (__s64) mark >= 0);
+ assert("nikita-3006", ctx_grabbed >= mark);
+
+ grabbed2free(ctx, sbinfo, ctx_grabbed - mark, subv);
+}
+
+void __grabbed2free(struct ctx_brick_info *cbi, reiser4_super_info_data *sbinfo,
+ __u64 count, reiser4_subvol *subv)
+{
+ assert("edward-1977", cbi != NULL);
+
+ sub_from_cbi_grabbed(cbi, count);
+
+ spin_lock_reiser4_super(sbinfo);
+ sub_from_subvol_grabbed(subv, count);
+ subv->blocks_free += count;
+ assert("nikita-2684", subvol_check_block_counters(subv));
+ spin_unlock_reiser4_super(sbinfo);
+}
+
+/**
+ * grabbed2free - adjust grabbed and free block counters
+ * @ctx: context to update grabbed block counter of
+ * @sbinfo: super block to update grabbed and free block counters of
+ * @count: number of blocks to adjust counters by
+ *
+ * Decreases context's and per filesystem's counters of grabbed
+ * blocks. Increases per filesystem's counter of free blocks.
+ */
+void grabbed2free(reiser4_context *ctx, reiser4_super_info_data *sbinfo,
+ __u64 count, reiser4_subvol *subv)
+{
+ if (count != 0) {
+ struct ctx_brick_info *cbi;
+
+ cbi = find_context_brick_info(ctx, subv->id);
+
+ assert("edward-1997", cbi != NULL);
+
+ __grabbed2free(cbi, sbinfo, count, subv);
+ }
+}
+
+void grabbed2flush_reserved_nolock(txn_atom *atom,
+ __u64 count, reiser4_subvol *subv)
+{
+ reiser4_context *ctx;
+ reiser4_super_info_data *sbinfo;
+
+ assert("vs-1095", atom);
+
+ ctx = get_current_context();
+
+ sbinfo = get_super_private(ctx->super);
+
+ sub_from_ctx_grabbed(ctx, count, subv);
+
+ add_to_atom_flush_reserved(atom, count, subv->id);
+
+ spin_lock_reiser4_super(sbinfo);
+
+ subv->blocks_flush_reserved += count;
+ sub_from_subvol_grabbed(subv, count);
+
+ assert("vpf-292", subvol_check_block_counters(subv));
+
+ spin_unlock_reiser4_super(sbinfo);
+}
+
+void grabbed2flush_reserved(__u64 count, reiser4_subvol *subv)
+{
+ txn_atom *atom = get_current_atom_locked();
+
+ grabbed2flush_reserved_nolock(atom, count, subv);
+
+ spin_unlock_atom(atom);
+}
+
+void flush_reserved2grabbed(struct atom_brick_info *abi,
+ struct ctx_brick_info *cbi,
+ u64 count, reiser4_subvol *subv)
+{
+ reiser4_super_info_data *sbinfo;
+
+ assert("edward-1998", abi != NULL);
+ assert("edward-1999", cbi != NULL);
+ assert("edward-2000", abi->brick_id == subv->id);
+ assert("edward-2001", cbi->brick_id == subv->id);
+
+ sbinfo = get_current_super_private();
+
+ add_to_cbi_grabbed(cbi, count);
+
+ sub_from_abi_flush_reserved(abi, count);
+
+ spin_lock_reiser4_super(sbinfo);
+
+ sub_from_subvol_flush_reserved(subv, count);
+
+ subv->blocks_grabbed += count;
+
+ assert("vpf-292", subvol_check_block_counters(subv));
+
+ spin_unlock_reiser4_super(sbinfo);
+}
+
+u64 all_flush_reserved2grabbed(txn_atom *atom)
+{
+ struct rb_root *root;
+ struct rb_node *node;
+ u64 flush_reserved = 0;
+ reiser4_context *ctx = get_current_context();
+
+ spin_lock_atom(atom);
+
+ root = &atom->bricks_info;
+
+ check_atom_flush_reserved(atom);
+
+ for (node = rb_first(root); node; node = rb_next(node)) {
+ struct atom_brick_info *abi;
+ struct ctx_brick_info *cbi;
+ reiser4_subvol *subv;
+
+ abi = rb_entry(node, struct atom_brick_info, node);
+ subv = current_origin(abi->brick_id);
+ /*
+ * make sure that respective context brick info exists
+ */
+ cbi = find_context_brick_info(ctx, abi->brick_id);
+ if (cbi == NULL) {
+#if 0
+ warning("edward-2002",
+ "Context info for brick %d not found.",
+ abi->brick_id);
+#endif
+ spin_unlock_atom(atom);
+ cbi = alloc_context_brick_info();
+ if (!cbi)
+ return -ENOMEM;
+ init_context_brick_info(cbi, abi->brick_id);
+ insert_context_brick_info(ctx, cbi);
+ spin_lock_atom(atom);
+ }
+ flush_reserved += abi->atom_flush_reserved;
+ flush_reserved2grabbed(abi, cbi,
+ abi->atom_flush_reserved, subv);
+ }
+ check_atom_flush_reserved(atom);
+
+ spin_unlock_atom(atom);
+ return flush_reserved;
+}
+
+/**
+ * all_grabbed2free - releases all blocks grabbed in context
+ *
+ * Decreases context's and super block's grabbed block counters by number of
+ * blocks grabbed by current context and increases super block's free block
+ * counter correspondingly.
+ */
+void all_grabbed2free(void)
+{
+ reiser4_context *ctx = get_current_context();
+ struct rb_root *root = &ctx->bricks_info;
+ struct rb_node *node;
+
+ if (!get_current_super_private() || !current_lv_conf())
+ /* volume hasn't been activated */
+ return;
+
+ for (node = rb_first(root); node; node = rb_next(node)) {
+ struct ctx_brick_info *cbi;
+
+ cbi = rb_entry(node, struct ctx_brick_info, node);
+ if (cbi->grabbed_blocks)
+ __grabbed2free(cbi,
+ get_super_private(ctx->super),
+ cbi->grabbed_blocks,
+ current_origin(cbi->brick_id));
+ }
+}
+
+/*
+ * adjust sb block counters if real (on-disk) blocks do not become unallocated
+ * after freeing; @count blocks become "grabbed" instead
+ */
+static void used2grabbed(reiser4_context *ctx, reiser4_super_info_data *sbinfo,
+ __u64 count, reiser4_subvol *subv)
+{
+ add_to_ctx_grabbed(ctx, count, subv->id);
+
+ spin_lock_reiser4_super(sbinfo);
+
+ subv->blocks_grabbed += count;
+ sub_from_subvol_used(subv, count);
+
+ assert("nikita-2685", subvol_check_block_counters(subv));
+
+ spin_unlock_reiser4_super(sbinfo);
+}
+
+/*
+ * this used to be done through used2grabbed and grabbed2free
+ */
+static void used2free(reiser4_super_info_data *sbinfo,
+ __u64 count, reiser4_subvol *subv)
+{
+ spin_lock_reiser4_super(sbinfo);
+
+ subv->blocks_free += count;
+ sub_from_subvol_used(subv, count);
+
+ assert("nikita-2685", subvol_check_block_counters(subv));
+
+ spin_unlock_reiser4_super(sbinfo);
+}
+
+/*
+ * check "allocated" state of given block range
+ */
+int reiser4_check_blocks(const reiser4_block_nr *start,
+ const reiser4_block_nr *len, int desired,
+ reiser4_subvol *subv)
+{
+ return sa_check_blocks(start, len, desired, subv);
+}
+
+/* The block deallocation function may do an actual deallocation through the
+ space allocator plugin or store deleted block numbers in the atom's delete_set
+ data structure, depending on the @defer parameter. */
+
+/* if BA_DEFER bit is not turned on, @target_stage means the stage of blocks
+ which will be deleted from WORKING bitmap. They might be just unmapped from
+ disk, or freed but disk space is still grabbed by current thread, or these
+ blocks must not be counted in any reiser4 sb block counters,
+ see block_stage_t comment */
+
+/* BA_FORMATTED bit is only used when BA_DEFER is not present: it is used to
+ distinguish blocks allocated for unformatted and formatted nodes */
+
+/**
+ * Deallocate an extent of @len blocks with the beginning at @start
+ * on the block device associated with @subv.
+ */
+int reiser4_dealloc_blocks(const reiser4_block_nr *start,
+ const reiser4_block_nr *len,
+ block_stage_t target_stage,
+ reiser4_ba_flags_t flags, reiser4_subvol *subv)
+{
+ txn_atom *atom = NULL;
+ int ret;
+ reiser4_context *ctx;
+ reiser4_super_info_data *sbinfo;
+ void *new_entry = NULL;
+
+ ctx = get_current_context();
+ sbinfo = get_super_private(ctx->super);
+
+ if (REISER4_DEBUG) {
+ assert("zam-431", *len != 0);
+ assert("zam-432", *start != 0);
+ assert("zam-558", !reiser4_blocknr_is_fake(start));
+
+ spin_lock_reiser4_super(sbinfo);
+ assert("zam-562", *start < reiser4_subvol_block_count(subv));
+ spin_unlock_reiser4_super(sbinfo);
+ }
+
+ if (flags & BA_DEFER) {
+ /*
+ * These blocks will be later deallocated by apply_dset().
+ * It is equivalent to a non-deferred deallocation with target
+ * stage BLOCK_NOT_COUNTED.
+ */
+
+ /* store deleted block numbers in the atom's deferred delete set
+ for further actual deletion */
+ do {
+ struct atom_brick_info *abi;
+
+ atom = get_current_atom_locked();
+ assert("zam-430", atom != NULL);
+
+ ret = __check_insert_atom_brick_info(&atom,
+ subv->id,
+ &abi);
+ if (ret)
+ return ret;
+ ret = atom_dset_deferred_add_extent(atom,
+ &new_entry,
+ start, len,
+ subv->id);
+
+ if (ret == -ENOMEM)
+ return ret;
+
+ /* This loop might spin at most two times */
+ } while (ret == -E_REPEAT);
+
+ assert("zam-477", ret == 0);
+ assert("zam-433", atom != NULL);
+
+ spin_unlock_atom(atom);
+
+ } else {
+ assert("zam-425", get_current_super_private() != NULL);
+ sa_dealloc_blocks(reiser4_get_space_allocator(subv),
+ *start, *len, subv);
+
+ if (flags & BA_PERMANENT) {
+ /*
+ * These blocks were counted as allocated, we have
+ * to revert it back if allocation is discarded.
+ */
+ txn_atom *atom = get_current_atom_locked();
+ struct atom_brick_info *abi;
+
+ abi = find_atom_brick_info(&atom->bricks_info, subv->id);
+ /*
+ * has to be found as we inserted that
+ * item, see reiser4_alloc_blocks(),
+ * case (flags & BA_PERMANENT)
+ */
+ assert("edward-2003", abi != NULL);
+ assert("edward-2004", abi->nr_blocks_allocated >= *len);
+
+ abi->nr_blocks_allocated -= *len;
+ spin_unlock_atom(atom);
+ }
+
+ switch (target_stage) {
+ case BLOCK_NOT_COUNTED:
+ assert("vs-960", flags & BA_FORMATTED);
+ /* VITALY: This is what was grabbed for
+ internal/tx-lists/similar only */
+ used2free(sbinfo, *len, subv);
+ break;
+
+ case BLOCK_GRABBED:
+ used2grabbed(ctx, sbinfo, *len, subv);
+ break;
+
+ case BLOCK_UNALLOCATED:
+ used2fake_allocated(sbinfo,
+ *len, flags & BA_FORMATTED, subv);
+ break;
+
+ case BLOCK_FLUSH_RESERVED:{
+ txn_atom *atom;
+
+ atom = get_current_atom_locked();
+ used2flush_reserved(sbinfo, atom, *len,
+ flags & BA_FORMATTED, subv);
+ spin_unlock_atom(atom);
+ break;
+ }
+ default:
+ impossible("zam-532", "wrong block stage");
+ }
+ }
+
+ return 0;
+}
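+
+/*
+ * A minimal usage sketch (an assumption, not copied from an actual caller;
+ * ret, start, len and subv are supplied by the caller): defer the actual
+ * deallocation of an extent until the atom commits. With BA_DEFER the blocks
+ * are recorded in the atom's deferred delete set and the target stage is
+ * effectively BLOCK_NOT_COUNTED.
+ *
+ *	ret = reiser4_dealloc_blocks(&start, &len, BLOCK_NOT_COUNTED,
+ *				     BA_DEFER, subv);
+ */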
+
+/* wrappers for block allocator plugin methods */
+int reiser4_pre_commit_hook(void)
+{
+ assert("zam-502", get_current_super_private() != NULL);
+ sa_pre_commit_hook();
+ return 0;
+}
+
+/**
+ * an actor which applies a single delete set entry to block allocator
+ * data (actually to working bitmap)
+ */
+static int apply_dset(txn_atom *atom UNUSED_ARG, const reiser4_block_nr *a,
+ const reiser4_block_nr *b, __u32 subv_id,
+ void *data UNUSED_ARG)
+{
+ __u64 len = 1;
+ reiser4_subvol *subv;
+ reiser4_super_info_data *sbinfo;
+
+ sbinfo = get_current_super_private();
+ assert("zam-552", sbinfo != NULL);
+
+ subv = current_origin(subv_id);
+
+ assert("zam-877", atom->stage >= ASTAGE_PRE_COMMIT);
+
+ if (b != NULL)
+ len = *b;
+
+ if (REISER4_DEBUG) {
+ spin_lock_reiser4_super(sbinfo);
+
+ assert("zam-554", *a < reiser4_subvol_block_count(subv));
+ assert("zam-555", *a + len <= reiser4_subvol_block_count(subv));
+
+ spin_unlock_reiser4_super(sbinfo);
+ }
+ sa_dealloc_blocks(&subv->space_allocator, *a, len, subv);
+ used2free(sbinfo, len, subv);
+ return 0;
+}
+
+void reiser4_post_commit_hook(void)
+{
+#if REISER4_DEBUG
+ txn_atom *atom;
+
+ atom = get_current_atom_locked();
+ assert("zam-452", atom->stage == ASTAGE_POST_COMMIT);
+ spin_unlock_atom(atom);
+#endif
+
+ assert("zam-504", get_current_super_private() != NULL);
+ sa_post_commit_hook();
+}
+
+void reiser4_post_write_back_hook(void)
+{
+ struct list_head discarded_set;
+ txn_atom *atom;
+ int ret;
+
+ /* process and issue discard requests */
+ blocknr_list_init (&discarded_set);
+ do {
+ atom = get_current_atom_locked();
+ ret = discard_atom(atom, &discarded_set);
+ } while (ret == -E_REPEAT);
+
+ if (ret) {
+ warning("intelfx-8", "discard atom failed (%d)", ret);
+ }
+
+ atom = get_current_atom_locked();
+ discard_atom_post(atom, &discarded_set);
+
+ /* do the block deallocation which was deferred
+ until commit is done */
+ atom_dset_deferred_apply(atom, apply_dset, NULL, 1);
+
+ assert("zam-504", get_current_super_private() != NULL);
+ sa_post_write_back_hook();
+}
+
+/*
+ Local variables:
+ c-indentation-style: "K&R"
+ mode-name: "LC"
+ c-basic-offset: 8
+ tab-width: 8
+ fill-column: 120
+ scroll-step: 1
+ End:
+*/
diff -urN --no-dereference linux-5.10.2.orig/fs/reiser4/block_alloc.h linux-5.10.2/fs/reiser4/block_alloc.h
--- linux-5.10.2.orig/fs/reiser4/block_alloc.h 1970-01-01 01:00:00.000000000 +0100
+++ linux-5.10.2/fs/reiser4/block_alloc.h 2020-12-23 16:07:46.113813056 +0100
@@ -0,0 +1,195 @@
+/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
+
+#if !defined(__FS_REISER4_BLOCK_ALLOC_H__)
+#define __FS_REISER4_BLOCK_ALLOC_H__
+
+#include "dformat.h"
+#include "forward.h"
+
+#include <linux/types.h> /* for __u?? */
+#include <linux/fs.h>
+
+/* Mask which, when applied to a given block number, shows whether that block
+ number is a fake one */
+#define REISER4_FAKE_BLOCKNR_BIT_MASK 0x8000000000000000ULL
+/* Mask which isolates the type of object this fake block number was assigned
+ to */
+#define REISER4_BLOCKNR_STATUS_BIT_MASK 0xC000000000000000ULL
+
+/* The result of applying REISER4_BLOCKNR_STATUS_BIT_MASK should be compared
+ against these two values to tell whether the object is unallocated or a
+ bitmap shadow object (a WORKING BITMAP block, see plugin/space/bitmap.c) */
+#define REISER4_UNALLOCATED_STATUS_VALUE 0xC000000000000000ULL
+#define REISER4_BITMAP_BLOCKS_STATUS_VALUE 0x8000000000000000ULL
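+
+/* Illustration: a fake block number of an unallocated node, e.g.
+ 0xC000000000000123ULL, satisfies
+ (blk & REISER4_BLOCKNR_STATUS_BIT_MASK) == REISER4_UNALLOCATED_STATUS_VALUE,
+ while a bitmap shadow block, e.g. 0x8000000000000123ULL, satisfies
+ (blk & REISER4_BLOCKNR_STATUS_BIT_MASK) == REISER4_BITMAP_BLOCKS_STATUS_VALUE */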
+
+/* specification of how block allocation was counted in sb block counters */
+typedef enum {
+ BLOCK_NOT_COUNTED = 0, /* reiser4 has no info about this block yet */
+ BLOCK_GRABBED = 1, /* free space grabbed for further allocation
+ of this block */
+ BLOCK_FLUSH_RESERVED = 2, /* block is reserved for flush needs. */
+ BLOCK_UNALLOCATED = 3, /* block is used for existing in-memory object
+ ( unallocated formatted or unformatted
+ node) */
+ BLOCK_ALLOCATED = 4 /* block is mapped to disk, real on-disk block
+ number assigned */
+} block_stage_t;
+
+/* a hint for block allocator */
+struct reiser4_blocknr_hint {
+ /* FIXME: I think we want to add a longterm lock on the bitmap block
+ here. This is to prevent jnode_flush() calls from interleaving
+ allocations on the same bitmap, once a hint is established. */
+
+ /* search start hint */
+ reiser4_block_nr blk;
+ /* if not zero, it is a region size we search for free blocks in */
+ reiser4_block_nr max_dist;
+ /* level for allocation; it may be useful to have branch-level and
+ higher levels write-optimized. */
+ tree_level level;
+ /* block allocator assumes that blocks, which will be mapped to disk,
+ are in this specified block_stage */
+ block_stage_t block_stage;
+ /* If backward = 1, allocate blocks in backward direction, from the end
+ * of the disk to the beginning of the disk. */
+ unsigned int backward:1;
+
+};
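+
+/*
+ * A minimal sketch of how a hint is typically used together with
+ * reiser4_alloc_block() (an assumption about the call sequence; ret and
+ * subvol come from the caller, which is expected to have grabbed space
+ * beforehand):
+ *
+ *	reiser4_blocknr_hint hint;
+ *	reiser4_block_nr start;
+ *
+ *	reiser4_blocknr_hint_init(&hint);
+ *	hint.block_stage = BLOCK_GRABBED;
+ *	ret = reiser4_alloc_block(&hint, &start, 0, subvol);
+ *	reiser4_blocknr_hint_done(&hint);
+ */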
+
+/* These flags control block allocation/deallocation behavior */
+enum reiser4_ba_flags {
+ /* do allocations from the reserved (5%) area */
+ BA_RESERVED = (1 << 0),
+
+ /* block allocator can do commit trying to recover free space */
+ BA_CAN_COMMIT = (1 << 1),
+
+ /* if operation will be applied to formatted block */
+ BA_FORMATTED = (1 << 2),
+
+ /* defer actual block freeing until transaction commit */
+ BA_DEFER = (1 << 3),
+
+ /* allocate blocks for permanent fs objects (formatted or unformatted),
+ not wandered or log blocks */
+ BA_PERMANENT = (1 << 4),
+
+ /* grab space even if it was disabled */
+ BA_FORCE = (1 << 5),
+
+ /* use default start value for free blocks search. */
+ BA_USE_DEFAULT_SEARCH_START = (1 << 6)
+};
+
+typedef enum reiser4_ba_flags reiser4_ba_flags_t;
+
+extern void reiser4_blocknr_hint_init(reiser4_blocknr_hint * hint);
+extern void reiser4_blocknr_hint_done(reiser4_blocknr_hint * hint);
+extern void update_blocknr_hint_default(const struct super_block *,
+ struct reiser4_subvol *,
+ const reiser4_block_nr *);
+extern void get_blocknr_hint_default(reiser4_block_nr *, reiser4_subvol *);
+
+extern reiser4_block_nr reiser4_fs_reserved_space(struct super_block *super);
+
+int assign_fake_blocknr_formatted(reiser4_block_nr *, reiser4_subvol *subv);
+reiser4_block_nr fake_blocknr_unformatted(int, reiser4_subvol *subv);
+
+/* free -> grabbed -> fake_allocated -> used */
+
+int reiser4_grab_space(__u64 count, reiser4_ba_flags_t flags, reiser4_subvol *);
+void all_grabbed2free(void);
+void __grabbed2free(struct ctx_brick_info *cbi, reiser4_super_info_data *sbinfo,
+ __u64 count, reiser4_subvol *subv);
+void grabbed2free(reiser4_context *, reiser4_super_info_data *, __u64 count,
+ reiser4_subvol *);
+void fake_allocated2free(__u64 count,
+ reiser4_ba_flags_t flags, reiser4_subvol *);
+void grabbed2flush_reserved_nolock(txn_atom * atom, __u64 count,
+ reiser4_subvol *);
+void grabbed2flush_reserved(__u64 count, reiser4_subvol *subv);
+__u64 ctx_subvol_grabbed(reiser4_context *ctx, __u32 subvol_id);
+int reiser4_alloc_blocks(reiser4_blocknr_hint *hint,
+ reiser4_block_nr *start, reiser4_block_nr *len,
+ reiser4_ba_flags_t flags, reiser4_subvol *subvol);
+int reiser4_dealloc_blocks(const reiser4_block_nr *,
+ const reiser4_block_nr *,
+ block_stage_t, reiser4_ba_flags_t flags,
+ reiser4_subvol *);
+
+static inline int reiser4_alloc_block(reiser4_blocknr_hint *hint,
+ reiser4_block_nr *start,
+ reiser4_ba_flags_t flags,
+ reiser4_subvol *subvol)
+{
+ reiser4_block_nr one = 1;
+ return reiser4_alloc_blocks(hint, start, &one, flags, subvol);
+}
+
+static inline int reiser4_dealloc_block(const reiser4_block_nr * block,
+ block_stage_t stage,
+ reiser4_ba_flags_t flags,
+ reiser4_subvol *subv)
+{
+ const reiser4_block_nr one = 1;
+ return reiser4_dealloc_blocks(block, &one, stage, flags, subv);
+}
+#define reiser4_grab_space_force(count, flags, subvol) \
+ reiser4_grab_space(count, flags | BA_FORCE, subvol)
+
+extern void allocate_blocks_unformatted(reiser4_blocknr_hint *preceder,
+ reiser4_block_nr wanted_count,
+ reiser4_block_nr *first_allocated,
+ reiser4_block_nr *allocated,
+ block_stage_t block_stage,
+ reiser4_subvol *subv);
+extern void grabbed2free_mark(__u64 mark, reiser4_subvol *subv);
+extern int reiser4_grab_reserved(struct super_block *, __u64,
+ reiser4_ba_flags_t, reiser4_subvol *);
+extern void reiser4_release_reserved(struct super_block *super);
+
+/* grabbed -> fake_allocated */
+
+/* fake_allocated -> used */
+
+/* used -> fake_allocated -> grabbed -> free */
+
+extern void flush_reserved2grabbed(atom_brick_info *abi, ctx_brick_info *cbi,
+ __u64 count, reiser4_subvol *subv);
+extern u64 all_flush_reserved2grabbed(txn_atom *atom);
+extern int reiser4_blocknr_is_fake(const reiser4_block_nr * da);
+
+extern void grabbed2cluster_reserved(int count, reiser4_subvol *);
+extern void cluster_reserved2grabbed(int count, reiser4_subvol *);
+extern void cluster_reserved2free(int count, reiser4_subvol *);
+
+extern int subvol_check_block_counters(const reiser4_subvol *);
+extern int volume_check_block_counters(const struct super_block *super);
+
+extern int reiser4_check_blocks(const reiser4_block_nr *start,
+ const reiser4_block_nr *len, int desired,
+ reiser4_subvol *subv);
+
+static inline int reiser4_check_block(const reiser4_block_nr *start,
+ int desired, reiser4_subvol *subv)
+{
+ return reiser4_check_blocks(start, NULL, desired, subv);
+}
+
+extern int reiser4_pre_commit_hook(void);
+extern void reiser4_post_commit_hook(void);
+extern void reiser4_post_write_back_hook(void);
+
+#endif /* __FS_REISER4_BLOCK_ALLOC_H__ */
+
+/* Make Linus happy.
+ Local variables:
+ c-indentation-style: "K&R"
+ mode-name: "LC"
+ c-basic-offset: 8
+ tab-width: 8
+ fill-column: 120
+ End:
+*/
diff -urN --no-dereference linux-5.10.2.orig/fs/reiser4/blocknrlist.c linux-5.10.2/fs/reiser4/blocknrlist.c
--- linux-5.10.2.orig/fs/reiser4/blocknrlist.c 1970-01-01 01:00:00.000000000 +0100
+++ linux-5.10.2/fs/reiser4/blocknrlist.c 2020-12-23 16:07:46.113813056 +0100
@@ -0,0 +1,349 @@
+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
+ * reiser4/README */
+
+/* This is a block list implementation, used to create ordered block sets
+ (at the cost of being less memory efficient than blocknr_set).
+ It is used by discard code. */
+
+#include "debug.h"
+#include "dformat.h"
+#include "txnmgr.h"
+#include "context.h"
+#include "super.h"
+#include "plugin/volume/volume.h"
+
+#include <linux/slab.h>
+#include <linux/list_sort.h>
+
+static struct kmem_cache *blocknr_list_slab = NULL;
+
+/**
+ * Represents an extent range [@start; @start + @len) on subvolume @subv_id.
+ */
+struct blocknr_list_entry {
+ reiser4_block_nr start, len;
+ __u32 subv_id;
+ struct list_head link;
+};
+
+#define blocknr_list_entry(ptr) list_entry(ptr, blocknr_list_entry, link)
+
+static void blocknr_list_entry_init(blocknr_list_entry *entry)
+{
+ assert("intelfx-11", entry != NULL);
+
+ entry->start = 0;
+ entry->len = 0;
+ entry->subv_id = INVALID_SUBVOL_ID;
+ INIT_LIST_HEAD(&entry->link);
+}
+
+static blocknr_list_entry *blocknr_list_entry_alloc(void)
+{
+ blocknr_list_entry *entry;
+
+ entry = (blocknr_list_entry *)kmem_cache_alloc(blocknr_list_slab,
+ reiser4_ctx_gfp_mask_get());
+ if (entry == NULL) {
+ return NULL;
+ }
+
+ blocknr_list_entry_init(entry);
+
+ return entry;
+}
+
+static void blocknr_list_entry_free(blocknr_list_entry *entry)
+{
+ assert("intelfx-12", entry != NULL);
+
+ kmem_cache_free(blocknr_list_slab, entry);
+}
+
+/**
+ * Given the range described by @to and the range [@start; @start + @len),
+ * if they belong to the same subvolume and overlap or are adjacent, their
+ * union is calculated and saved in @to. Returns 0 on success, -1 otherwise.
+ */
+static int blocknr_list_entry_merge(blocknr_list_entry *to,
+ reiser4_block_nr start,
+ reiser4_block_nr len,
+ __u32 subv_id)
+{
+ reiser4_block_nr end, to_end;
+
+ assert("intelfx-13", to != NULL);
+
+ assert("intelfx-16", to->len > 0);
+ assert("intelfx-17", len > 0);
+ assert("edward-1806", subv_id != INVALID_SUBVOL_ID);
+ assert("edward-1807", to->subv_id != INVALID_SUBVOL_ID);
+
+ end = start + len;
+ to_end = to->start + to->len;
+
+ if ((to->subv_id == subv_id) &&
+ (to->start <= end) && (start <= to_end)) {
+
+ if (start < to->start)
+ to->start = start;
+ if (end > to_end)
+ to_end = end;
+ to->len = to_end - to->start;
+ return 0;
+ }
+ return -1;
+}
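+
+/* Example (illustrative): merging the range [10; 15) into an entry covering
+ [13; 20) on the same subvolume leaves the entry covering [10; 20) and
+ returns 0; for disjoint ranges such as [10; 12) and [15; 20) the entry is
+ left untouched and -1 is returned. */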
+
+static int blocknr_list_entry_merge_entry(blocknr_list_entry *to,
+ blocknr_list_entry *from)
+{
+ assert("intelfx-18", from != NULL);
+
+ return blocknr_list_entry_merge(to, from->start, from->len,
+ from->subv_id);
+}
+
+/**
+ * A comparison function for list_sort().
+ *
+ * "The comparison function @cmp must return a negative value if @a
+ * should sort before @b, and a positive value if @a should sort after
+ * @b. If @a and @b are equivalent, and their original relative
+ * ordering is to be preserved, @cmp must return 0."
+ */
+static int blocknr_list_entry_compare(void *priv UNUSED_ARG,
+ struct list_head *a, struct list_head *b)
+{
+ blocknr_list_entry *entry_a, *entry_b;
+ reiser4_block_nr entry_a_end, entry_b_end;
+
+ assert("intelfx-19", a != NULL);
+ assert("intelfx-20", b != NULL);
+
+ entry_a = blocknr_list_entry(a);
+ entry_b = blocknr_list_entry(b);
+
+ assert("edward-1808", entry_a->subv_id != INVALID_SUBVOL_ID);
+ assert("edward-1809", entry_b->subv_id != INVALID_SUBVOL_ID);
+
+ entry_a_end = entry_a->start + entry_a->len;
+ entry_b_end = entry_b->start + entry_b->len;
+
+ /* First sort by subvolume ids... */
+
+ if (entry_a->subv_id < entry_b->subv_id)
+ return -1;
+ if (entry_a->subv_id > entry_b->subv_id)
+ return 1;
+
+ /* Then sort by starting block numbers... */
+
+ if (entry_a->start < entry_b->start)
+ return -1;
+ if (entry_a->start > entry_b->start)
+ return 1;
+
+ /** Then by ending block numbers.
+ * If @a contains @b, it will be sorted before. */
+ if (entry_a_end > entry_b_end)
+ return -1;
+ if (entry_a_end < entry_b_end)
+ return 1;
+ return 0;
+}
+
+int blocknr_list_init_static(void)
+{
+ assert("intelfx-54", blocknr_list_slab == NULL);
+
+ blocknr_list_slab = kmem_cache_create("blocknr_list_entry",
+ sizeof(blocknr_list_entry),
+ 0,
+ SLAB_HWCACHE_ALIGN |
+ SLAB_RECLAIM_ACCOUNT,
+ NULL);
+ if (blocknr_list_slab == NULL) {
+ return RETERR(-ENOMEM);
+ }
+
+ return 0;
+}
+
+void blocknr_list_done_static(void)
+{
+ destroy_reiser4_cache(&blocknr_list_slab);
+}
+
+void blocknr_list_init(struct list_head* blist)
+{
+ assert("intelfx-24", blist != NULL);
+
+ INIT_LIST_HEAD(blist);
+}
+
+void blocknr_list_destroy(struct list_head* blist)
+{
+ struct list_head *pos, *tmp;
+ blocknr_list_entry *entry;
+
+ assert("intelfx-25", blist != NULL);
+
+ list_for_each_safe(pos, tmp, blist) {
+ entry = blocknr_list_entry(pos);
+ list_del_init(pos);
+ blocknr_list_entry_free(entry);
+ }
+
+ assert("intelfx-48", list_empty(blist));
+}
+
+void blocknr_list_merge(struct list_head *from, struct list_head *to)
+{
+ assert("intelfx-26", from != NULL);
+ assert("intelfx-27", to != NULL);
+
+ list_splice_tail_init(from, to);
+
+ assert("intelfx-49", list_empty(from));
+}
+
+void blocknr_list_sort_and_join(struct list_head *blist)
+{
+ struct list_head *pos, *next;
+ struct blocknr_list_entry *entry, *next_entry;
+
+ assert("intelfx-50", blist != NULL);
+
+ /* Step 1. Sort the extent list. */
+ list_sort(NULL, blist, blocknr_list_entry_compare);
+
+ /* Step 2. Join adjacent extents in the list. */
+ pos = blist->next;
+ next = pos->next;
+ entry = blocknr_list_entry(pos);
+
+ for (; next != blist; next = pos->next) {
+ /** @next is a valid node at this point */
+ next_entry = blocknr_list_entry(next);
+
+ /** try to merge @next into @pos */
+ if (!blocknr_list_entry_merge_entry(entry, next_entry)) {
+ /** successful; delete the @next node.
+ * next merge will be attempted into the same node. */
+ list_del_init(next);
+ blocknr_list_entry_free(next_entry);
+ } else {
+ /** otherwise advance @pos. */
+ pos = next;
+ entry = next_entry;
+ }
+ }
+}
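+
+/* Example (illustrative): for a single subvolume, a list holding the extents
+ [30; 2], [10; 5] and [14; 4] (start; length) is sorted into [10; 5],
+ [14; 4], [30; 2] and then joined into [10; 8], [30; 2]. */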
+
+int blocknr_list_add_extent(txn_atom *atom,
+ struct list_head *blist,
+ blocknr_list_entry **new_entry,
+ const reiser4_block_nr *start,
+ const reiser4_block_nr *len,
+ __u32 subv_id)
+{
+ assert("intelfx-29", atom != NULL);
+ assert("intelfx-42", atom_is_protected(atom));
+ assert("intelfx-43", blist != NULL);
+ assert("intelfx-30", new_entry != NULL);
+ assert("intelfx-31", start != NULL);
+ assert("intelfx-32", len != NULL && *len > 0);
+
+ if (*new_entry == NULL) {
+ /*
+ * Optimization: try to merge new extent into the last one.
+ */
+ if (!list_empty(blist)) {
+ blocknr_list_entry *last_entry;
+ last_entry = blocknr_list_entry(blist->prev);
+ if (!blocknr_list_entry_merge(last_entry,
+ *start, *len,
+ subv_id)) {
+ return 0;
+ }
+ }
+
+ /*
+ * Otherwise, allocate a new entry and return -E_REPEAT.
+ * Next time we'll take the branch below.
+ */
+ spin_unlock_atom(atom);
+ *new_entry = blocknr_list_entry_alloc();
+ return (*new_entry != NULL) ? -E_REPEAT : RETERR(-ENOMEM);
+ }
+
+ /*
+ * The entry has been allocated beforehand, fill it and link to the list.
+ */
+ (*new_entry)->start = *start;
+ (*new_entry)->len = *len;
+ (*new_entry)->subv_id = subv_id;
+ list_add_tail(&(*new_entry)->link, blist);
+
+ return 0;
+}
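+
+/*
+ * Sketch of the retry protocol this function expects from its callers
+ * (declarations and error handling are assumed): the atom lock is re-taken
+ * and the call retried whenever the function had to drop the lock in order
+ * to allocate a new list entry.
+ *
+ *	blocknr_list_entry *new_entry = NULL;
+ *	int ret;
+ *
+ *	do {
+ *		atom = get_current_atom_locked();
+ *		ret = blocknr_list_add_extent(atom, blist, &new_entry,
+ *					      &start, &len, subv_id);
+ *	} while (ret == -E_REPEAT);
+ *
+ * On success (ret == 0) the atom spinlock is still held; on -ENOMEM it has
+ * already been dropped.
+ */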
+
+int blocknr_list_iterator(txn_atom *atom,
+ struct list_head *blist,
+ blocknr_set_actor_f actor,
+ void *data,
+ int delete)
+{
+ struct list_head *pos;
+ blocknr_list_entry *entry;
+ int ret = 0;
+
+ assert("intelfx-46", blist != NULL);
+ assert("intelfx-47", actor != NULL);
+
+ if (delete) {
+ struct list_head *tmp;
+
+ list_for_each_safe(pos, tmp, blist) {
+ entry = blocknr_list_entry(pos);
+
+ /*
+ * Do not exit, delete flag is set. Instead, on the first error we
+ * downgrade from iterating to just deleting.
+ */
+ if (ret == 0) {
+ ret = actor(atom, &entry->start, &entry->len,
+ entry->subv_id, data);
+ }
+
+ list_del_init(pos);
+ blocknr_list_entry_free(entry);
+ }
+
+ assert("intelfx-44", list_empty(blist));
+ } else {
+ list_for_each(pos, blist) {
+ entry = blocknr_list_entry(pos);
+
+ ret = actor(atom, &entry->start, &entry->len,
+ entry->subv_id, data);
+
+ if (ret != 0) {
+ return ret;
+ }
+ }
+ }
+
+ return ret;
+}
+
+/* Make Linus happy.
+ Local variables:
+ c-indentation-style: "K&R"
+ mode-name: "LC"
+ c-basic-offset: 8
+ tab-width: 8
+ fill-column: 120
+ scroll-step: 1
+ End:
+*/
diff -urN --no-dereference linux-5.10.2.orig/fs/reiser4/blocknrset.c linux-5.10.2/fs/reiser4/blocknrset.c
--- linux-5.10.2.orig/fs/reiser4/blocknrset.c 1970-01-01 01:00:00.000000000 +0100
+++ linux-5.10.2/fs/reiser4/blocknrset.c 2020-12-23 16:07:46.113813056 +0100
@@ -0,0 +1,402 @@
+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
+reiser4/README */
+
+/* This file contains code for various block number sets used by the atom to
+ track the deleted set and wandered block mappings. */
+
+#include "debug.h"
+#include "dformat.h"
+#include "txnmgr.h"
+#include "context.h"
+#include "super.h"
+
+#include <linux/slab.h>
+
+/* The data structure for storing unordered block number sets is a list of
+ elements, each of which contains an array of block numbers and/or an array
+ of block number pairs. Such an element, called blocknr_set_entry, stores
+ single block numbers from the beginning and pairs (extents) from the end of
+ its entries[] array. The ->nr_singles and ->nr_pairs fields count the
+ numbers of single blocks and pairs.
+
+ +----------------- blocknr_set_entry->entries -----------------+
+ |block1|block2| ... <free space> ... |pair3|pair2|pair1|
+ +---------------------------------------------------------------+
+
+ When the current blocknr_set_entry is full, a new one is allocated. */
+
+/* Usage examples: blocknr sets are used in reiser4 for storing the atom's
+ * delete set (single blocks and block extents); in that case a blocknr pair
+ * represents an extent. The atom's wandered map is also stored as a blocknr
+ * set; blocknr pairs there represent a (real block) -> (wandered block)
+ * mapping. */
+
+/* Protection: blocknr sets belong to reiser4 atom, and
+ * their modifications are performed with the atom lock held */
+
+/* The total size of a blocknr_set_entry. */
+#define BLOCKNR_SET_ENTRY_SIZE 128
+
+/* The number of block number slots that fit in the blocknr data area. */
+#define BLOCKNR_SET_ENTRIES_NUMBER \
+ ((BLOCKNR_SET_ENTRY_SIZE - \
+ 2 * sizeof(unsigned) - \
+ sizeof(struct list_head)) / \
+ sizeof(reiser4_block_nr))
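+
+/* For illustration (assuming a typical 64-bit kernel with a 4-byte unsigned,
+ a 16-byte struct list_head and an 8-byte reiser4_block_nr) this evaluates
+ to (128 - 2*4 - 16) / 8 = 13 slots per entry. */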
+
+static struct kmem_cache *blocknr_set_slab = NULL;
+
+/* An entry of the blocknr_set */
+struct blocknr_set_entry {
+ unsigned nr_singles;
+ unsigned nr_pairs;
+ struct list_head link;
+ reiser4_block_nr entries[BLOCKNR_SET_ENTRIES_NUMBER];
+};
+
+static_assert(sizeof(blocknr_set_entry) == BLOCKNR_SET_ENTRY_SIZE);
+
+/* A pair of blocks as recorded in the blocknr_set_entry data. */
+struct blocknr_pair {
+ reiser4_block_nr a;
+ reiser4_block_nr b;
+};
+
+/* Return the number of blocknr slots available in a blocknr_set_entry. */
+/* Audited by: green(2002.06.11) */
+static unsigned bse_avail(blocknr_set_entry * bse)
+{
+ unsigned used = bse->nr_singles + 2 * bse->nr_pairs;
+
+ assert("jmacd-5088", BLOCKNR_SET_ENTRIES_NUMBER >= used);
+
+ return BLOCKNR_SET_ENTRIES_NUMBER - used;
+}
+
+/* Initialize a blocknr_set_entry. */
+static void bse_init(blocknr_set_entry *bse)
+{
+ bse->nr_singles = 0;
+ bse->nr_pairs = 0;
+ INIT_LIST_HEAD(&bse->link);
+}
+
+/* Allocate and initialize a blocknr_set_entry. */
+/* Audited by: green(2002.06.11) */
+static blocknr_set_entry *bse_alloc(void)
+{
+ blocknr_set_entry *e;
+
+ if ((e = (blocknr_set_entry *) kmem_cache_alloc(blocknr_set_slab,
+ reiser4_ctx_gfp_mask_get())) == NULL)
+ return NULL;
+
+ bse_init(e);
+
+ return e;
+}
+
+/* Free a blocknr_set_entry. */
+/* Audited by: green(2002.06.11) */
+static void bse_free(blocknr_set_entry * bse)
+{
+ kmem_cache_free(blocknr_set_slab, bse);
+}
+
+/* Add a block number to a blocknr_set_entry */
+/* Audited by: green(2002.06.11) */
+static void
+bse_put_single(blocknr_set_entry * bse, const reiser4_block_nr * block)
+{
+ assert("jmacd-5099", bse_avail(bse) >= 1);
+
+ bse->entries[bse->nr_singles++] = *block;
+}
+
+/* Get a pair of block numbers */
+/* Audited by: green(2002.06.11) */
+static inline struct blocknr_pair *bse_get_pair(blocknr_set_entry * bse,
+ unsigned pno)
+{
+ assert("green-1", BLOCKNR_SET_ENTRIES_NUMBER >= 2 * (pno + 1));
+
+ return (struct blocknr_pair *) (bse->entries +
+ BLOCKNR_SET_ENTRIES_NUMBER -
+ 2 * (pno + 1));
+}
+
+/* Add a pair of block numbers to a blocknr_set_entry */
+/* Audited by: green(2002.06.11) */
+static void
+bse_put_pair(blocknr_set_entry * bse, const reiser4_block_nr * a,
+ const reiser4_block_nr * b)
+{
+ struct blocknr_pair *pair;
+
+ assert("jmacd-5100", bse_avail(bse) >= 2 && a != NULL && b != NULL);
+
+ pair = bse_get_pair(bse, bse->nr_pairs++);
+
+ pair->a = *a;
+ pair->b = *b;
+}
+
+/* Add either a block or pair of blocks to the block number set. The first
+ blocknr (@a) must be non-NULL. If @b is NULL a single blocknr is added, if
+ @b is non-NULL a pair is added. The block number set belongs to atom, and
+ the call is made with the atom lock held. There may not be enough space in
+ the current blocknr_set_entry. If new_bsep points to a non-NULL
+ blocknr_set_entry then it will be added to the blocknr_set and new_bsep
+ will be set to NULL. If new_bsep contains NULL then the atom lock will be
+ released and a new bse will be allocated in new_bsep. E_REPEAT will be
+ returned with the atom unlocked for the operation to be tried again. If
+ the operation succeeds, 0 is returned. If new_bsep is non-NULL and not
+ used during the call, it will be freed automatically. */
+static int blocknr_set_add(txn_atom *atom, struct list_head *bset,
+ blocknr_set_entry **new_bsep, const reiser4_block_nr *a,
+ const reiser4_block_nr *b, __u32 subvol_id)
+{
+ blocknr_set_entry *bse;
+ unsigned entries_needed;
+
+ assert("jmacd-5101", a != NULL);
+
+ entries_needed = (b == NULL) ? 1 : 2;
+ if (list_empty(bset) ||
+ bse_avail(list_entry(bset->next, blocknr_set_entry, link)) < entries_needed) {
+ /* See if a bse was previously allocated. */
+ if (*new_bsep == NULL) {
+ spin_unlock_atom(atom);
+ *new_bsep = bse_alloc();
+ return (*new_bsep != NULL) ? -E_REPEAT :
+ RETERR(-ENOMEM);
+ }
+
+ /* Put it on the head of the list. */
+ list_add(&((*new_bsep)->link), bset);
+
+ *new_bsep = NULL;
+ }
+
+ /* Add the single or pair. */
+ bse = list_entry(bset->next, blocknr_set_entry, link);
+ if (b == NULL) {
+ bse_put_single(bse, a);
+ } else {
+ bse_put_pair(bse, a, b);
+ }
+
+ /* If new_bsep is non-NULL then there was an allocation race, free this
+ copy. */
+ if (*new_bsep != NULL) {
+ bse_free(*new_bsep);
+ *new_bsep = NULL;
+ }
+
+ return 0;
+}
+
+/* Add an extent to the block set. If the length is 1, it is treated as a
+ single block (e.g., reiser4_set_add_block). */
+/* Audited by: green(2002.06.11) */
+/* Auditor note: Entire call chain cannot hold any spinlocks, because
+ kmalloc might schedule. The only exception is atom spinlock, which is
+ properly freed.
+*/
+int blocknr_set_add_extent(txn_atom * atom,
+ struct list_head *bset,
+ blocknr_set_entry **new_bsep,
+ const reiser4_block_nr *start,
+ const reiser4_block_nr *len,
+ const __u32 subvol_id)
+{
+ assert("jmacd-5102", start != NULL && len != NULL && *len > 0);
+ return blocknr_set_add(atom, bset, new_bsep, start,
+ *len == 1 ? NULL : len, subvol_id);
+}
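+
+/*
+ * Sketch of the caller-side retry protocol described in the comment above
+ * blocknr_set_add() (declarations and error handling are assumed):
+ *
+ *	blocknr_set_entry *new_bsep = NULL;
+ *	int ret;
+ *
+ *	do {
+ *		atom = get_current_atom_locked();
+ *		ret = blocknr_set_add_extent(atom, bset, &new_bsep,
+ *					     &start, &len, subvol_id);
+ *	} while (ret == -E_REPEAT);
+ */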
+
+/* Add a block pair to the block set. It adds exactly a pair, which is checked
+ * by an assertion that both arguments are not null.*/
+/* Audited by: green(2002.06.11) */
+/* Auditor note: Entire call chain cannot hold any spinlocks, because
+ kmalloc might schedule. The only exception is atom spinlock, which is
+ properly freed
+*/
+int blocknr_set_add_pair(txn_atom * atom,
+ struct list_head *bset, blocknr_set_entry **new_bsep,
+ const reiser4_block_nr *a, const reiser4_block_nr *b,
+ __u32 subvol_id)
+{
+ assert("jmacd-5103", a != NULL && b != NULL);
+ return blocknr_set_add(atom, bset, new_bsep, a, b, subvol_id);
+}
+
+/* Initialize slab cache of blocknr_set_entry objects. */
+int blocknr_set_init_static(void)
+{
+ assert("intelfx-55", blocknr_set_slab == NULL);
+
+ blocknr_set_slab = kmem_cache_create("blocknr_set_entry",
+ sizeof(blocknr_set_entry),
+ 0,
+ SLAB_HWCACHE_ALIGN |
+ SLAB_RECLAIM_ACCOUNT,
+ NULL);
+
+ if (blocknr_set_slab == NULL) {
+ return RETERR(-ENOMEM);
+ }
+
+ return 0;
+}
+
+/* Destroy slab cache of blocknr_set_entry objects. */
+void blocknr_set_done_static(void)
+{
+ destroy_reiser4_cache(&blocknr_set_slab);
+}
+
+/* Initialize a blocknr_set. */
+void blocknr_set_init(struct list_head *bset)
+{
+ INIT_LIST_HEAD(bset);
+}
+
+/* Release the entries of a blocknr_set. */
+void blocknr_set_destroy(struct list_head *bset)
+{
+ blocknr_set_entry *bse;
+
+ while (!list_empty(bset)) {
+ bse = list_entry(bset->next, blocknr_set_entry, link);
+ list_del_init(&bse->link);
+ bse_free(bse);
+ }
+}
+
+/* Merge blocknr_set entries out of @from into @into. */
+/* Audited by: green(2002.06.11) */
+/* Auditor comments: This merge does not know if merged sets contain
+ blocks pairs (As for wandered sets) or extents, so it cannot really merge
+ overlapping ranges if there is some. So I believe it may lead to
+ some blocks being presented several times in one blocknr_set. To help
+ debugging such problems it might help to check for duplicate entries on
+ actual processing of this set. Testing this kind of stuff right here is
+ also complicated by the fact that these sets are not sorted and going
+ through whole set on each element addition is going to be CPU-heavy task */
+void blocknr_set_merge(struct list_head *from, struct list_head *into)
+{
+ blocknr_set_entry *bse_into = NULL;
+
+ /* If @from is empty, no work to perform. */
+ if (list_empty(from))
+ return;
+ /* If @into is not empty, try merging partial-entries. */
+ if (!list_empty(into)) {
+
+ /* Neither set is empty, pop the front to members and try to
+ combine them. */
+ blocknr_set_entry *bse_from;
+ unsigned into_avail;
+
+ bse_into = list_entry(into->next, blocknr_set_entry, link);
+ list_del_init(&bse_into->link);
+ bse_from = list_entry(from->next, blocknr_set_entry, link);
+ list_del_init(&bse_from->link);
+
+ /* Combine singles. */
+ for (into_avail = bse_avail(bse_into);
+ into_avail != 0 && bse_from->nr_singles != 0;
+ into_avail -= 1) {
+ bse_put_single(bse_into,
+ &bse_from->entries[--bse_from->
+ nr_singles]);
+ }
+
+ /* Combine pairs. */
+ for (; into_avail > 1 && bse_from->nr_pairs != 0;
+ into_avail -= 2) {
+ struct blocknr_pair *pair =
+ bse_get_pair(bse_from, --bse_from->nr_pairs);
+ bse_put_pair(bse_into, &pair->a, &pair->b);
+ }
+
+ /* If bse_from is empty, delete it now. */
+ if (bse_avail(bse_from) == BLOCKNR_SET_ENTRIES_NUMBER) {
+ bse_free(bse_from);
+ } else {
+ /* Otherwise, bse_into is full or nearly full (e.g.,
+ it could have one slot avail and bse_from has one
+ pair left). Push it back onto the list. bse_from
+ becomes bse_into, which will be the new partial. */
+ list_add(&bse_into->link, into);
+ bse_into = bse_from;
+ }
+ }
+
+ /* Splice lists together. */
+ list_splice_init(from, into->prev);
+
+ /* Add the partial entry back to the head of the list. */
+ if (bse_into != NULL)
+ list_add(&bse_into->link, into);
+}
+
+/* Iterate over all blocknr set elements. */
+int blocknr_set_iterator(txn_atom *atom, struct list_head *bset,
+ blocknr_set_actor_f actor, void *data, int delete,
+ u32 subv_id)
+{
+
+ blocknr_set_entry *entry;
+
+ assert("zam-429", atom != NULL);
+ assert("zam-430", atom_is_protected(atom));
+ assert("zam-431", bset != 0);
+ assert("zam-432", actor != NULL);
+
+ entry = list_entry(bset->next, blocknr_set_entry, link);
+ while (bset != &entry->link) {
+ blocknr_set_entry *tmp = list_entry(entry->link.next, blocknr_set_entry, link);
+ unsigned int i;
+ int ret;
+
+ for (i = 0; i < entry->nr_singles; i++) {
+ ret = actor(atom, &entry->entries[i], NULL, 0, data);
+
+ /* We can't break a loop if delete flag is set. */
+ if (ret != 0 && !delete)
+ return ret;
+ }
+
+ for (i = 0; i < entry->nr_pairs; i++) {
+ struct blocknr_pair *ab;
+
+ ab = bse_get_pair(entry, i);
+
+ ret = actor(atom, &ab->a, &ab->b, subv_id, data);
+
+ if (ret != 0 && !delete)
+ return ret;
+ }
+
+ if (delete) {
+ list_del(&entry->link);
+ bse_free(entry);
+ }
+
+ entry = tmp;
+ }
+
+ return 0;
+}
+
+/*
+ * Local variables:
+ * c-indentation-style: "K&R"
+ * mode-name: "LC"
+ * c-basic-offset: 8
+ * tab-width: 8
+ * fill-column: 79
+ * scroll-step: 1
+ * End:
+ */
diff -urN --no-dereference linux-5.10.2.orig/fs/reiser4/carry.c linux-5.10.2/fs/reiser4/carry.c
--- linux-5.10.2.orig/fs/reiser4/carry.c 1970-01-01 01:00:00.000000000 +0100
+++ linux-5.10.2/fs/reiser4/carry.c 2020-12-23 16:07:46.114813070 +0100
@@ -0,0 +1,1408 @@
+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
+ reiser4/README */
+/* Functions to "carry" tree modification(s) upward. */
+/* Tree is modified one level at a time. As we modify a level we accumulate a
+ set of changes that need to be propagated to the next level. We manage
+ node locking such that any searches that collide with carrying are
+ restarted, from the root if necessary.
+
+ Insertion of a new item may result in items being moved among nodes and
+ this requires the delimiting key to be updated at the least common parent
+ of the nodes modified to preserve search tree invariants. Also, insertion
+ may require allocation of a new node. A pointer to the new node has to be
+ inserted into some node on the parent level, etc.
+
+ Tree carrying is meant to be analogous to arithmetic carrying.
+
+ A carry operation is always associated with some node (&carry_node).
+
+ Carry process starts with some initial set of operations to be performed
+ and an initial set of already locked nodes. Operations are performed one
+ by one. Performing each single operation has following possible effects:
+
+ - content of carry node associated with operation is modified
+ - new carry nodes are locked and involved into carry process on this level
+ - new carry operations are posted to the next level
+
+ After all carry operations on this level are done, the process is repeated
+ for the accumulated sequence of carry operations for the next level. This
+ starts by trying to lock (in left to right order) all carry nodes
+ associated with carry operations on the parent level. After this, we decide
+ whether more nodes are required on the left of already locked set. If so,
+ all locks taken on the parent level are released, new carry nodes are
+ added, and locking process repeats.
+
+ It may happen that the balancing process fails owing to an unrecoverable
+ error on some of the upper levels of the tree (possible causes are io error,
+ failure to allocate a new node, etc.). In this case we should unmount the
+ filesystem, rebooting if it is the root, and possibly advise the use of fsck.
+
+ USAGE:
+
+ int some_tree_operation( znode *node, ... )
+ {
+ // Allocate a pool of carry objects: operations and nodes. Most
+ // carry processes will only take objects from here, without
+ // dynamic allocation.
+
+I feel uneasy about this pool. It adds to code complexity, I understand why it
+exists, but.... -Hans
+
+ carry_pool *pool;
+ carry_level *lowest_level;
+ carry_op *op;
+
+ pool = init_carry_pool(sizeof(*pool) + 3 * sizeof(*lowest_level));
+ if (IS_ERR(pool))
+ return PTR_ERR(pool);
+ lowest_level = (carry_level *)(pool + 1);
+ init_carry_level(lowest_level, pool);
+
+ // operation may be one of:
+ // COP_INSERT --- insert new item into node
+ // COP_CUT --- remove part of or whole node
+ // COP_PASTE --- increase size of item
+ // COP_DELETE --- delete pointer from parent node
+ // COP_UPDATE --- update delimiting key in least
+ // common ancestor of two
+
+ op = reiser4_post_carry( &lowest_level, operation, node, 0 );
+ if( IS_ERR( op ) || ( op == NULL ) ) {
+ handle error
+ } else {
+ // fill in remaining fields in @op, according to carry.h:carry_op
+ result = carry(&lowest_level, NULL);
+ }
+ done_carry_pool(&pool);
+ }
+
+ When you are implementing node plugin method that participates in carry
+ (shifting, insertion, deletion, etc.), do the following:
+
+ int foo_node_method(znode * node, ..., carry_level * todo)
+ {
+ carry_op *op;
+
+ ....
+
+ // note, that last argument to reiser4_post_carry() is non-null
+ // here, because @op is to be applied to the parent of @node, rather
+ // than to the @node itself as in the previous case.
+
+ op = node_post_carry(todo, operation, node, 1);
+ // fill in remaining fields in @op, according to carry.h:carry_op
+
+ ....
+
+ }
+
+ BATCHING:
+
+ One of the main advantages of the level-by-level balancing implemented here
+ is the ability to batch updates on a parent level and to perform them more
+ efficiently as a result.
+
+ Description To Be Done (TBD).
+
+ DIFFICULTIES AND SUBTLE POINTS:
+
+ 1. complex plumbing is required, because:
+
+ a. effective allocation through pools is needed
+
+ b. target of operation is not exactly known when operation is
+ posted. This is worked around through bitfields in &carry_node and
+ logic in lock_carry_node()
+
+ c. of interaction with locking code: node should be added into sibling
+ list when pointer to it is inserted into its parent, which is some time
+ after node was created. Between these moments, node is somewhat in
+ suspended state and is only registered in the carry lists
+
+ 2. whole balancing logic is implemented here, in particular, insertion
+ logic is coded in make_space().
+
+ 3. special cases like insertion (reiser4_add_tree_root()) or deletion
+ (reiser4_kill_tree_root()) of tree root and morphing of paste into insert
+ (insert_paste()) have to be handled.
+
+ 4. there is non-trivial interdependency between allocation of new nodes
+ and almost everything else. This is mainly due to the (1.c) above. I shall
+ write about this later.
+
+*/
+
+#include "forward.h"
+#include "debug.h"
+#include "key.h"
+#include "coord.h"
+#include "plugin/item/item.h"
+#include "plugin/item/extent.h"
+#include "plugin/node/node.h"
+#include "jnode.h"
+#include "znode.h"
+#include "tree_mod.h"
+#include "tree_walk.h"
+#include "block_alloc.h"
+#include "pool.h"
+#include "tree.h"
+#include "carry.h"
+#include "carry_ops.h"
+#include "super.h"
+#include "reiser4.h"
+
+#include <linux/types.h>
+
+/* level locking/unlocking */
+static int lock_carry_level(carry_level * level);
+static void unlock_carry_level(carry_level * level, int failure);
+static void done_carry_level(carry_level * level);
+static void unlock_carry_node(carry_level * level, carry_node * node, int fail);
+
+int lock_carry_node(carry_level * level, carry_node * node);
+int lock_carry_node_tail(carry_node * node);
+
+/* carry processing proper */
+static int carry_on_level(carry_level * doing, carry_level * todo);
+
+static carry_op *add_op(carry_level * level, pool_ordering order,
+ carry_op * reference);
+
+/* handlers for carry operations. */
+
+static void fatal_carry_error(carry_level * doing, int ecode);
+static int add_new_root(carry_level * level, carry_node * node, znode * fake);
+
+static void print_level(const char *prefix, carry_level * level);
+
+#if REISER4_DEBUG
+typedef enum {
+ CARRY_TODO,
+ CARRY_DOING
+} carry_queue_state;
+static int carry_level_invariant(carry_level * level, carry_queue_state state);
+#endif
+
+/* main entry point for tree balancing.
+
+ Tree carry performs operations from @doing and while doing so accumulates
+ information about operations to be performed on the next level ("carried"
+ to the parent level). Carried operations are performed, causing possibly
+ more operations to be carried upward etc. carry() takes care about
+ locking and pinning znodes while operating on them.
+
+ For usage, see comment at the top of fs/reiser4/carry.c
+
+*/
+int reiser4_carry(carry_level * doing /* set of carry operations to be
+ * performed */ ,
+ carry_level * done /* set of nodes, already performed
+ * at the previous level.
+ * NULL in most cases */)
+{
+ int result = 0;
+ gfp_t old_mask;
+ /* queue of new requests */
+ carry_level *todo;
+ ON_DEBUG(STORE_COUNTERS);
+
+ assert("nikita-888", doing != NULL);
+ BUG_ON(done != NULL);
+
+ todo = doing + 1;
+ init_carry_level(todo, doing->pool);
+
+ /* queue of requests performed on the previous level */
+ done = todo + 1;
+ init_carry_level(done, doing->pool);
+ /*
+ * NOTE: We are not allowed to fail in the loop below.
+ * Incomplete carry (even if carry_on_level is complete)
+ * can leave the tree in an inconsistent state (broken
+ * order of keys in a node, etc).
+ */
+ old_mask = get_current_context()->gfp_mask;
+ get_current_context()->gfp_mask |= __GFP_NOFAIL;
+
+ /* iterate until there is nothing more to do */
+ while (result == 0 && doing->ops_num > 0) {
+ carry_level *tmp;
+
+ /* at this point @done is locked. */
+ /* repeat lock/do/unlock while
+
+ (1) lock_carry_level() fails due to deadlock avoidance, or
+
+ (2) carry_on_level() decides that more nodes have to
+ be involved.
+
+ (3) some unexpected error occurred while balancing on the
+ upper levels. In this case all changes are rolled back.
+
+ */
+ while (1) {
+ result = lock_carry_level(doing);
+ if (result == 0) {
+ /* perform operations from @doing and
+ accumulate new requests in @todo */
+ result = carry_on_level(doing, todo);
+ if (result == 0)
+ break;
+ else if (result != -E_REPEAT ||
+ !doing->restartable) {
+ warning("nikita-1043",
+ "Fatal error during carry: %i",
+ result);
+ print_level("done", done);
+ print_level("doing", doing);
+ print_level("todo", todo);
+ /* do some rough stuff like aborting
+ all pending transcrashes and thus
+ pushing the tree back to a consistent
+ state. Alternatively, just panic.
+ */
+ fatal_carry_error(doing, result);
+ return result;
+ }
+ } else if (result != -E_REPEAT) {
+ fatal_carry_error(doing, result);
+ return result;
+ }
+ unlock_carry_level(doing, 1);
+ }
+ /* at this point @done can be safely unlocked */
+ done_carry_level(done);
+
+ /* cyclically shift queues */
+ tmp = done;
+ done = doing;
+ doing = todo;
+ todo = tmp;
+ init_carry_level(todo, doing->pool);
+
+ /* give other threads chance to run */
+ reiser4_preempt_point();
+ }
+ get_current_context()->gfp_mask = old_mask;
+ done_carry_level(done);
+
+ /* all counters, but x_refs should remain the same. x_refs can change
+ owing to transaction manager */
+ ON_DEBUG(CHECK_COUNTERS);
+ return result;
+}
+
+/* perform carry operations on given level.
+
+ Optimizations proposed by pooh:
+
+ (1) don't lock all nodes from queue at the same time. Lock nodes lazily as
+ required;
+
+ (2) unlock node if there are no more operations to be performed upon it and
+ node didn't add any operation to @todo. This can be implemented by
+ attaching to each node two counters: a counter of operations working on this
+ node and a counter of operations carried upward from this node.
+
+*/
+static int carry_on_level(carry_level * doing /* queue of carry operations to
+ * do on this level */ ,
+ carry_level * todo /* queue where new carry
+ * operations to be performed on
+ * the * parent level are
+ * accumulated during @doing
+ * processing. */ )
+{
+ int result;
+ int (*f) (carry_op *, carry_level *, carry_level *);
+ carry_op *op;
+ carry_op *tmp_op;
+
+ assert("nikita-1034", doing != NULL);
+ assert("nikita-1035", todo != NULL);
+
+ /* @doing->nodes are locked. */
+
+ /* This function can be split into two phases: analysis and modification
+
+ Analysis calculates precisely what items should be moved between
+ nodes. This information is gathered in some structures attached to
+ each carry_node in a @doing queue. Analysis also determines whether
+ new nodes are to be allocated etc.
+
+ After analysis is completed, actual modification is performed. Here
+ we can take advantage of "batch modification": if there are several
+ operations acting on the same node, modifications can be performed
+ more efficiently when batched together.
+
+ Above is an optimization left for the future.
+ */
+ /* Important, but delayed optimization: it's possible to batch
+ operations together and perform them more efficiently as a
+ result. For example, deletion of several neighboring items from a
+ node can be converted to a single ->cut() operation.
+
+ Before processing queue, it should be scanned and "mergeable"
+ operations merged.
+ */
+ result = 0;
+ for_all_ops(doing, op, tmp_op) {
+ carry_opcode opcode;
+
+ assert("nikita-1041", op != NULL);
+ opcode = op->op;
+ assert("nikita-1042", op->op < COP_LAST_OP);
+ f = op_dispatch_table[op->op].cop_handler;
+ result = f(op, doing, todo);
+ /* locking can fail with -E_REPEAT. Any different error is fatal
+ and will be handled by fatal_carry_error() sledgehammer.
+ */
+ if (result != 0)
+ break;
+ }
+ if (result == 0) {
+ carry_plugin_info info;
+ carry_node *scan;
+ carry_node *tmp_scan;
+
+ info.doing = doing;
+ info.todo = todo;
+
+ assert("nikita-3002",
+ carry_level_invariant(doing, CARRY_DOING));
+ for_all_nodes(doing, scan, tmp_scan) {
+ znode *node;
+
+ node = reiser4_carry_real(scan);
+ assert("nikita-2547", node != NULL);
+ if (node_is_empty(node)) {
+ result =
+ node_plugin_by_node(node)->
+ prepare_removal(node, &info);
+ if (result != 0)
+ break;
+ }
+ }
+ }
+ return result;
+}
+
+/* post carry operation
+
+ This is main function used by external carry clients: node layout plugins
+ and tree operations to create new carry operation to be performed on some
+ level.
+
+ New operation will be included in the @level queue. To actually perform it,
+ call carry( level, ... ). This function takes write lock on @node. Carry
+ manages all its locks by itself, don't worry about this.
+
+ This function adds operation and node at the end of the queue. It is up to
+ caller to guarantee proper ordering of node queue.
+
+*/
+carry_op * reiser4_post_carry(carry_level * level /* queue where new operation
+ * is to be posted at */ ,
+ carry_opcode op /* opcode of operation */ ,
+ znode * node /* node on which this operation
+ * will operate */ ,
+ int apply_to_parent_p /* whether operation will
+ * operate directly on @node
+ * or on its parent. */)
+{
+ carry_op *result;
+ carry_node *child;
+
+ assert("nikita-1046", level != NULL);
+ assert("nikita-1788", znode_is_write_locked(node));
+
+ result = add_op(level, POOLO_LAST, NULL);
+ if (IS_ERR(result))
+ return result;
+ child = reiser4_add_carry(level, POOLO_LAST, NULL);
+ if (IS_ERR(child)) {
+ reiser4_pool_free(&level->pool->op_pool, &result->header);
+ return (carry_op *) child;
+ }
+ result->node = child;
+ result->op = op;
+ child->parent = apply_to_parent_p;
+ if (ZF_ISSET(node, JNODE_ORPHAN))
+ child->left_before = 1;
+ child->node = node;
+ return result;
+}
+
+/* initialize carry queue */
+void init_carry_level(carry_level * level /* level to initialize */ ,
+ carry_pool * pool /* pool @level will allocate objects
+ * from */ )
+{
+ assert("nikita-1045", level != NULL);
+ assert("nikita-967", pool != NULL);
+
+ memset(level, 0, sizeof *level);
+ level->pool = pool;
+
+ INIT_LIST_HEAD(&level->nodes);
+ INIT_LIST_HEAD(&level->ops);
+}
+
+/* allocate carry pool and initialize pools within queue */
+carry_pool *init_carry_pool(int size)
+{
+ carry_pool *pool;
+
+ assert("", size >= sizeof(carry_pool) + 3 * sizeof(carry_level));
+ pool = kmalloc(size, reiser4_ctx_gfp_mask_get());
+ if (pool == NULL)
+ return ERR_PTR(RETERR(-ENOMEM));
+
+ reiser4_init_pool(&pool->op_pool, sizeof(carry_op), CARRIES_POOL_SIZE,
+ (char *)pool->op);
+ reiser4_init_pool(&pool->node_pool, sizeof(carry_node),
+ NODES_LOCKED_POOL_SIZE, (char *)pool->node);
+ return pool;
+}
+
+/* finish with queue pools */
+void done_carry_pool(carry_pool * pool/* pool to destroy */)
+{
+ reiser4_done_pool(&pool->op_pool);
+ reiser4_done_pool(&pool->node_pool);
+ kfree(pool);
+}
+
+/* add new carry node to the @level.
+
+ Returns pointer to the new carry node allocated from pool. It's up to
+ callers to maintain proper order in the @level. Assumption is that if carry
+ nodes on one level are already sorted and modifications are peroformed from
+ left to right, carry nodes added on the parent level will be ordered
+ automatically. To control ordering use @order and @reference parameters.
+
+*/
+carry_node *reiser4_add_carry_skip(carry_level * level /* &carry_level to add
+ * node to */ ,
+ pool_ordering order /* where to insert:
+ * at the beginning of
+ * @level,
+ * before @reference,
+ * after @reference,
+ * at the end of @level
+ */ ,
+ carry_node * reference/* reference node for
+ * insertion */)
+{
+ ON_DEBUG(carry_node * orig_ref = reference);
+
+ if (order == POOLO_BEFORE) {
+ reference = find_left_carry(reference, level);
+ if (reference == NULL)
+ reference = list_entry(level->nodes.next, carry_node,
+ header.level_linkage);
+ else
+ reference = list_entry(reference->header.level_linkage.next,
+ carry_node, header.level_linkage);
+ } else if (order == POOLO_AFTER) {
+ reference = find_right_carry(reference, level);
+ if (reference == NULL)
+ reference = list_entry(level->nodes.prev, carry_node,
+ header.level_linkage);
+ else
+ reference = list_entry(reference->header.level_linkage.prev,
+ carry_node, header.level_linkage);
+ }
+ assert("nikita-2209",
+ ergo(orig_ref != NULL,
+ reiser4_carry_real(reference) ==
+ reiser4_carry_real(orig_ref)));
+ return reiser4_add_carry(level, order, reference);
+}
+
+carry_node *reiser4_add_carry(carry_level * level, /* carry_level to add
+ node to */
+ pool_ordering order, /* where to insert:
+ * at the beginning of
+ * @level;
+ * before @reference;
+ * after @reference;
+ * at the end of @level
+ */
+ carry_node * reference /* reference node for
+ * insertion */)
+{
+ carry_node *result;
+
+ result =
+ (carry_node *) reiser4_add_obj(&level->pool->node_pool,
+ &level->nodes,
+ order, &reference->header);
+ if (!IS_ERR(result) && (result != NULL))
+ ++level->nodes_num;
+ return result;
+}
+
+/**
+ * add new carry operation to the @level.
+ *
+ * Returns pointer to the new carry operation allocated from pool. It's up to
+ * callers to maintain proper order in the @level. To control ordering use
+ * @order and @reference parameters.
+ */
+static carry_op *add_op(carry_level * level, /* &carry_level to add node to */
+ pool_ordering order, /* where to insert:
+ * at the beginning of @level;
+ * before @reference;
+ * after @reference;
+ * at the end of @level */
+ carry_op * reference /* reference node for insertion */)
+{
+ carry_op *result;
+
+ result =
+ (carry_op *) reiser4_add_obj(&level->pool->op_pool, &level->ops,
+ order, &reference->header);
+ if (!IS_ERR(result) && (result != NULL))
+ ++level->ops_num;
+ return result;
+}
+
+/**
+ * Return node on the right of which @node was created.
+ *
+ * Each node is created on the right of some existing node (or it is new root,
+ * which is special case not handled here).
+ *
+ * @node is new node created on some level, but not yet inserted into its
+ * parent, it has corresponding bit (JNODE_ORPHAN) set in zstate.
+ */
+static carry_node *find_begetting_brother(carry_node * node,/* node to start
+ search from */
+ carry_level * kin UNUSED_ARG
+ /* level to scan */)
+{
+ carry_node *scan;
+
+ assert("nikita-1614", node != NULL);
+ assert("nikita-1615", kin != NULL);
+ assert("nikita-1616", LOCK_CNT_GTZ(rw_locked_tree));
+ assert("nikita-1619", ergo(reiser4_carry_real(node) != NULL,
+ ZF_ISSET(reiser4_carry_real(node),
+ JNODE_ORPHAN)));
+ for (scan = node;;
+ scan = list_entry(scan->header.level_linkage.prev, carry_node,
+ header.level_linkage)) {
+ assert("nikita-1617", &kin->nodes != &scan->header.level_linkage);
+ if ((scan->node != node->node) &&
+ !ZF_ISSET(scan->node, JNODE_ORPHAN)) {
+ assert("nikita-1618", reiser4_carry_real(scan) != NULL);
+ break;
+ }
+ }
+ return scan;
+}
+
+static cmp_t
+carry_node_cmp(carry_level * level, carry_node * n1, carry_node * n2)
+{
+ assert("nikita-2199", n1 != NULL);
+ assert("nikita-2200", n2 != NULL);
+
+ if (n1 == n2)
+ return EQUAL_TO;
+ while (1) {
+ n1 = carry_node_next(n1);
+ if (carry_node_end(level, n1))
+ return GREATER_THAN;
+ if (n1 == n2)
+ return LESS_THAN;
+ }
+ impossible("nikita-2201", "End of level reached");
+}
+
+carry_node *find_carry_node(carry_level * level, const znode * node)
+{
+ carry_node *scan;
+ carry_node *tmp_scan;
+
+ assert("nikita-2202", level != NULL);
+ assert("nikita-2203", node != NULL);
+
+ for_all_nodes(level, scan, tmp_scan) {
+ if (reiser4_carry_real(scan) == node)
+ return scan;
+ }
+ return NULL;
+}
+
+znode *reiser4_carry_real(const carry_node * node)
+{
+ assert("nikita-3061", node != NULL);
+
+ return node->lock_handle.node;
+}
+
+carry_node *insert_carry_node(carry_level * doing, carry_level * todo,
+ const znode * node)
+{
+ carry_node *base;
+ carry_node *scan;
+ carry_node *tmp_scan;
+ carry_node *proj;
+
+ base = find_carry_node(doing, node);
+ assert("nikita-2204", base != NULL);
+
+ for_all_nodes(todo, scan, tmp_scan) {
+ proj = find_carry_node(doing, scan->node);
+ assert("nikita-2205", proj != NULL);
+ if (carry_node_cmp(doing, proj, base) != LESS_THAN)
+ break;
+ }
+ return scan;
+}
+
+static carry_node *add_carry_atplace(carry_level * doing, carry_level * todo,
+ znode * node)
+{
+ carry_node *reference;
+
+ assert("nikita-2994", doing != NULL);
+ assert("nikita-2995", todo != NULL);
+ assert("nikita-2996", node != NULL);
+
+ reference = insert_carry_node(doing, todo, node);
+ assert("nikita-2997", reference != NULL);
+
+ return reiser4_add_carry(todo, POOLO_BEFORE, reference);
+}
+
+/* like reiser4_post_carry(), but designed to be called from node plugin
+ methods. This function is different from reiser4_post_carry() in that it
+ finds proper place to insert node in the queue. */
+carry_op *node_post_carry(carry_plugin_info * info /* carry parameters
+ * passed down to node
+ * plugin */ ,
+ carry_opcode op /* opcode of operation */ ,
+ znode * node /* node on which this
+ * operation will operate */ ,
+ int apply_to_parent_p /* whether operation will
+ * operate directly on @node
+ * or on its parent. */ )
+{
+ carry_op *result;
+ carry_node *child;
+
+ assert("nikita-2207", info != NULL);
+ assert("nikita-2208", info->todo != NULL);
+
+ if (info->doing == NULL)
+ return reiser4_post_carry(info->todo, op, node,
+ apply_to_parent_p);
+
+ result = add_op(info->todo, POOLO_LAST, NULL);
+ if (IS_ERR(result))
+ return result;
+ child = add_carry_atplace(info->doing, info->todo, node);
+ if (IS_ERR(child)) {
+ reiser4_pool_free(&info->todo->pool->op_pool, &result->header);
+ return (carry_op *) child;
+ }
+ result->node = child;
+ result->op = op;
+ child->parent = apply_to_parent_p;
+ if (ZF_ISSET(node, JNODE_ORPHAN))
+ child->left_before = 1;
+ child->node = node;
+ return result;
+}
+
+/* lock all carry nodes in @level */
+static int lock_carry_level(carry_level * level/* level to lock */)
+{
+ int result;
+ carry_node *node;
+ carry_node *tmp_node;
+
+ assert("nikita-881", level != NULL);
+ assert("nikita-2229", carry_level_invariant(level, CARRY_TODO));
+
+ /* lock nodes from left to right */
+ result = 0;
+ for_all_nodes(level, node, tmp_node) {
+ result = lock_carry_node(level, node);
+ if (result != 0)
+ break;
+ }
+ return result;
+}
+
+/* Synchronize delimiting keys between @node and its left neighbor.
+
+ To reduce contention on dk key and simplify carry code, we synchronize
+ delimiting keys only when carry ultimately leaves tree level (carrying
+ changes upward) and unlocks nodes at this level.
+
+ This function first finds the left neighbor of @node and then updates the
+ left neighbor's right delimiting key to coincide with the least key in @node.
+
+*/
+
+ON_DEBUG(extern atomic_t delim_key_version;
+ )
+
+static void sync_dkeys(znode * spot/* node to update */)
+{
+ reiser4_key pivot;
+ reiser4_tree *tree;
+
+ assert("nikita-1610", spot != NULL);
+ assert("nikita-1612", LOCK_CNT_NIL(rw_locked_dk));
+
+ tree = znode_get_tree(spot);
+ read_lock_tree();
+ write_lock_dk(tree);
+
+ assert("nikita-2192", znode_is_loaded(spot));
+
+ /* sync left delimiting key of @spot with key in its leftmost item */
+ if (node_is_empty(spot))
+ pivot = *znode_get_rd_key(spot);
+ else
+ leftmost_key_in_node(spot, &pivot);
+
+ znode_set_ld_key(spot, &pivot);
+
+ /* there can be sequence of empty nodes pending removal on the left of
+ @spot. Scan them and update their left and right delimiting keys to
+ match left delimiting key of @spot. Also, update right delimiting
+ key of first non-empty left neighbor.
+ */
+ while (1) {
+ if (!ZF_ISSET(spot, JNODE_LEFT_CONNECTED))
+ break;
+
+ spot = spot->left;
+ if (spot == NULL)
+ break;
+
+ znode_set_rd_key(spot, &pivot);
+ /* don't sink into the domain of another balancing */
+ if (!znode_is_write_locked(spot))
+ break;
+ if (ZF_ISSET(spot, JNODE_HEARD_BANSHEE))
+ znode_set_ld_key(spot, &pivot);
+ else
+ break;
+ }
+
+ write_unlock_dk(tree);
+ read_unlock_tree();
+}
+
+/* unlock all carry nodes in @level */
+static void unlock_carry_level(carry_level * level /* level to unlock */ ,
+ int failure /* true if unlocking owing to
+ * failure */ )
+{
+ carry_node *node;
+ carry_node *tmp_node;
+
+ assert("nikita-889", level != NULL);
+
+ if (!failure) {
+ znode *spot;
+
+ spot = NULL;
+ /* update delimiting keys */
+ for_all_nodes(level, node, tmp_node) {
+ if (reiser4_carry_real(node) != spot) {
+ spot = reiser4_carry_real(node);
+ sync_dkeys(spot);
+ }
+ }
+ }
+
+	/* nodes can be unlocked in arbitrary order. In a preemptible
+	   environment, though, it's better to unlock in the reverse order of
+	   locking.
+ */
+ for_all_nodes_back(level, node, tmp_node) {
+ /* all allocated nodes should be already linked to their
+ parents at this moment. */
+ assert("nikita-1631",
+ ergo(!failure, !ZF_ISSET(reiser4_carry_real(node),
+ JNODE_ORPHAN)));
+ ON_DEBUG(check_dkeys(reiser4_carry_real(node)));
+ unlock_carry_node(level, node, failure);
+ }
+ level->new_root = NULL;
+}
+
+/* finish with @level
+
+ Unlock nodes and release all allocated resources */
+static void done_carry_level(carry_level * level/* level to finish */)
+{
+ carry_node *node;
+ carry_node *tmp_node;
+ carry_op *op;
+ carry_op *tmp_op;
+
+ assert("nikita-1076", level != NULL);
+
+ unlock_carry_level(level, 0);
+ for_all_nodes(level, node, tmp_node) {
+ assert("nikita-2113", list_empty_careful(&node->lock_handle.locks_link));
+ assert("nikita-2114", list_empty_careful(&node->lock_handle.owners_link));
+ reiser4_pool_free(&level->pool->node_pool, &node->header);
+ }
+ for_all_ops(level, op, tmp_op)
+ reiser4_pool_free(&level->pool->op_pool, &op->header);
+}
+
+/* helper function to complete locking of carry node
+
+   Finish locking of a carry node. There are several ways in which a new
+   carry node can be added into a carry level and locked. The normal one is
+   through lock_carry_node(), but it also happens from
+   find_{left|right}_neighbor(). This function factors out the common final
+   part of all locking scenarios. It assumes that @node->lock_handle is the
+   lock handle for the lock just taken and fills ->real_node from this lock
+   handle.
+
+*/
+int lock_carry_node_tail(carry_node * node/* node to complete locking of */)
+{
+ assert("nikita-1052", node != NULL);
+ assert("nikita-1187", reiser4_carry_real(node) != NULL);
+ assert("nikita-1188", !node->unlock);
+
+ node->unlock = 1;
+ /* Load node content into memory and install node plugin by
+ looking at the node header.
+
+ Most of the time this call is cheap because the node is
+ already in memory.
+
+ Corresponding zrelse() is in unlock_carry_node()
+ */
+ return zload(reiser4_carry_real(node));
+}
+
+/* lock carry node
+
+ "Resolve" node to real znode, lock it and mark as locked.
+ This requires recursive locking of znodes.
+
+   When an operation is posted to the parent level, the node it will be
+   applied to is not yet known. For example, when shifting data between two
+   nodes, the delimiting key has to be updated in the parent or parents of
+   the nodes involved. But their parents are not yet locked and, moreover,
+   said nodes can be reparented by concurrent balancing.
+
+   To work around this, the carry operation is applied to a special "carry
+   node" rather than to the znode itself. A carry node consists of some
+   "base" or "reference" znode and flags indicating how to get from the base
+   to the target of the carry operation (->real_node field of carry_node).
+
+*/
+int lock_carry_node(carry_level * level /* level @node is in */ ,
+ carry_node * node/* node to lock */)
+{
+ int result;
+ znode *reference_point;
+ lock_handle lh;
+ lock_handle tmp_lh;
+ reiser4_tree *tree;
+
+ assert("nikita-887", level != NULL);
+ assert("nikita-882", node != NULL);
+
+ result = 0;
+ reference_point = node->node;
+ init_lh(&lh);
+ init_lh(&tmp_lh);
+ if (node->left_before) {
+ /* handling of new nodes, allocated on the previous level:
+
+		   some carry ops were probably posted from the new node, but
+		   this node neither has its parent pointer set, nor is it
+		   connected. This will be done in ->create_hook() for the
+		   internal item.
+
+		   Nonetheless, the parent of the new node has to be locked.
+		   To do this, first go to the "left" in the carry order. This
+		   depends on the decision to always allocate a new node on
+		   the right of an existing one.
+
+		   The loop handles the case when multiple nodes, all orphans,
+		   were inserted.
+
+ Strictly speaking, taking tree lock is not necessary here,
+ because all nodes scanned by loop in
+ find_begetting_brother() are write-locked by this thread,
+ and thus, their sibling linkage cannot change.
+
+ */
+ tree = znode_get_tree(reference_point);
+ read_lock_tree();
+ reference_point = find_begetting_brother(node, level)->node;
+ read_unlock_tree();
+ assert("nikita-1186", reference_point != NULL);
+ }
+ if (node->parent && (result == 0)) {
+ result =
+ reiser4_get_parent(&tmp_lh, reference_point,
+ ZNODE_WRITE_LOCK);
+ if (result != 0) {
+ ; /* nothing */
+ } else if (znode_get_level(tmp_lh.node) == 0) {
+ assert("nikita-1347", znode_above_root(tmp_lh.node));
+ result = add_new_root(level, node, tmp_lh.node);
+ if (result == 0) {
+ reference_point = level->new_root;
+ move_lh(&lh, &node->lock_handle);
+ }
+ } else if ((level->new_root != NULL)
+ && (level->new_root !=
+ znode_parent_nolock(reference_point))) {
+			/* parent of node exists, but this level already
+			   created a different new root, so */
+ warning("nikita-1109",
+ /* it should be "radicis", but tradition is
+ tradition. do banshees read latin? */
+ "hodie natus est radici frater");
+ result = -EIO;
+ } else {
+ move_lh(&lh, &tmp_lh);
+ reference_point = lh.node;
+ }
+ }
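+	/* ->left is set: the target is the left neighbor of the parent
+	   locked above */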
+ if (node->left && (result == 0)) {
+ assert("nikita-1183", node->parent);
+ assert("nikita-883", reference_point != NULL);
+ result =
+ reiser4_get_left_neighbor(&tmp_lh, reference_point,
+ ZNODE_WRITE_LOCK,
+ GN_CAN_USE_UPPER_LEVELS);
+ if (result == 0) {
+ done_lh(&lh);
+ move_lh(&lh, &tmp_lh);
+ reference_point = lh.node;
+ }
+ }
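+	/* no indirection: the carry node refers to @node itself, lock it
+	   directly */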
+ if (!node->parent && !node->left && !node->left_before) {
+ result =
+ longterm_lock_znode(&lh, reference_point, ZNODE_WRITE_LOCK,
+ ZNODE_LOCK_HIPRI);
+ }
+ if (result == 0) {
+ move_lh(&node->lock_handle, &lh);
+ result = lock_carry_node_tail(node);
+ }
+ done_lh(&tmp_lh);
+ done_lh(&lh);
+ return result;
+}
+
+/* release a lock on &carry_node.
+
+   Release, if necessary, the lock on @node. This operation is the pair of
+   lock_carry_node() and is idempotent: you can call it more than once on the
+   same node.
+
+*/
+static void
+unlock_carry_node(carry_level * level,
+ carry_node * node /* node to be released */ ,
+		  int failure	/* non-0 if node is unlocked due
+				 * to some error */ )
+{
+ znode *real_node;
+
+ assert("nikita-884", node != NULL);
+
+ real_node = reiser4_carry_real(node);
+ /* pair to zload() in lock_carry_node_tail() */
+ zrelse(real_node);
+ if (node->unlock && (real_node != NULL)) {
+ assert("nikita-899", real_node == node->lock_handle.node);
+ longterm_unlock_znode(&node->lock_handle);
+ }
+ if (failure) {
+ if (node->deallocate && (real_node != NULL)) {
+ /* free node in bitmap
+
+ Prepare node for removal. Last zput() will finish
+ with it.
+ */
+ ZF_SET(real_node, JNODE_HEARD_BANSHEE);
+ }
+ if (node->free) {
+ assert("nikita-2177",
+ list_empty_careful(&node->lock_handle.locks_link));
+ assert("nikita-2112",
+ list_empty_careful(&node->lock_handle.owners_link));
+ reiser4_pool_free(&level->pool->node_pool,
+ &node->header);
+ }
+ }
+}
+
+/* fatal_carry_error() - all-catching error handling function
+
+   It is possible that carry faces an unrecoverable error, like the inability
+   to insert a pointer at the internal level. Our simple solution is to just
+   panic in this situation. More sophisticated things, like an attempt to
+   remount the file-system read-only, can be implemented without much
+   difficulty.
+
+ It is believed, that:
+
+   1. instead of panicking, all current transactions can be aborted, rolling
+   the system back to a consistent state.
+
+Umm, if you simply panic without doing anything more at all, then all current
+transactions are aborted and the system is rolled back to a consistent state,
+by virtue of the design of the transactional mechanism. Well, wait, let's be
+precise. If an internal node is corrupted on disk due to hardware failure,
+then there may be no consistent state that can be rolled back to, so instead
+we should say that it will roll back the transactions, which, barring other
+factors, means rolling back to a consistent state.
+
+# Nikita: there is a subtle difference between panic and aborting
+# transactions: machine doesn't reboot. Processes aren't killed. Processes
+# not using reiser4 (not that we care about such processes), or using other
+# reiser4 mounts (about those we do care), will simply continue to run. With
+# some luck, even an application using the aborted file system can survive: it
+# will get some error, like EBADF, from each file descriptor on the failed file
+# system, but applications that do care about tolerance will cope with this
+# (squid will).
+
+It would be a nice feature though to support rollback without rebooting
+followed by remount, but this can wait for later versions.
+
+ 2. once isolated transactions will be implemented it will be possible to
+ roll back offending transaction.
+
+2. adds code complexity of uncertain value (it implies that a broken tree
+should be kept in operation), so we must think about it more before deciding
+whether it should be done. -Hans
+
+*/
+static void fatal_carry_error(carry_level * doing UNUSED_ARG /* carry level
+ * where
+ * unrecoverable
+ * error
+ * occurred */ ,
+ int ecode/* error code */)
+{
+ assert("nikita-1230", doing != NULL);
+ assert("nikita-1231", ecode < 0);
+
+ reiser4_panic("nikita-1232", "Carry failed: %i", ecode);
+}
+
+/**
+ * Add new root to the tree
+ *
+ * This function itself only manages changes in carry structures and delegates
+ * all hard work (allocation of znode for new root, changes of parent and
+ * sibling pointers) to the reiser4_add_tree_root().
+ *
+ * Locking: old tree root is locked by carry at this point. Fake znode is also
+ * locked.
+ */
+static int add_new_root(carry_level * level,/* carry level in context of which
+ * operation is performed */
+ carry_node * node, /* carry node for existing root */
+ znode * fake /* "fake" znode already locked by
+ * us */)
+{
+ int result;
+
+ assert("nikita-1104", level != NULL);
+ assert("nikita-1105", node != NULL);
+
+ assert("nikita-1403", znode_is_write_locked(node->node));
+ assert("nikita-1404", znode_is_write_locked(fake));
+
+ /* trying to create new root. */
+ /* @node is root and it's already locked by us. This
+ means that nobody else can be trying to add/remove
+ tree root right now.
+ */
+ if (level->new_root == NULL)
+ level->new_root = reiser4_add_tree_root(node->node, fake);
+ if (!IS_ERR(level->new_root)) {
+ assert("nikita-1210", znode_is_root(level->new_root));
+ node->deallocate = 1;
+ result =
+ longterm_lock_znode(&node->lock_handle, level->new_root,
+ ZNODE_WRITE_LOCK, ZNODE_LOCK_LOPRI);
+ if (result == 0)
+ zput(level->new_root);
+ } else {
+ result = PTR_ERR(level->new_root);
+ level->new_root = NULL;
+ }
+ return result;
+}
+
+/* allocate new znode and add the operation that inserts the
+ pointer to it into the parent node into the todo level
+
+ Allocate new znode, add it into carry queue and post into @todo queue
+ request to add pointer to new node into its parent.
+
+   This is a carry-related routine that calls reiser4_new_node() to allocate
+   the new node.
+*/
+carry_node *add_new_znode(znode * brother /* existing left neighbor of new
+ * node */ ,
+ carry_node * ref /* carry node after which new
+ * carry node is to be inserted
+ * into queue. This affects
+ * locking. */ ,
+ carry_level * doing /* carry queue where new node is
+ * to be added */ ,
+ carry_level * todo /* carry queue where COP_INSERT
+ * operation to add pointer to
+						 * new node will be added */ )
+{
+ carry_node *fresh;
+ znode *new_znode;
+ carry_op *add_pointer;
+ carry_plugin_info info;
+
+ assert("nikita-1048", brother != NULL);
+ assert("nikita-1049", todo != NULL);
+
+	/* There are a lot of possible variations here: to what parent the
+	   new node will be attached and where. For simplicity, always
+	   do the following:
+
+ (1) new node and @brother will have the same parent.
+
+ (2) new node is added on the right of @brother
+
+ */
+
+ fresh = reiser4_add_carry_skip(doing,
+ ref ? POOLO_AFTER : POOLO_LAST, ref);
+ if (IS_ERR(fresh))
+ return fresh;
+
+ fresh->deallocate = 1;
+ fresh->free = 1;
+
+ new_znode = reiser4_new_node(brother, znode_get_level(brother));
+ if (IS_ERR(new_znode))
+ /* @fresh will be deallocated automatically by error
+ handling code in the caller. */
+ return (carry_node *) new_znode;
+
+ /* new_znode returned znode with x_count 1. Caller has to decrease
+ it. make_space() does. */
+
+ ZF_SET(new_znode, JNODE_ORPHAN);
+ fresh->node = new_znode;
+
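+	/* skip orphan nodes allocated earlier in this carry: they have no
+	   parent pointer yet, so post the COP_INSERT below against the
+	   nearest non-orphan node to the left in the carry order */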
+ while (ZF_ISSET(reiser4_carry_real(ref), JNODE_ORPHAN)) {
+ ref = carry_node_prev(ref);
+ assert("nikita-1606", !carry_node_end(doing, ref));
+ }
+
+ info.todo = todo;
+ info.doing = doing;
+ add_pointer = node_post_carry(&info, COP_INSERT,
+ reiser4_carry_real(ref), 1);
+ if (IS_ERR(add_pointer)) {
+ /* no need to deallocate @new_znode here: it will be
+ deallocated during carry error handling. */
+ return (carry_node *) add_pointer;
+ }
+
+ add_pointer->u.insert.type = COPT_CHILD;
+ add_pointer->u.insert.child = fresh;
+ add_pointer->u.insert.brother = brother;
+	/* initially the new node spans an empty key range */
+ write_lock_dk(znode_get_tree(brother));
+ znode_set_ld_key(new_znode,
+ znode_set_rd_key(new_znode,
+ znode_get_rd_key(brother)));
+ write_unlock_dk(znode_get_tree(brother));
+ return fresh;
+}
+
+/* DEBUGGING FUNCTIONS.
+
+   Probably we should also leave them enabled even when
+   debugging is turned off, to print dumps on errors.
+*/
+#if REISER4_DEBUG
+static int carry_level_invariant(carry_level * level, carry_queue_state state)
+{
+ carry_node *node;
+ carry_node *tmp_node;
+
+ if (level == NULL)
+ return 0;
+
+ if (level->track_type != 0 &&
+ level->track_type != CARRY_TRACK_NODE &&
+ level->track_type != CARRY_TRACK_CHANGE)
+ return 0;
+
+ /* check that nodes are in ascending order */
+ for_all_nodes(level, node, tmp_node) {
+ znode *left;
+ znode *right;
+
+ reiser4_key lkey;
+ reiser4_key rkey;
+
+ if (node != carry_node_front(level)) {
+ if (state == CARRY_TODO) {
+ right = node->node;
+ left = carry_node_prev(node)->node;
+ } else {
+ right = reiser4_carry_real(node);
+ left = reiser4_carry_real(carry_node_prev(node));
+ }
+ if (right == NULL || left == NULL)
+ continue;
+ if (node_is_empty(right) || node_is_empty(left))
+ continue;
+ if (!keyle(leftmost_key_in_node(left, &lkey),
+ leftmost_key_in_node(right, &rkey))) {
+ warning("", "wrong key order");
+ return 0;
+ }
+ }
+ }
+ return 1;
+}
+#endif
+
+/* get symbolic name for boolean */
+static const char *tf(int boolean/* truth value */)
+{
+ return boolean ? "t" : "f";
+}
+
+/* symbolic name for carry operation */
+static const char *carry_op_name(carry_opcode op/* carry opcode */)
+{
+ switch (op) {
+ case COP_INSERT:
+ return "COP_INSERT";
+ case COP_DELETE:
+ return "COP_DELETE";
+ case COP_CUT:
+ return "COP_CUT";
+ case COP_PASTE:
+ return "COP_PASTE";
+ case COP_UPDATE:
+ return "COP_UPDATE";
+ case COP_EXTENT:
+ return "COP_EXTENT";
+ case COP_INSERT_FLOW:
+ return "COP_INSERT_FLOW";
+ default:{
+ /* not mt safe, but who cares? */
+ static char buf[20];
+
+ sprintf(buf, "unknown op: %x", op);
+ return buf;
+ }
+ }
+}
+
+/* dump information about carry node */
+static void print_carry(const char *prefix /* prefix to print */ ,
+ carry_node * node/* node to print */)
+{
+ if (node == NULL) {
+ printk("%s: null\n", prefix);
+ return;
+ }
+ printk
+ ("%s: %p parent: %s, left: %s, unlock: %s, free: %s, dealloc: %s\n",
+ prefix, node, tf(node->parent), tf(node->left), tf(node->unlock),
+ tf(node->free), tf(node->deallocate));
+}
+
+/* dump information about carry operation */
+static void print_op(const char *prefix /* prefix to print */ ,
+ carry_op * op/* operation to print */)
+{
+ if (op == NULL) {
+ printk("%s: null\n", prefix);
+ return;
+ }
+ printk("%s: %p carry_opcode: %s\n", prefix, op, carry_op_name(op->op));
+ print_carry("\tnode", op->node);
+ switch (op->op) {
+ case COP_INSERT:
+ case COP_PASTE:
+ print_coord("\tcoord",
+ op->u.insert.d ? op->u.insert.d->coord : NULL, 0);
+ reiser4_print_key("\tkey",
+ op->u.insert.d ? op->u.insert.d->key : NULL);
+ print_carry("\tchild", op->u.insert.child);
+ break;
+ case COP_DELETE:
+ print_carry("\tchild", op->u.delete.child);
+ break;
+ case COP_CUT:
+ if (op->u.cut_or_kill.is_cut) {
+ print_coord("\tfrom",
+ op->u.cut_or_kill.u.kill->params.from, 0);
+ print_coord("\tto", op->u.cut_or_kill.u.kill->params.to,
+ 0);
+ } else {
+ print_coord("\tfrom",
+ op->u.cut_or_kill.u.cut->params.from, 0);
+ print_coord("\tto", op->u.cut_or_kill.u.cut->params.to,
+ 0);
+ }
+ break;
+ case COP_UPDATE:
+ print_carry("\tleft", op->u.update.left);
+ break;
+ default:
+ /* do nothing */
+ break;
+ }
+}
+
+/* dump information about all nodes and operations in a @level */
+static void print_level(const char *prefix /* prefix to print */ ,
+ carry_level * level/* level to print */)
+{
+ carry_node *node;
+ carry_node *tmp_node;
+ carry_op *op;
+ carry_op *tmp_op;
+
+ if (level == NULL) {
+ printk("%s: null\n", prefix);
+ return;
+ }
+ printk("%s: %p, restartable: %s\n",
+ prefix, level, tf(level->restartable));
+
+ for_all_nodes(level, node, tmp_node)
+ print_carry("\tcarry node", node);
+ for_all_ops(level, op, tmp_op)
+ print_op("\tcarry op", op);
+}
+
+/* Make Linus happy.
+ Local variables:
+ c-indentation-style: "K&R"
+ mode-name: "LC"
+ c-basic-offset: 8
+ tab-width: 8
+ fill-column: 120
+ scroll-step: 1
+ End:
+*/
diff -urN --no-dereference linux-5.10.2.orig/fs/reiser4/carry.h linux-5.10.2/fs/reiser4/carry.h
--- linux-5.10.2.orig/fs/reiser4/carry.h 1970-01-01 01:00:00.000000000 +0100
+++ linux-5.10.2/fs/reiser4/carry.h 2020-12-23 16:07:46.114813070 +0100
@@ -0,0 +1,445 @@
+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
+ reiser4/README */
+
+/* Functions and data types to "carry" tree modification(s) upward.
+ See fs/reiser4/carry.c for details. */
+
+#if !defined(__FS_REISER4_CARRY_H__)
+#define __FS_REISER4_CARRY_H__
+
+#include "forward.h"
+#include "debug.h"
+#include "pool.h"
+#include "znode.h"
+
+#include <linux/types.h>
+
+/* &carry_node - "location" of carry node.
+
+   "location" of a node that is involved or going to be involved in the
+   carry process. The node where an operation will be carried to on the
+   parent level cannot be recorded explicitly. An operation will usually be
+   carried to the parent of some node (where changes are performed at
+   the current level) or to the left neighbor of its parent. But while
+   modifications are performed at the current level, the parent may
+   change. So we have to allow some indirection (or, positively,
+   flexibility) in locating carry nodes.
+
+*/
+typedef struct carry_node {
+ /* pool linkage */
+ struct reiser4_pool_header header;
+
+ /* base node from which real_node is calculated. See
+ fs/reiser4/carry.c:lock_carry_node(). */
+ znode *node;
+
+ /* how to get ->real_node */
+ /* to get ->real_node obtain parent of ->node */
+ __u32 parent:1;
+ /* to get ->real_node obtain left neighbor of parent of
+ ->node */
+ __u32 left:1;
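+	/* to get ->real_node, scan to the "left" in the carry order for the
+	   begetting brother of this new (orphan) node; see lock_carry_node() */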
+ __u32 left_before:1;
+
+ /* locking */
+
+ /* this node was locked by carry process and should be
+ unlocked when carry leaves a level */
+ __u32 unlock:1;
+
+ /* disk block for this node was allocated by carry process and
+ should be deallocated when carry leaves a level */
+ __u32 deallocate:1;
+ /* this carry node was allocated by carry process and should be
+ freed when carry leaves a level */
+ __u32 free:1;
+
+ /* type of lock we want to take on this node */
+ lock_handle lock_handle;
+} carry_node;
+
+/* &carry_opcode - elementary operations that can be carried upward
+
+ Operations that carry() can handle. This list is supposed to be
+ expanded.
+
+ Each carry operation (cop) is handled by appropriate function defined
+ in fs/reiser4/carry.c. For example COP_INSERT is handled by
+ fs/reiser4/carry.c:carry_insert() etc. These functions in turn
+ call plugins of nodes affected by operation to modify nodes' content
+ and to gather operations to be performed on the next level.
+
+*/
+typedef enum {
+ /* insert new item into node. */
+ COP_INSERT,
+ /* delete pointer from parent node */
+ COP_DELETE,
+ /* remove part of or whole node. */
+ COP_CUT,
+ /* increase size of item. */
+ COP_PASTE,
+ /* insert extent (that is sequence of unformatted nodes). */
+ COP_EXTENT,
+ /* update delimiting key in least common ancestor of two
+ nodes. This is performed when items are moved between two
+ nodes.
+ */
+ COP_UPDATE,
+ /* insert flow */
+ COP_INSERT_FLOW,
+ COP_LAST_OP,
+} carry_opcode;
+
+#define CARRY_FLOW_NEW_NODES_LIMIT 20
+
+/* mode (or subtype) of COP_{INSERT|PASTE} operation. Specifies how target
+ item is determined. */
+typedef enum {
+ /* target item is one containing pointer to the ->child node */
+ COPT_CHILD,
+ /* target item is given explicitly by @coord */
+ COPT_ITEM_DATA,
+ /* target item is given by key */
+ COPT_KEY,
+ /* see insert_paste_common() for more comments on this. */
+ COPT_PASTE_RESTARTED,
+} cop_insert_pos_type;
+
+/* flags to cut and delete */
+typedef enum {
+ /* don't kill node even if it became completely empty as results of
+ * cut. This is needed for eottl handling. See carry_extent() for
+ * details. */
+ DELETE_RETAIN_EMPTY = (1 << 0)
+} cop_delete_flag;
+
+/*
+ * carry() implements "lock handle tracking" feature.
+ *
+ * Callers supply carry with the node where the initial operation is to be
+ * performed and a lock handle on this node. Trying to optimize node
+ * utilization, carry may actually move the insertion point to a different
+ * node. Callers expect that the lock handle will be transferred to the new
+ * node as well.
+ *
+ */
+typedef enum {
+ /* transfer lock handle along with insertion point */
+ CARRY_TRACK_CHANGE = 1,
+ /* acquire new lock handle to the node where insertion point is. This
+ * is used when carry() client doesn't initially possess lock handle
+ * on the insertion point node, for example, by extent insertion
+ * code. See carry_extent(). */
+ CARRY_TRACK_NODE = 2
+} carry_track_type;
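+
+/*
+ * Illustrative sketch only (not part of the original code): how a caller of
+ * the carry machinery might request lock handle tracking. Allocation of the
+ * carry_level and the second argument of reiser4_carry() are assumptions;
+ * only declarations found further down in this header are used.
+ *
+ *	carry_pool *pool = init_carry_pool(sizeof(*pool));
+ *	carry_level *doing = ...;		// allocated next to the pool
+ *
+ *	init_carry_level(doing, pool);
+ *	doing->track_type = CARRY_TRACK_CHANGE;
+ *	doing->tracked = lh;			// caller's lock handle to follow
+ *	... post operations with reiser4_post_carry(doing, ...) ...
+ *	reiser4_carry(doing, NULL);
+ *	done_carry_pool(pool);
+ */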
+
+/* data supplied to COP_{INSERT|PASTE} by callers */
+typedef struct carry_insert_data {
+ /* position where new item is to be inserted */
+ coord_t *coord;
+ /* new item description */
+ reiser4_item_data * data;
+ /* key of new item */
+ const reiser4_key * key;
+} carry_insert_data;
+
+/* cut and kill are similar, so carry_cut_data and carry_kill_data share the
+ below structure of parameters */
+struct cut_kill_params {
+ /* coord where cut starts (inclusive) */
+ coord_t *from;
+ /* coord where cut stops (inclusive, this item/unit will also be
+ * cut) */
+ coord_t *to;
+ /* starting key. This is necessary when item and unit pos don't
+	 * uniquely identify what portion of the tree to remove. For example, this
+ * indicates what portion of extent unit will be affected. */
+ const reiser4_key * from_key;
+ /* exclusive stop key */
+ const reiser4_key * to_key;
+ /* if this is not NULL, smallest actually removed key is stored
+ * here. */
+ reiser4_key *smallest_removed;
+ /* kill_node_content() is called for file truncate */
+ int truncate;
+};
+
+struct carry_cut_data {
+ struct cut_kill_params params;
+};
+
+struct carry_kill_data {
+ struct cut_kill_params params;
+ /* parameter to be passed to the ->kill_hook() method of item
+ * plugin */
+ /*void *iplug_params; *//* FIXME: unused currently */
+ /* if not NULL---inode whose items are being removed. This is needed
+ * for ->kill_hook() of extent item to update VM structures when
+ * removing pages. */
+ struct inode *inode;
+ /* sibling list maintenance is complicated by existence of eottl. When
+ * eottl whose left and right neighbors are formatted leaves is
+ * removed, one has to connect said leaves in the sibling list. This
+ * cannot be done when extent removal is just started as locking rules
+ * require sibling list update to happen atomically with removal of
+ * extent item. Therefore: 1. pointers to left and right neighbors
+ * have to be passed down to the ->kill_hook() of extent item, and
+ * 2. said neighbors have to be locked. */
+ lock_handle *left;
+ lock_handle *right;
+ /* flags modifying behavior of kill. Currently, it may have
+ DELETE_RETAIN_EMPTY set. */
+ unsigned flags;
+ char *buf;
+};
+
+/* &carry_tree_op - operation to "carry" upward.
+
+   Description of an operation we want to "carry" to the upper level of
+   a tree: e.g., when we insert something and there is not enough space
+   we allocate a new node and "carry" the operation of inserting a
+   pointer to the new node to the upper level; on removal of an empty node,
+   we carry up the operation of removing the appropriate entry from the parent.
+
+   There are two types of carry ops: when adding or deleting a node, the
+   node at the parent level where the appropriate modification has to be
+   performed is known in advance. When shifting items between nodes
+   (split, merge), the delimiting key should be changed in the least common
+   parent of the nodes involved, which is not known in advance.
+
+   For operations of the first type we store in &carry_op a pointer to
+   the &carry_node at the parent level. For operations of the second
+   type we store the &carry_nodes of the parents of the left and right nodes
+   modified and keep track of them upward until they coincide.
+
+*/
+typedef struct carry_op {
+ /* pool linkage */
+ struct reiser4_pool_header header;
+ carry_opcode op;
+ /* node on which operation is to be performed:
+
+ for insert, paste: node where new item is to be inserted
+
+ for delete: node where pointer is to be deleted
+
+ for cut: node to cut from
+
+ for update: node where delimiting key is to be modified
+
+ for modify: parent of modified node
+
+ */
+ carry_node *node;
+ union {
+ struct {
+ /* (sub-)type of insertion/paste. Taken from
+ cop_insert_pos_type. */
+ __u8 type;
+ /* various operation flags. Taken from
+ cop_insert_flag. */
+ __u8 flags;
+ carry_insert_data *d;
+ carry_node *child;
+ znode *brother;
+ } insert, paste, extent;
+
+ struct {
+ int is_cut;
+ union {
+ carry_kill_data *kill;
+ carry_cut_data *cut;
+ } u;
+ } cut_or_kill;
+
+ struct {
+ carry_node *left;
+ } update;
+ struct {
+ /* changed child */
+ carry_node *child;
+ /* bitmask of changes. See &cop_modify_flag */
+ __u32 flag;
+ } modify;
+ struct {
+ /* flags to deletion operation. Are taken from
+ cop_delete_flag */
+ __u32 flags;
+ /* child to delete from parent. If this is
+ NULL, delete op->node. */
+ carry_node *child;
+ } delete;
+ struct {
+ /* various operation flags. Taken from
+ cop_insert_flag. */
+ __u32 flags;
+ flow_t *flow;
+ coord_t *insert_point;
+ reiser4_item_data *data;
+			/* flow insertion is limited by the number of new
+			   blocks added in that operation which get nothing
+			   but a part of the flow. This limit is set by the
+			   macro CARRY_FLOW_NEW_NODES_LIMIT. This field stores
+			   the number of nodes already added during one
+			   carry_flow */
+ int new_nodes;
+ } insert_flow;
+ } u;
+} carry_op;
+
+/* &carry_op_pool - preallocated pool of carry operations, and nodes */
+typedef struct carry_pool {
+ carry_op op[CARRIES_POOL_SIZE];
+ struct reiser4_pool op_pool;
+ carry_node node[NODES_LOCKED_POOL_SIZE];
+ struct reiser4_pool node_pool;
+} carry_pool;
+
+/* &carry_tree_level - carry process on given level
+
+ Description of balancing process on the given level.
+
+   No need for locking here, as carry_tree_level is essentially a
+   per-thread thing (for now).
+
+*/
+struct carry_level {
+ /* this level may be restarted */
+ __u32 restartable:1;
+ /* list of carry nodes on this level, ordered by key order */
+ struct list_head nodes;
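+	/* list of carry operations to be performed on this level */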
+ struct list_head ops;
+ /* pool where new objects are allocated from */
+ carry_pool *pool;
+ int ops_num;
+ int nodes_num;
+ /* new root created on this level, if any */
+ znode *new_root;
+ /* This is set by caller (insert_by_key(), reiser4_resize_item(), etc.)
+ when they want ->tracked to automagically wander to the node where
+ insertion point moved after insert or paste.
+ */
+ carry_track_type track_type;
+ /* lock handle supplied by user that we are tracking. See
+ above. */
+ lock_handle *tracked;
+};
+
+/* information carry passes to plugin methods that may add new operations to
+ the @todo queue */
+struct carry_plugin_info {
+ carry_level *doing;
+ carry_level *todo;
+};
+
+int reiser4_carry(carry_level * doing, carry_level * done);
+
+carry_node *reiser4_add_carry(carry_level * level, pool_ordering order,
+ carry_node * reference);
+carry_node *reiser4_add_carry_skip(carry_level * level, pool_ordering order,
+ carry_node * reference);
+
+extern carry_node *insert_carry_node(carry_level * doing,
+ carry_level * todo, const znode * node);
+
+extern carry_pool *init_carry_pool(int);
+extern void done_carry_pool(carry_pool * pool);
+
+extern void init_carry_level(carry_level * level, carry_pool * pool);
+
+extern carry_op *reiser4_post_carry(carry_level * level, carry_opcode op,
+ znode * node, int apply_to_parent);
+extern carry_op *node_post_carry(carry_plugin_info * info, carry_opcode op,
+ znode * node, int apply_to_parent_p);
+
+carry_node *add_new_znode(znode * brother, carry_node * reference,
+ carry_level * doing, carry_level * todo);
+
+carry_node *find_carry_node(carry_level * level, const znode * node);
+
+extern znode *reiser4_carry_real(const carry_node * node);
+
+/* helper macros to iterate over carry queues */
+
+#define carry_node_next(node) \
+ list_entry((node)->header.level_linkage.next, carry_node, \
+ header.level_linkage)
+
+#define carry_node_prev(node) \
+ list_entry((node)->header.level_linkage.prev, carry_node, \
+ header.level_linkage)
+
+#define carry_node_front(level) \
+ list_entry((level)->nodes.next, carry_node, header.level_linkage)
+
+#define carry_node_back(level) \
+ list_entry((level)->nodes.prev, carry_node, header.level_linkage)
+
+#define carry_node_end(level, node) \
+ (&(level)->nodes == &(node)->header.level_linkage)
+
+/* macro to iterate over all operations in a @level */
+#define for_all_ops(level /* carry level (of type carry_level *) */, \
+ op /* pointer to carry operation, modified by loop (of \
+ * type carry_op *) */, \
+ tmp /* pointer to carry operation (of type carry_op *), \
+ * used to make iterator stable in the face of \
+ * deletions from the level */ ) \
+for (op = list_entry(level->ops.next, carry_op, header.level_linkage), \
+ tmp = list_entry(op->header.level_linkage.next, carry_op, header.level_linkage); \
+ &op->header.level_linkage != &level->ops; \
+ op = tmp, \
+ tmp = list_entry(op->header.level_linkage.next, carry_op, header.level_linkage))
+
+#if 0
+for (op = (carry_op *) pool_level_list_front(&level->ops), \
+ tmp = (carry_op *) pool_level_list_next(&op->header) ; \
+ !pool_level_list_end(&level->ops, &op->header) ; \
+ op = tmp, tmp = (carry_op *) pool_level_list_next(&op->header))
+#endif
+
+/* macro to iterate over all nodes in a @level */
+#define for_all_nodes(level /* carry level (of type carry_level *) */, \
+ node /* pointer to carry node, modified by loop (of \
+ * type carry_node *) */, \
+ tmp /* pointer to carry node (of type carry_node *), \
+		      * used to make iterator stable in the face of \
+ * deletions from the level */ ) \
+for (node = list_entry(level->nodes.next, carry_node, header.level_linkage), \
+ tmp = list_entry(node->header.level_linkage.next, carry_node, header.level_linkage); \
+ &node->header.level_linkage != &level->nodes; \
+ node = tmp, \
+ tmp = list_entry(node->header.level_linkage.next, carry_node, header.level_linkage))
+
+#if 0
+for (node = carry_node_front(level), \
+ tmp = carry_node_next(node) ; !carry_node_end(level, node) ; \
+ node = tmp, tmp = carry_node_next(node))
+#endif
+
+/* macro to iterate over all nodes in a @level in reverse order
+
+ This is used, because nodes are unlocked in reversed order of locking */
+#define for_all_nodes_back(level /* carry level (of type carry_level *) */, \
+ node /* pointer to carry node, modified by loop \
+ * (of type carry_node *) */, \
+ tmp /* pointer to carry node (of type carry_node \
+ * *), used to make iterator stable in the \
+ * face of deletions from the level */ ) \
+for (node = carry_node_back(level), \
+ tmp = carry_node_prev(node) ; !carry_node_end(level, node) ; \
+ node = tmp, tmp = carry_node_prev(node))
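+
+/* Example (mirroring unlock_carry_level() in carry.c): walk the level in the
+   reverse order of locking:
+
+	carry_node *node;
+	carry_node *tmp;
+
+	for_all_nodes_back(level, node, tmp)
+		unlock_carry_node(level, node, failure);
+*/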
+
+/* __FS_REISER4_CARRY_H__ */
+#endif
+
+/* Make Linus happy.
+ Local variables:
+ c-indentation-style: "K&R"
+ mode-name: "LC"
+ c-basic-offset: 8
+ tab-width: 8
+ fill-column: 120
+ scroll-step: 1
+ End:
+*/
diff -urN --no-dereference linux-5.10.2.orig/fs/reiser4/carry_ops.c linux-5.10.2/fs/reiser4/carry_ops.c
--- linux-5.10.2.orig/fs/reiser4/carry_ops.c 1970-01-01 01:00:00.000000000 +0100
+++ linux-5.10.2/fs/reiser4/carry_ops.c 2020-12-23 16:07:46.114813070 +0100
@@ -0,0 +1,2159 @@
+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
+ reiser4/README */
+
+/* implementation of carry operations */
+
+#include "forward.h"
+#include "debug.h"
+#include "key.h"
+#include "coord.h"
+#include "plugin/item/item.h"
+#include "plugin/node/node.h"
+#include "jnode.h"
+#include "znode.h"
+#include "block_alloc.h"
+#include "tree_walk.h"
+#include "pool.h"
+#include "tree_mod.h"
+#include "carry.h"
+#include "carry_ops.h"
+#include "tree.h"
+#include "super.h"
+#include "reiser4.h"
+
+#include <linux/types.h>
+#include <linux/err.h>
+
+static int carry_shift_data(sideof side, coord_t *insert_coord, znode * node,
+ carry_level * doing, carry_level * todo,
+ unsigned int including_insert_coord_p);
+
+extern int lock_carry_node(carry_level * level, carry_node * node);
+extern int lock_carry_node_tail(carry_node * node);
+
+/* find left neighbor of a carry node
+
+ Look for left neighbor of @node and add it to the @doing queue. See
+ comments in the body.
+
+*/
+static carry_node *find_left_neighbor(carry_op * op	/* operation whose node
+							 * to find the left
+							 * neighbor of */ ,
+ carry_level * doing/* level to scan */)
+{
+ int result;
+ carry_node *node;
+ carry_node *left;
+ int flags;
+ reiser4_tree *tree;
+
+ node = op->node;
+ tree = znode_get_tree(node->node);
+
+ read_lock_tree();
+ /* first, check whether left neighbor is already in a @doing queue */
+ if (reiser4_carry_real(node)->left != NULL) {
+ /* NOTE: there is locking subtlety here. Look into
+ * find_right_neighbor() for more info */
+ if (find_carry_node(doing,
+ reiser4_carry_real(node)->left) != NULL) {
+ read_unlock_tree();
+ left = node;
+ do {
+ left = list_entry(left->header.level_linkage.prev,
+ carry_node, header.level_linkage);
+ assert("nikita-3408", !carry_node_end(doing,
+ left));
+ } while (reiser4_carry_real(left) ==
+ reiser4_carry_real(node));
+ return left;
+ }
+ }
+ read_unlock_tree();
+
+ left = reiser4_add_carry_skip(doing, POOLO_BEFORE, node);
+ if (IS_ERR(left))
+ return left;
+
+ left->node = node->node;
+ left->free = 1;
+
+ flags = GN_TRY_LOCK;
+ if (!(op->u.insert.flags & COPI_LOAD_LEFT))
+ flags |= GN_NO_ALLOC;
+
+ /* then, feeling lucky, peek left neighbor in the cache. */
+ result = reiser4_get_left_neighbor(&left->lock_handle,
+ reiser4_carry_real(node),
+ ZNODE_WRITE_LOCK, flags);
+ if (result == 0) {
+ /* ok, node found and locked. */
+ result = lock_carry_node_tail(left);
+ if (result != 0)
+ left = ERR_PTR(result);
+ } else if (result == -E_NO_NEIGHBOR || result == -ENOENT) {
+ /* node is leftmost node in a tree, or neighbor wasn't in
+ cache, or there is an extent on the left. */
+ reiser4_pool_free(&doing->pool->node_pool, &left->header);
+ left = NULL;
+ } else if (doing->restartable) {
+ /* if left neighbor is locked, and level is restartable, add
+ new node to @doing and restart. */
+ assert("nikita-913", node->parent != 0);
+ assert("nikita-914", node->node != NULL);
+ left->left = 1;
+ left->free = 0;
+ left = ERR_PTR(-E_REPEAT);
+ } else {
+ /* left neighbor is locked, level cannot be restarted. Just
+ ignore left neighbor. */
+ reiser4_pool_free(&doing->pool->node_pool, &left->header);
+ left = NULL;
+ }
+ return left;
+}
+
+/* find right neighbor of a carry node
+
+ Look for right neighbor of @node and add it to the @doing queue. See
+ comments in the body.
+
+*/
+static carry_node *find_right_neighbor(carry_op * op	/* operation whose node
+							 * to find the right
+							 * neighbor of */ ,
+ carry_level * doing/* level to scan */)
+{
+ int result;
+ carry_node *node;
+ carry_node *right;
+ lock_handle lh;
+ int flags;
+ reiser4_tree *tree;
+
+ init_lh(&lh);
+
+ node = op->node;
+ tree = znode_get_tree(node->node);
+
+ read_lock_tree();
+ /* first, check whether right neighbor is already in a @doing queue */
+ if (reiser4_carry_real(node)->right != NULL) {
+ /*
+ * Tree lock is taken here anyway, because, even if _outcome_
+		 * of (find_carry_node() != NULL) doesn't depend on
+ * concurrent updates to ->right, find_carry_node() cannot
+ * work with second argument NULL. Hence, following comment is
+ * of historic importance only.
+ *
+ * Subtle:
+ *
+ * Q: why don't we need tree lock here, looking for the right
+ * neighbor?
+ *
+ * A: even if value of node->real_node->right were changed
+ * during find_carry_node() execution, outcome of execution
+ * wouldn't change, because (in short) other thread cannot add
+ * elements to the @doing, and if node->real_node->right
+ * already was in @doing, value of node->real_node->right
+ * couldn't change, because node cannot be inserted between
+ * locked neighbors.
+ */
+ if (find_carry_node(doing,
+ reiser4_carry_real(node)->right) != NULL) {
+ read_unlock_tree();
+ /*
+ * What we are doing here (this is also applicable to
+ * the find_left_neighbor()).
+ *
+ * tree_walk.c code requires that insertion of a
+ * pointer to a child, modification of parent pointer
+ * in the child, and insertion of the child into
+ * sibling list are atomic (see
+ * plugin/item/internal.c:create_hook_internal()).
+ *
+ * carry allocates new node long before pointer to it
+ * is inserted into parent and, actually, long before
+ * parent is even known. Such allocated-but-orphaned
+ * nodes are only trackable through carry level lists.
+ *
+ * Situation that is handled here is following: @node
+ * has valid ->right pointer, but there is
+ * allocated-but-orphaned node in the carry queue that
+ * is logically between @node and @node->right. Here
+ * we are searching for it. Critical point is that
+ * this is only possible if @node->right is also in
+ * the carry queue (this is checked above), because
+ * this is the only way new orphaned node could be
+ * inserted between them (before inserting new node,
+ * make_space() first tries to shift to the right, so,
+ * right neighbor will be locked and queued).
+ *
+ */
+ right = node;
+ do {
+ right = list_entry(right->header.level_linkage.next,
+ carry_node, header.level_linkage);
+ assert("nikita-3408", !carry_node_end(doing,
+ right));
+ } while (reiser4_carry_real(right) ==
+ reiser4_carry_real(node));
+ return right;
+ }
+ }
+ read_unlock_tree();
+
+ flags = GN_CAN_USE_UPPER_LEVELS;
+ if (!(op->u.insert.flags & COPI_LOAD_RIGHT))
+ flags = GN_NO_ALLOC;
+
+ /* then, try to lock right neighbor */
+ init_lh(&lh);
+ result = reiser4_get_right_neighbor(&lh,
+ reiser4_carry_real(node),
+ ZNODE_WRITE_LOCK, flags);
+ if (result == 0) {
+ /* ok, node found and locked. */
+ right = reiser4_add_carry_skip(doing, POOLO_AFTER, node);
+ if (!IS_ERR(right)) {
+ right->node = lh.node;
+ move_lh(&right->lock_handle, &lh);
+ right->free = 1;
+ result = lock_carry_node_tail(right);
+ if (result != 0)
+ right = ERR_PTR(result);
+ }
+ } else if ((result == -E_NO_NEIGHBOR) || (result == -ENOENT)) {
+ /* node is rightmost node in a tree, or neighbor wasn't in
+ cache, or there is an extent on the right. */
+ right = NULL;
+ } else
+ right = ERR_PTR(result);
+ done_lh(&lh);
+ return right;
+}
+
+/* how much free space in a @node is needed for @op
+
+ How much space in @node is required for completion of @op, where @op is
+ insert or paste operation.
+*/
+static unsigned int space_needed_for_op(znode * node /* znode data are
+ * inserted or
+ * pasted in */ ,
+ carry_op * op /* carry
+ operation */ )
+{
+ assert("nikita-919", op != NULL);
+
+ switch (op->op) {
+ default:
+ impossible("nikita-1701", "Wrong opcode");
+ case COP_INSERT:
+ return space_needed(node, NULL, op->u.insert.d->data, 1);
+ case COP_PASTE:
+ return space_needed(node, op->u.insert.d->coord,
+ op->u.insert.d->data, 0);
+ }
+}
+
+/* how much space in @node is required to insert or paste @data at
+ @coord. */
+unsigned int space_needed(const znode * node /* node data are inserted or
+ * pasted in */ ,
+ const coord_t *coord /* coord where data are
+ * inserted or pasted
+ * at */ ,
+ const reiser4_item_data * data /* data to insert or
+ * paste */ ,
+ int insertion/* non-0 is inserting, 0---paste */)
+{
+ int result;
+ item_plugin *iplug;
+
+ assert("nikita-917", node != NULL);
+ assert("nikita-918", node_plugin_by_node(node) != NULL);
+ assert("vs-230", !insertion || (coord == NULL));
+
+ result = 0;
+ iplug = data->iplug;
+ if (iplug->b.estimate != NULL) {
+ /* ask item plugin how much space is needed to insert this
+ item */
+ result += iplug->b.estimate(insertion ? NULL : coord, data);
+ } else {
+ /* reasonable default */
+ result += data->length;
+ }
+ if (insertion) {
+ node_plugin *nplug;
+
+ nplug = node->nplug;
+ /* and add node overhead */
+ if (nplug->item_overhead != NULL)
+ result += nplug->item_overhead(node, NULL);
+ }
+ return result;
+}
+
+/* find &coord in parent where pointer to new child is to be stored. */
+static int find_new_child_coord(carry_op * op /* COP_INSERT carry operation to
+ * insert pointer to new
+ * child */ )
+{
+ int result;
+ znode *node;
+ znode *child;
+
+ assert("nikita-941", op != NULL);
+ assert("nikita-942", op->op == COP_INSERT);
+
+ node = reiser4_carry_real(op->node);
+ assert("nikita-943", node != NULL);
+ assert("nikita-944", node_plugin_by_node(node) != NULL);
+
+ child = reiser4_carry_real(op->u.insert.child);
+ result =
+ find_new_child_ptr(node, child, op->u.insert.brother,
+ op->u.insert.d->coord);
+
+ build_child_ptr_data(child, op->u.insert.d->data);
+ return result;
+}
+
+/* additional amount of free space in @node required to complete @op */
+static int free_space_shortage(znode * node /* node to check */ ,
+ carry_op * op/* operation being performed */)
+{
+ assert("nikita-1061", node != NULL);
+ assert("nikita-1062", op != NULL);
+
+ switch (op->op) {
+ default:
+ impossible("nikita-1702", "Wrong opcode");
+ case COP_INSERT:
+ case COP_PASTE:
+ return space_needed_for_op(node, op) - znode_free_space(node);
+ case COP_EXTENT:
+ /* when inserting extent shift data around until insertion
+ point is utmost in the node. */
+ if (coord_wrt(op->u.insert.d->coord) == COORD_INSIDE)
+ return +1;
+ else
+ return -1;
+ }
+}
+
+/* helper function: update node pointer in operation after insertion
+ point was probably shifted into @target. */
+static znode *sync_op(carry_op * op, carry_node * target)
+{
+ znode *insertion_node;
+
+ /* reget node from coord: shift might move insertion coord to
+ the neighbor */
+ insertion_node = op->u.insert.d->coord->node;
+ /* if insertion point was actually moved into new node,
+ update carry node pointer in operation. */
+ if (insertion_node != reiser4_carry_real(op->node)) {
+ op->node = target;
+ assert("nikita-2540",
+ reiser4_carry_real(target) == insertion_node);
+ }
+ assert("nikita-2541",
+ reiser4_carry_real(op->node) == op->u.insert.d->coord->node);
+ return insertion_node;
+}
+
+/*
+ * complete make_space() call: update tracked lock handle if necessary. See
+ * comments for fs/reiser4/carry.h:carry_track_type
+ */
+static int
+make_space_tail(carry_op * op, carry_level * doing, znode * orig_node)
+{
+ int result;
+ carry_track_type tracking;
+ znode *node;
+
+ tracking = doing->track_type;
+ node = op->u.insert.d->coord->node;
+
+ if (tracking == CARRY_TRACK_NODE ||
+ (tracking == CARRY_TRACK_CHANGE && node != orig_node)) {
+ /* inserting or pasting into node different from
+ original. Update lock handle supplied by caller. */
+ assert("nikita-1417", doing->tracked != NULL);
+ done_lh(doing->tracked);
+ init_lh(doing->tracked);
+ result = longterm_lock_znode(doing->tracked, node,
+ ZNODE_WRITE_LOCK,
+ ZNODE_LOCK_HIPRI);
+ } else
+ result = 0;
+ return result;
+}
+
+/* This is the insertion policy function. It shifts data to the left and right
+ neighbors of insertion coord and allocates new nodes until there is enough
+ free space to complete @op.
+
+ See comments in the body.
+
+ Assumes that the node format favors insertions at the right end of the node
+ as node40 does.
+
+   See carry_flow() for details about flow insertion.
+*/
+static int make_space(carry_op * op /* carry operation, insert or paste */ ,
+ carry_level * doing /* current carry queue */ ,
+ carry_level * todo/* carry queue on the parent level */)
+{
+ znode *node;
+ int result;
+ int not_enough_space;
+ int blk_alloc;
+ znode *orig_node;
+ __u32 flags;
+
+ coord_t *coord;
+
+ assert("nikita-890", op != NULL);
+ assert("nikita-891", todo != NULL);
+ assert("nikita-892",
+ op->op == COP_INSERT ||
+ op->op == COP_PASTE || op->op == COP_EXTENT);
+ assert("nikita-1607",
+ reiser4_carry_real(op->node) == op->u.insert.d->coord->node);
+
+ flags = op->u.insert.flags;
+
+	/* NOTE: a new node can only be allocated after checking the left
+	 * and right neighbors. This is necessary for
+	 * find_{left,right}_neighbor() to work properly. */
+ assert("nikita-3410", ergo(flags & COPI_DONT_ALLOCATE,
+ flags & COPI_DONT_SHIFT_LEFT));
+ assert("nikita-3411", ergo(flags & COPI_DONT_ALLOCATE,
+ flags & COPI_DONT_SHIFT_RIGHT));
+
+ coord = op->u.insert.d->coord;
+ orig_node = node = coord->node;
+
+ assert("nikita-908", node != NULL);
+ assert("nikita-909", node_plugin_by_node(node) != NULL);
+
+ result = 0;
+ /* If there is not enough space in a node, try to shift something to
+ the left neighbor. This is a bit tricky, as locking to the left is
+ low priority. This is handled by restart logic in carry().
+ */
+ not_enough_space = free_space_shortage(node, op);
+ if (not_enough_space <= 0)
+ /* it is possible that carry was called when there actually
+ was enough space in the node. For example, when inserting
+ leftmost item so that delimiting keys have to be updated.
+ */
+ return make_space_tail(op, doing, orig_node);
+ if (!(flags & COPI_DONT_SHIFT_LEFT)) {
+ carry_node *left;
+ /* make note in statistics of an attempt to move
+ something into the left neighbor */
+ left = find_left_neighbor(op, doing);
+ if (unlikely(IS_ERR(left))) {
+ if (PTR_ERR(left) == -E_REPEAT)
+ return -E_REPEAT;
+ else {
+ /* some error other than restart request
+ occurred. This shouldn't happen. Issue a
+ warning and continue as if left neighbor
+ weren't existing.
+ */
+ warning("nikita-924",
+ "Error accessing left neighbor: %li",
+ PTR_ERR(left));
+ }
+ } else if (left != NULL) {
+
+ /* shift everything possible on the left of and
+ including insertion coord into the left neighbor */
+ result = carry_shift_data(LEFT_SIDE, coord,
+ reiser4_carry_real(left),
+ doing, todo,
+ flags & COPI_GO_LEFT);
+
+ /* reget node from coord: shift_left() might move
+ insertion coord to the left neighbor */
+ node = sync_op(op, left);
+
+ not_enough_space = free_space_shortage(node, op);
+			/* There is not enough free space in @node, but
+			   maybe there is enough free space in @left.
+			   Various balancing decisions are valid here.
+			   The same holds for shifting to the right.
+ */
+ }
+ }
+ /* If there still is not enough space, shift to the right */
+ if (not_enough_space > 0 && !(flags & COPI_DONT_SHIFT_RIGHT)) {
+ carry_node *right;
+
+ right = find_right_neighbor(op, doing);
+ if (IS_ERR(right)) {
+ warning("nikita-1065",
+ "Error accessing right neighbor: %li",
+ PTR_ERR(right));
+ } else if (right != NULL) {
+ /* node containing insertion point, and its right
+ neighbor node are write locked by now.
+
+ shift everything possible on the right of but
+ excluding insertion coord into the right neighbor
+ */
+ result = carry_shift_data(RIGHT_SIDE, coord,
+ reiser4_carry_real(right),
+ doing, todo,
+ 1 /* go to right neighbor
+ if there is nothing
+ to shift */);
+ /*
+ NOTE-EDWARD: If there is nothing to shift, then
+ moving insertion point to the right neighbor is
+ a must! Otherwise, tree degeneration is possible.
+ E.g. in the following scenario: suppose current node
+ and its left neighbor are full. We plug a hole with
+ one logical block at offset OFF in a file, and the
+ current position in the tree is at the end of the
+ node. Since we don't go to right, a new node will be
+ inserted with only one item in it. In the next
+ iteration we plug a hole at offset (OFF - BLOCK_SIZE).
+ Note that in that next iteration the insertion point
+ will be the same (i.e. before the new node). Thus,
+ again, we don't go to right and insert a second new
+ node with only one item in it, etc...
+ */
+ /* reget node from coord after moving the insertion
+ coord to the right neighbor */
+ node = sync_op(op, right);
+ not_enough_space = free_space_shortage(node, op);
+ }
+ }
+ /* If there is still not enough space, allocate new node(s).
+
+ We try to allocate new blocks if COPI_DONT_ALLOCATE is not set in
+ the carry operation flags (currently this is needed during flush
+ only).
+ */
+ for (blk_alloc = 0;
+ not_enough_space > 0 && result == 0 && blk_alloc < 2 &&
+ !(flags & COPI_DONT_ALLOCATE); ++blk_alloc) {
+ carry_node *fresh; /* new node we are allocating */
+ coord_t coord_shadow; /* remembered insertion point before
+ * shifting data into new node */
+ carry_node *node_shadow; /* remembered insertion node
+ * before shifting */
+ unsigned int gointo; /* whether insertion point should move
+ * into newly allocated node */
+
+ /* allocate new node on the right of @node. Znode and disk
+ fake block number for new node are allocated.
+
+ add_new_znode() posts carry operation COP_INSERT with
+ COPT_CHILD option to the parent level to add
+ pointer to newly created node to its parent.
+
+ Subtle point: if several new nodes are required to complete
+ insertion operation at this level, they will be inserted
+ into their parents in the order of creation, which means
+ that @node will be valid "cookie" at the time of insertion.
+
+ */
+ fresh = add_new_znode(node, op->node, doing, todo);
+ if (IS_ERR(fresh))
+ return PTR_ERR(fresh);
+
+ /* Try to shift into new node. */
+ result = lock_carry_node(doing, fresh);
+ zput(reiser4_carry_real(fresh));
+ if (result != 0) {
+ warning("nikita-947",
+ "Cannot lock new node: %i", result);
+ return result;
+ }
+
+ /* both nodes are write locked by now.
+
+ shift everything possible on the right of and
+ including insertion coord into the right neighbor.
+ */
+ coord_dup(&coord_shadow, op->u.insert.d->coord);
+ node_shadow = op->node;
+ /* move insertion point into newly created node if:
+
+ . insertion point is rightmost in the source node, or
+ . this is not the first node we are allocating in a row.
+ */
+ gointo =
+ (blk_alloc > 0) ||
+ coord_is_after_rightmost(op->u.insert.d->coord);
+
+ if (gointo &&
+ op->op == COP_PASTE &&
+ coord_is_existing_item(op->u.insert.d->coord) &&
+ is_solid_item((item_plugin_by_coord(op->u.insert.d->coord)))) {
+ /* paste into solid (atomic) item, which can contain
+ only one unit, so we need to shift it right, where
+			   the insertion point is supposed to be */
+
+ assert("edward-1444", op->u.insert.d->data->iplug ==
+ item_plugin_by_id(STATIC_STAT_DATA_ID));
+ assert("edward-1445",
+ op->u.insert.d->data->length >
+ node_plugin_by_node(coord->node)->free_space
+ (coord->node));
+
+ op->u.insert.d->coord->between = BEFORE_UNIT;
+ }
+
+ result = carry_shift_data(RIGHT_SIDE, coord,
+ reiser4_carry_real(fresh),
+ doing, todo, gointo);
+ /* if insertion point was actually moved into new node,
+ update carry node pointer in operation. */
+ node = sync_op(op, fresh);
+ not_enough_space = free_space_shortage(node, op);
+ if ((not_enough_space > 0) && (node != coord_shadow.node)) {
+			/* there is not enough free space in the new node. Shift
+ insertion point back to the @shadow_node so that
+ next new node would be inserted between
+ @shadow_node and @fresh.
+ */
+ coord_normalize(&coord_shadow);
+ coord_dup(coord, &coord_shadow);
+ node = coord->node;
+ op->node = node_shadow;
+ if (1 || (flags & COPI_STEP_BACK)) {
+ /* still not enough space?! Maybe there is
+				   enough space in the source node (i.e., the
+				   node data were moved from) now.
+ */
+ not_enough_space =
+ free_space_shortage(node, op);
+ }
+ }
+ }
+ if (not_enough_space > 0) {
+ if (!(flags & COPI_DONT_ALLOCATE))
+ warning("nikita-948", "Cannot insert new item");
+ result = -E_NODE_FULL;
+ }
+ assert("nikita-1622", ergo(result == 0,
+ reiser4_carry_real(op->node) == coord->node));
+ assert("nikita-2616", coord == op->u.insert.d->coord);
+ if (result == 0)
+ result = make_space_tail(op, doing, orig_node);
+ return result;
+}
+
+/* insert_paste_common() - common part of insert and paste operations
+
+ This function performs common part of COP_INSERT and COP_PASTE.
+
+   There are three ways in which insertion/paste can be requested:
+
+ . by directly supplying reiser4_item_data. In this case, op ->
+ u.insert.type is set to COPT_ITEM_DATA.
+
+   . by supplying a pointer to the child which is to be inserted into the
+   parent. In this case op -> u.insert.type == COPT_CHILD.
+
+   . by supplying the key of the new item/unit. This is currently only used
+   during extent insertion.
+
+ This is required, because when new node is allocated we don't know at what
+ position pointer to it is to be stored in the parent. Actually, we don't
+ even know what its parent will be, because parent can be re-balanced
+ concurrently and new node re-parented, and because parent can be full and
+ pointer to the new node will go into some other node.
+
+ insert_paste_common() resolves pointer to child node into position in the
+ parent by calling find_new_child_coord(), that fills
+ reiser4_item_data. After this, insertion/paste proceeds uniformly.
+
+ Another complication is with finding free space during pasting. It may
+ happen that while shifting items to the neighbors and newly allocated
+ nodes, insertion coord can no longer be in the item we wanted to paste
+   into. At this point, paste becomes (morphs into) insert. Moreover, free
+ space analysis has to be repeated, because amount of space required for
+ insertion is different from that of paste (item header overhead, etc).
+
+ This function "unifies" different insertion modes (by resolving child
+ pointer or key into insertion coord), and then calls make_space() to free
+ enough space in the node by shifting data to the left and right and by
+ allocating new nodes if necessary. Carry operation knows amount of space
+ required for its completion. After enough free space is obtained, caller of
+ this function (carry_{insert,paste,etc.}) performs actual insertion/paste
+ by calling item plugin method.
+
+*/
+static int insert_paste_common(carry_op * op /* carry operation being
+ * performed */ ,
+ carry_level * doing /* current carry level */ ,
+ carry_level * todo /* next carry level */ ,
+ carry_insert_data * cdata /* pointer to
+ * cdata */ ,
+ coord_t *coord /* insertion/paste coord */ ,
+ reiser4_item_data * data /* data to be
+ * inserted/pasted */ )
+{
+ assert("nikita-981", op != NULL);
+ assert("nikita-980", todo != NULL);
+ assert("nikita-979", (op->op == COP_INSERT) || (op->op == COP_PASTE)
+ || (op->op == COP_EXTENT));
+
+ if (op->u.insert.type == COPT_PASTE_RESTARTED) {
+ /* nothing to do. Fall through to make_space(). */
+ ;
+ } else if (op->u.insert.type == COPT_KEY) {
+ node_search_result intra_node;
+ znode *node;
+		/* The problem with doing batching at the lowest level is that
+ operations here are given by coords where modification is
+ to be performed, and one modification can invalidate coords
+ of all following operations.
+
+ So, we are implementing yet another type for operation that
+ will use (the only) "locator" stable across shifting of
+ data between nodes, etc.: key (COPT_KEY).
+
+ This clause resolves key to the coord in the node.
+
+ But node can change also. Probably some pieces have to be
+ added to the lock_carry_node(), to lock node by its key.
+
+ */
+ /* NOTE-NIKITA Lookup bias is fixed to FIND_EXACT. Complain
+ if you need something else. */
+ op->u.insert.d->coord = coord;
+ node = reiser4_carry_real(op->node);
+ intra_node = node_plugin_by_node(node)->lookup
+ (node, op->u.insert.d->key, FIND_EXACT,
+ op->u.insert.d->coord);
+ if ((intra_node != NS_FOUND) && (intra_node != NS_NOT_FOUND)) {
+ warning("nikita-1715", "Intra node lookup failure: %i",
+ intra_node);
+ return intra_node;
+ }
+ } else if (op->u.insert.type == COPT_CHILD) {
+ /* if we are asked to insert pointer to the child into
+ internal node, first convert pointer to the child into
+ coord within parent node.
+ */
+ znode *child;
+ int result;
+
+ op->u.insert.d = cdata;
+ op->u.insert.d->coord = coord;
+ op->u.insert.d->data = data;
+ op->u.insert.d->coord->node = reiser4_carry_real(op->node);
+ result = find_new_child_coord(op);
+ child = reiser4_carry_real(op->u.insert.child);
+ if (result != NS_NOT_FOUND) {
+ warning("nikita-993",
+ "Cannot find a place for child pointer: %i",
+ result);
+ return result;
+ }
+ /* This only happens when we did multiple insertions at
+		   the previous level, trying to insert a single item, and
+		   it so happened that insertion of pointers to all new
+		   nodes before this one had already caused the parent node
+		   to split (maybe several times).
+
+ I am going to come up with better solution.
+
+ You are not expected to understand this.
+ -- v6root/usr/sys/ken/slp.c
+
+ Basically, what happens here is the following: carry came
+ to the parent level and is about to insert internal item
+ pointing to the child node that it just inserted in the
+ level below. Position where internal item is to be inserted
+ was found by find_new_child_coord() above, but node of the
+ current carry operation (that is, parent node of child
+ inserted on the previous level), was determined earlier in
+		   the lock_carry_level/lock_carry_node. It could so happen
+		   that other carry operations performed on the parent level
+		   have already split the parent node, so that the insertion point
+ moved into another node. Handle this by creating new carry
+ node for insertion point if necessary.
+ */
+ if (reiser4_carry_real(op->node) !=
+ op->u.insert.d->coord->node) {
+ pool_ordering direction;
+ znode *z1;
+ znode *z2;
+ reiser4_key k1;
+ reiser4_key k2;
+
+ /*
+ * determine in what direction insertion point
+ * moved. Do this by comparing delimiting keys.
+ */
+ z1 = op->u.insert.d->coord->node;
+ z2 = reiser4_carry_real(op->node);
+ if (keyle(leftmost_key_in_node(z1, &k1),
+ leftmost_key_in_node(z2, &k2)))
+ /* insertion point moved to the left */
+ direction = POOLO_BEFORE;
+ else
+ /* insertion point moved to the right */
+ direction = POOLO_AFTER;
+
+ op->node = reiser4_add_carry_skip(doing,
+ direction, op->node);
+ if (IS_ERR(op->node))
+ return PTR_ERR(op->node);
+ op->node->node = op->u.insert.d->coord->node;
+ op->node->free = 1;
+ result = lock_carry_node(doing, op->node);
+ if (result != 0)
+ return result;
+ }
+
+ /*
+		 * set up the key of the item being inserted: we are inserting
+		 * an internal item and its key (by the very definition of a
+		 * search tree) is the leftmost key in the child node.
+ */
+ write_lock_dk(znode_get_tree(child));
+ op->u.insert.d->key = leftmost_key_in_node(child,
+ znode_get_ld_key(child));
+ write_unlock_dk(znode_get_tree(child));
+ op->u.insert.d->data->arg = op->u.insert.brother;
+ } else {
+ assert("vs-243", op->u.insert.d->coord != NULL);
+ op->u.insert.d->coord->node = reiser4_carry_real(op->node);
+ }
+
+ /* find free space. */
+ return make_space(op, doing, todo);
+}
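+
+/* As a rough illustration of the request modes described above: a caller
+   posting an insertion by key (COPT_KEY) fills the operation approximately
+   the way carry_extent() does further down in this file (the surrounding
+   queue and data names are assumed for the sketch):
+
+	op = node_post_carry(&info, COP_INSERT, node, 1);
+	if (IS_ERR(op))
+		return PTR_ERR(op);
+	op->u.insert.type = COPT_KEY;
+	op->u.insert.d = cdata;           (supplies key, coord and item data)
+
+   insert_paste_common() then resolves the key to a coord through the node
+   plugin ->lookup() and finishes with make_space(). */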
+
+/* handle carry COP_INSERT operation.
+
+ Insert new item into node. New item can be given in one of two ways:
+
+ - by passing &tree_coord and &reiser4_item_data as part of @op. This is
+ only applicable at the leaf/twig level.
+
+ - by passing a child node pointer to which is to be inserted by this
+ operation.
+
+*/
+static int carry_insert(carry_op * op /* operation to perform */ ,
+ carry_level * doing /* queue of operations @op
+ * is part of */ ,
+ carry_level * todo /* queue where new operations
+ * are accumulated */ )
+{
+ znode *node;
+ carry_insert_data cdata;
+ coord_t coord;
+ reiser4_item_data data;
+ carry_plugin_info info;
+ int result;
+
+ assert("nikita-1036", op != NULL);
+ assert("nikita-1037", todo != NULL);
+ assert("nikita-1038", op->op == COP_INSERT);
+
+ coord_init_zero(&coord);
+
+ /* perform common functionality of insert and paste. */
+ result = insert_paste_common(op, doing, todo, &cdata, &coord, &data);
+ if (result != 0)
+ return result;
+
+ node = op->u.insert.d->coord->node;
+ assert("nikita-1039", node != NULL);
+ assert("nikita-1040", node_plugin_by_node(node) != NULL);
+
+ assert("nikita-949",
+ space_needed_for_op(node, op) <= znode_free_space(node));
+
+ /* ask node layout to create new item. */
+ info.doing = doing;
+ info.todo = todo;
+ result = node_plugin_by_node(node)->create_item
+ (op->u.insert.d->coord, op->u.insert.d->key, op->u.insert.d->data,
+ &info);
+ doing->restartable = 0;
+ znode_make_dirty(node);
+
+ return result;
+}
+
+/*
+ * Flow insertion code. COP_INSERT_FLOW is special tree operation that is
+ * supplied with a "flow" (that is, a stream of data) and inserts it into tree
+ * by slicing into multiple items.
+ */
+
+#define flow_insert_point(op) ((op)->u.insert_flow.insert_point)
+#define flow_insert_flow(op) ((op)->u.insert_flow.flow)
+#define flow_insert_data(op) ((op)->u.insert_flow.data)
+
+static size_t item_data_overhead(carry_op * op)
+{
+ if (flow_insert_data(op)->iplug->b.estimate == NULL)
+ return 0;
+ return (flow_insert_data(op)->iplug->b.
+ estimate(NULL /* estimate insertion */ , flow_insert_data(op)) -
+ flow_insert_data(op)->length);
+}
+
+/* FIXME-VS: this is called several times during one make_flow_for_insertion
+ and it will always return the same result. Some optimization could be made
+ by calculating this value once at the beginning and passing it around. That
+ would reduce some flexibility in future changes
+*/
+static int can_paste(coord_t *, const reiser4_key *, const reiser4_item_data *);
+static size_t flow_insertion_overhead(carry_op * op)
+{
+ znode *node;
+ size_t insertion_overhead;
+
+ node = flow_insert_point(op)->node;
+ insertion_overhead = 0;
+ if (node->nplug->item_overhead &&
+ !can_paste(flow_insert_point(op), &flow_insert_flow(op)->key,
+ flow_insert_data(op)))
+ insertion_overhead =
+ node->nplug->item_overhead(node, NULL) +
+ item_data_overhead(op);
+ return insertion_overhead;
+}
+
+/* how many bytes of the flow fit into the node */
+static int what_can_fit_into_node(carry_op * op)
+{
+ size_t free, overhead;
+
+ overhead = flow_insertion_overhead(op);
+ free = znode_free_space(flow_insert_point(op)->node);
+ if (free <= overhead)
+ return 0;
+ free -= overhead;
+	/* FIXME: flow->length is loff_t only to avoid overflow in case of an
+	   expanding truncate */
+ if (free < op->u.insert_flow.flow->length)
+ return free;
+ return (int)op->u.insert_flow.flow->length;
+}
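+
+/* Worked example for the calculation above (numbers are illustrative only):
+   if the insert point node has 1000 bytes free and flow_insertion_overhead()
+   returns 24 (item header plus item-specific overhead), then a remaining
+   flow of 4096 bytes yields 1000 - 24 = 976, while a remaining flow of 100
+   bytes yields 100. */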
+
+/* in make_space_for_flow_insertion we need to check either whether the whole
+   flow fits into a node, or whether a minimal fraction of the flow does */
+static int enough_space_for_whole_flow(carry_op * op)
+{
+ return (unsigned)what_can_fit_into_node(op) ==
+ op->u.insert_flow.flow->length;
+}
+
+#define MIN_FLOW_FRACTION 1
+static int enough_space_for_min_flow_fraction(carry_op * op)
+{
+ //assert("vs-902", coord_is_after_rightmost(flow_insert_point(op)));
+
+ return what_can_fit_into_node(op) >= MIN_FLOW_FRACTION;
+}
+
+/* this returns 0 if the left neighbor was obtained successfully, everything
+   up to and including the insertion point was shifted into it, and the left
+   neighbor still has enough free space for a minimal fraction of the flow */
+static int
+make_space_by_shift_left(carry_op * op, carry_level * doing, carry_level * todo)
+{
+ carry_node *left;
+ znode *orig;
+
+ left = find_left_neighbor(op, doing);
+ if (unlikely(IS_ERR(left))) {
+ warning("vs-899",
+ "make_space_by_shift_left: "
+ "error accessing left neighbor: %li", PTR_ERR(left));
+ return 1;
+ }
+ if (left == NULL)
+ /* left neighbor either does not exist or is unformatted
+ node */
+ return 1;
+
+ orig = flow_insert_point(op)->node;
+	/* try to shift content of node @orig from its head up to and including
+	   the insertion point into the left neighbor */
+ carry_shift_data(LEFT_SIDE, flow_insert_point(op),
+ reiser4_carry_real(left), doing, todo,
+ 1/* including insert point */);
+ if (reiser4_carry_real(left) != flow_insert_point(op)->node) {
+ /* insertion point did not move */
+ return 1;
+ }
+
+ /* insertion point is set after last item in the node */
+ assert("vs-900", coord_is_after_rightmost(flow_insert_point(op)));
+
+ if (!enough_space_for_min_flow_fraction(op)) {
+ /* insertion point node does not have enough free space to put
+ even minimal portion of flow into it, therefore, move
+ insertion point back to orig node (before first item) */
+ coord_init_before_first_item(flow_insert_point(op), orig);
+ return 1;
+ }
+
+ /* part of flow is to be written to the end of node */
+ op->node = left;
+ return 0;
+}
+
+/* this returns 0 if the right neighbor was obtained successfully, everything
+   to the right of the insertion point was shifted to it, and the node got
+   enough free space to put a minimal fraction of the flow into it */
+static int
+make_space_by_shift_right(carry_op * op, carry_level * doing,
+ carry_level * todo)
+{
+ carry_node *right;
+
+ right = find_right_neighbor(op, doing);
+ if (unlikely(IS_ERR(right))) {
+ warning("nikita-1065", "shift_right_excluding_insert_point: "
+ "error accessing right neighbor: %li", PTR_ERR(right));
+ return 1;
+ }
+ if (right) {
+ /* shift everything possible on the right of but excluding
+ insertion coord into the right neighbor */
+ carry_shift_data(RIGHT_SIDE, flow_insert_point(op),
+ reiser4_carry_real(right), doing, todo,
+ 0/* not including insert point */);
+ } else {
+ /* right neighbor either does not exist or is unformatted
+ node */
+ ;
+ }
+ if (coord_is_after_rightmost(flow_insert_point(op))) {
+ if (enough_space_for_min_flow_fraction(op)) {
+ /* part of flow is to be written to the end of node */
+ return 0;
+ }
+ }
+
+ /* new node is to be added if insert point node did not get enough
+ space for whole flow */
+ return 1;
+}
+
+/* this returns 0 when insert coord is set at the node end and fraction of flow
+ fits into that node */
+static int
+make_space_by_new_nodes(carry_op * op, carry_level * doing, carry_level * todo)
+{
+ int result;
+ znode *node;
+ carry_node *new;
+
+ node = flow_insert_point(op)->node;
+
+ if (op->u.insert_flow.new_nodes == CARRY_FLOW_NEW_NODES_LIMIT)
+ return RETERR(-E_NODE_FULL);
+ /* add new node after insert point node */
+ new = add_new_znode(node, op->node, doing, todo);
+ if (unlikely(IS_ERR(new)))
+ return PTR_ERR(new);
+ result = lock_carry_node(doing, new);
+ zput(reiser4_carry_real(new));
+ if (unlikely(result))
+ return result;
+ op->u.insert_flow.new_nodes++;
+ if (!coord_is_after_rightmost(flow_insert_point(op))) {
+ carry_shift_data(RIGHT_SIDE, flow_insert_point(op),
+ reiser4_carry_real(new), doing, todo,
+ 0/* not including insert point */);
+ assert("vs-901",
+ coord_is_after_rightmost(flow_insert_point(op)));
+
+ if (enough_space_for_min_flow_fraction(op))
+ return 0;
+ if (op->u.insert_flow.new_nodes == CARRY_FLOW_NEW_NODES_LIMIT)
+ return RETERR(-E_NODE_FULL);
+
+ /* add one more new node */
+ new = add_new_znode(node, op->node, doing, todo);
+ if (unlikely(IS_ERR(new)))
+ return PTR_ERR(new);
+ result = lock_carry_node(doing, new);
+ zput(reiser4_carry_real(new));
+ if (unlikely(result))
+ return result;
+ op->u.insert_flow.new_nodes++;
+ }
+
+ /* move insertion point to new node */
+ coord_init_before_first_item(flow_insert_point(op),
+ reiser4_carry_real(new));
+ op->node = new;
+ return 0;
+}
+
+static int
+make_space_for_flow_insertion(carry_op * op, carry_level * doing,
+ carry_level * todo)
+{
+ __u32 flags = op->u.insert_flow.flags;
+
+ if (enough_space_for_whole_flow(op)) {
+ /* whole flow fits into insert point node */
+ return 0;
+ }
+ if ((flags & COPI_SWEEP) &&
+ enough_space_for_min_flow_fraction(op))
+ /* use the rest of space in the current node */
+ return 0;
+
+ if (!(flags & COPI_DONT_SHIFT_LEFT)
+ && (make_space_by_shift_left(op, doing, todo) == 0)) {
+ /* insert point is shifted to left neighbor of original insert
+ point node and is set after last unit in that node. It has
+ enough space to fit at least minimal fraction of flow. */
+ return 0;
+ }
+
+ if (enough_space_for_whole_flow(op)) {
+ /* whole flow fits into insert point node */
+ return 0;
+ }
+
+ if (!(flags & COPI_DONT_SHIFT_RIGHT)
+ && (make_space_by_shift_right(op, doing, todo) == 0)) {
+ /* insert point is still set to the same node, but there is
+ nothing to the right of insert point. */
+ return 0;
+ }
+
+ if (enough_space_for_whole_flow(op)) {
+ /* whole flow fits into insert point node */
+ return 0;
+ }
+
+ return make_space_by_new_nodes(op, doing, todo);
+}
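+
+/* To summarize the strategy above: try the insert point node itself first
+   (with COPI_SWEEP even a minimal fraction is accepted), then shift to the
+   left neighbor unless COPI_DONT_SHIFT_LEFT is set, then shift to the right
+   neighbor unless COPI_DONT_SHIFT_RIGHT is set, and only then allocate up to
+   CARRY_FLOW_NEW_NODES_LIMIT fresh nodes. */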
+
+/**
+ * Implements COP_INSERT_FLOW operation
+ */
+static int carry_insert_flow(carry_op *op,
+ carry_level *doing, carry_level *todo)
+{
+ int result;
+ flow_t *f;
+ coord_t *insert_point;
+ node_plugin *nplug;
+ carry_plugin_info info;
+ znode *orig_node;
+ lock_handle *orig_lh;
+
+ f = op->u.insert_flow.flow;
+ result = 0;
+
+ /* carry system needs this to work */
+ info.doing = doing;
+ info.todo = todo;
+
+ orig_node = flow_insert_point(op)->node;
+ orig_lh = doing->tracked;
+
+ while (f->length) {
+ result = make_space_for_flow_insertion(op, doing, todo);
+ if (result)
+ break;
+
+ insert_point = flow_insert_point(op);
+ nplug = node_plugin_by_node(insert_point->node);
+
+ /* compose item data for insertion/pasting */
+ flow_insert_data(op)->data = f->data;
+ flow_insert_data(op)->length = what_can_fit_into_node(op);
+
+ if (can_paste(insert_point, &f->key, flow_insert_data(op))) {
+ /* insert point is set to item of file we are writing to
+ and we have to append to it */
+ assert("vs-903", insert_point->between == AFTER_UNIT);
+ nplug->change_item_size(insert_point,
+ flow_insert_data(op)->length);
+ flow_insert_data(op)->iplug->b.paste(insert_point,
+ flow_insert_data
+ (op), &info);
+ } else {
+ /* new item must be inserted */
+ pos_in_node_t new_pos;
+ flow_insert_data(op)->length += item_data_overhead(op);
+
+ /* FIXME-VS: this is because node40_create_item changes
+ insert_point for obscure reasons */
+ switch (insert_point->between) {
+ case AFTER_ITEM:
+ new_pos = insert_point->item_pos + 1;
+ break;
+ case EMPTY_NODE:
+ new_pos = 0;
+ break;
+ case BEFORE_ITEM:
+ assert("vs-905", insert_point->item_pos == 0);
+ new_pos = 0;
+ break;
+ default:
+ impossible("vs-906",
+ "carry_insert_flow: invalid coord");
+ new_pos = 0;
+ break;
+ }
+
+ nplug->create_item(insert_point, &f->key,
+ flow_insert_data(op), &info);
+ coord_set_item_pos(insert_point, new_pos);
+ }
+ coord_init_after_item_end(insert_point);
+ doing->restartable = 0;
+ znode_make_dirty(insert_point->node);
+
+ move_flow_forward(f, (unsigned)flow_insert_data(op)->length);
+ }
+
+ if (orig_node != flow_insert_point(op)->node) {
+ /* move lock to new insert point */
+ done_lh(orig_lh);
+ init_lh(orig_lh);
+ result =
+ longterm_lock_znode(orig_lh, flow_insert_point(op)->node,
+ ZNODE_WRITE_LOCK, ZNODE_LOCK_HIPRI);
+ }
+
+ return result;
+}
+
+/**
+ * Implements COP_DELETE operation
+ *
+ * Remove the pointer to @op -> u.delete.child from its parent.
+ * This operation is called to delete the internal item pointing to the
+ * child node that was removed by carry from the tree on the previous
+ * tree level.
+ * This function also handles killing of the tree root if the last pointer from it
+ * was removed. This is complicated by our handling of "twig" level: root on
+ * twig level is never killed.
+ */
+static int carry_delete(carry_op *op, carry_level *doing, carry_level *todo)
+{
+ int result;
+ coord_t coord;
+ coord_t coord2;
+ znode *parent;
+ znode *child;
+ carry_plugin_info info;
+ reiser4_tree *tree;
+
+ assert("nikita-893", op != NULL);
+ assert("nikita-894", todo != NULL);
+ assert("nikita-895", op->op == COP_DELETE);
+
+ coord_init_zero(&coord);
+ coord_init_zero(&coord2);
+
+ parent = reiser4_carry_real(op->node);
+ child = op->u.delete.child ?
+ reiser4_carry_real(op->u.delete.child) : op->node->node;
+ tree = znode_get_tree(child);
+ read_lock_tree();
+ /*
+ * @parent was determined when carry entered parent level
+ * (lock_carry_level/lock_carry_node). Since then, actual parent of
+ * @child node could change due to other carry operations performed on
+ * the parent level. Check for this.
+ */
+ if (znode_parent(child) != parent) {
+ /* NOTE-NIKITA add stat counter for this. */
+ parent = znode_parent(child);
+ assert("nikita-2581", find_carry_node(doing, parent));
+ }
+ read_unlock_tree();
+
+ assert("nikita-1213", znode_get_level(parent) > LEAF_LEVEL);
+ /*
+ * Twig level horrors: tree should be of height at least 2. So, last
+ * pointer from the root at twig level is preserved even if child is
+	 * empty. This is ugly, but that is how it was designed.
+ */
+ if (znode_is_root(parent) &&
+ znode_get_level(parent) <= REISER4_MIN_TREE_HEIGHT &&
+ node_num_items(parent) == 1) {
+ /* Delimiting key manipulations. */
+ write_lock_dk(tree);
+ znode_set_ld_key(child, znode_set_ld_key(parent, reiser4_min_key()));
+ znode_set_rd_key(child, znode_set_rd_key(parent, reiser4_max_key()));
+ ZF_SET(child, JNODE_DKSET);
+ write_unlock_dk(tree);
+
+ /* @child escaped imminent death! */
+ ZF_CLR(child, JNODE_HEARD_BANSHEE);
+ return 0;
+ }
+ /*
+ * construct a coord of the pointer to the child
+ */
+ result = find_child_ptr(parent, child, &coord);
+ if (result != NS_FOUND) {
+ warning("nikita-994", "Cannot find child pointer: %i", result);
+ print_coord_content("coord", &coord);
+ return result;
+ }
+ coord_dup(&coord2, &coord);
+ info.doing = doing;
+ info.todo = todo;
+ {
+ /*
+ * Actually kill internal item: prepare structure with
+ * arguments for ->cut_and_kill() method...
+ */
+ struct carry_kill_data kdata;
+ kdata.params.from = &coord;
+ kdata.params.to = &coord2;
+ kdata.params.from_key = NULL;
+ kdata.params.to_key = NULL;
+ kdata.params.smallest_removed = NULL;
+ kdata.params.truncate = 1;
+ kdata.flags = op->u.delete.flags;
+ kdata.inode = NULL;
+ kdata.left = NULL;
+ kdata.right = NULL;
+ kdata.buf = NULL;
+ /* ... and call it. */
+ result = node_plugin_by_node(parent)->cut_and_kill(&kdata,
+ &info);
+ }
+ doing->restartable = 0;
+ /*
+ * kill the root if needed.
+ * we don't kill roots at and lower than twig level
+ */
+ if (znode_is_root(parent) &&
+ znode_get_level(parent) > REISER4_MIN_TREE_HEIGHT &&
+ node_num_items(parent) == 1)
+ result = reiser4_kill_tree_root(coord.node);
+
+ return result < 0 ? result : 0;
+}
+
+/* implements COP_CUT operation
+
+ Cuts part or whole content of node.
+
+*/
+static int carry_cut(carry_op * op /* operation to be performed */ ,
+ carry_level * doing /* current carry level */ ,
+ carry_level * todo/* next carry level */)
+{
+ int result;
+ carry_plugin_info info;
+ node_plugin *nplug;
+
+ assert("nikita-896", op != NULL);
+ assert("nikita-897", todo != NULL);
+ assert("nikita-898", op->op == COP_CUT);
+
+ info.doing = doing;
+ info.todo = todo;
+
+ nplug = node_plugin_by_node(reiser4_carry_real(op->node));
+ if (op->u.cut_or_kill.is_cut)
+ result = nplug->cut(op->u.cut_or_kill.u.cut, &info);
+ else
+ result = nplug->cut_and_kill(op->u.cut_or_kill.u.kill, &info);
+
+ doing->restartable = 0;
+ return result < 0 ? result : 0;
+}
+
+/**
+ * Helper function for carry_paste(): returns true if @op can be
+ * continued as paste
+ */
+static int can_paste(coord_t *icoord, const reiser4_key *key,
+ const reiser4_item_data *data)
+{
+ coord_t circa;
+ item_plugin *new_iplug;
+ item_plugin *old_iplug;
+ int result = 0; /* to keep gcc shut */
+
+ assert("", icoord->between != AT_UNIT);
+
+ /* obviously, one cannot paste when node is empty---there is nothing
+ to paste into. */
+ if (node_is_empty(icoord->node))
+ return 0;
+ /* if insertion point is at the middle of the item, then paste */
+ if (!coord_is_between_items(icoord))
+ return 1;
+ coord_dup(&circa, icoord);
+ circa.between = AT_UNIT;
+
+ old_iplug = item_plugin_by_coord(&circa);
+ new_iplug = data->iplug;
+
+ /* check whether we can paste to the item @icoord is "at" when we
+ ignore ->between field */
+ if (old_iplug == new_iplug && item_can_contain_key(&circa, key, data))
+ result = 1;
+ else if (icoord->between == BEFORE_UNIT
+ || icoord->between == BEFORE_ITEM) {
+ /* otherwise, try to glue to the item at the left, if any */
+ coord_dup(&circa, icoord);
+ if (coord_set_to_left(&circa)) {
+ result = 0;
+ coord_init_before_item(icoord);
+ } else {
+ old_iplug = item_plugin_by_coord(&circa);
+ result = (old_iplug == new_iplug)
+ && item_can_contain_key(icoord, key, data);
+ if (result) {
+ coord_dup(icoord, &circa);
+ icoord->between = AFTER_UNIT;
+ }
+ }
+ } else if (icoord->between == AFTER_UNIT
+ || icoord->between == AFTER_ITEM) {
+ coord_dup(&circa, icoord);
+ /* otherwise, try to glue to the item at the right, if any */
+ if (coord_set_to_right(&circa)) {
+ result = 0;
+ coord_init_after_item(icoord);
+ } else {
+ int (*cck) (const coord_t *, const reiser4_key *,
+ const reiser4_item_data *);
+
+ old_iplug = item_plugin_by_coord(&circa);
+
+ cck = old_iplug->b.can_contain_key;
+ if (cck == NULL)
+ /* item doesn't define ->can_contain_key
+ method? So it is not expandable. */
+ result = 0;
+ else {
+ result = (old_iplug == new_iplug)
+ && cck(&circa /*icoord */ , key, data);
+ if (result) {
+ coord_dup(icoord, &circa);
+ icoord->between = BEFORE_UNIT;
+ }
+ }
+ }
+ } else
+ impossible("nikita-2513", "Nothing works");
+ if (result) {
+ if (icoord->between == BEFORE_ITEM) {
+ assert("vs-912", icoord->unit_pos == 0);
+ icoord->between = BEFORE_UNIT;
+ } else if (icoord->between == AFTER_ITEM) {
+ coord_init_after_item_end(icoord);
+ }
+ }
+ return result;
+}
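+
+/* In short: pasting is possible either into the item the coord is inside of,
+   or, when the coord sits between items, into the adjacent item on the left
+   (for BEFORE_*) or on the right (for AFTER_*), provided that the adjacent
+   item is handled by the same item plugin and can contain @key. On success
+   @icoord is repositioned so that the subsequent paste lands at the right
+   unit. */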
+
+/* implements COP_PASTE operation
+
+ Paste data into existing item. This is complicated by the fact that after
+ we shifted something to the left or right neighbors trying to free some
+ space, item we were supposed to paste into can be in different node than
+ insertion coord. If so, we are no longer doing paste, but insert. See
+ comments in insert_paste_common().
+
+*/
+static int carry_paste(carry_op * op /* operation to be performed */ ,
+ carry_level * doing UNUSED_ARG /* current carry
+ * level */ ,
+ carry_level * todo/* next carry level */)
+{
+ znode *node;
+ carry_insert_data cdata;
+ coord_t dcoord;
+ reiser4_item_data data;
+ int result;
+ int real_size;
+ item_plugin *iplug;
+ carry_plugin_info info;
+ coord_t *coord;
+
+ assert("nikita-982", op != NULL);
+ assert("nikita-983", todo != NULL);
+ assert("nikita-984", op->op == COP_PASTE);
+
+ coord_init_zero(&dcoord);
+
+ result = insert_paste_common(op, doing, todo, &cdata, &dcoord, &data);
+ if (result != 0)
+ return result;
+
+ coord = op->u.insert.d->coord;
+
+	/* handle the case when op -> u.insert.coord doesn't point to an item
+	   of the required type. Restart as insert. */
+ if (!can_paste(coord, op->u.insert.d->key, op->u.insert.d->data)) {
+ op->op = COP_INSERT;
+ op->u.insert.type = COPT_PASTE_RESTARTED;
+ result = op_dispatch_table[COP_INSERT].cop_handler(op,
+ doing, todo);
+
+ return result;
+ }
+
+ node = coord->node;
+ iplug = item_plugin_by_coord(coord);
+ assert("nikita-992", iplug != NULL);
+
+ assert("nikita-985", node != NULL);
+ assert("nikita-986", node_plugin_by_node(node) != NULL);
+
+ assert("nikita-987",
+ space_needed_for_op(node, op) <= znode_free_space(node));
+
+ assert("nikita-1286", coord_is_existing_item(coord));
+
+ /*
+ * if item is expanded as a result of this operation, we should first
+	 * change the item size, then call the ->b.paste item method. If the
+	 * item is shrunk, it should be done the other way around: first call ->b.paste
+ * method, then reduce item size.
+ */
+
+ real_size = space_needed_for_op(node, op);
+ if (real_size > 0)
+ node->nplug->change_item_size(coord, real_size);
+
+ doing->restartable = 0;
+ info.doing = doing;
+ info.todo = todo;
+
+ result = iplug->b.paste(coord, op->u.insert.d->data, &info);
+
+ if (real_size < 0)
+ node->nplug->change_item_size(coord, real_size);
+
+ /* if we pasted at the beginning of the item, update item's key. */
+ if (coord->unit_pos == 0 && coord->between != AFTER_UNIT)
+ node->nplug->update_item_key(coord, op->u.insert.d->key, &info);
+
+ znode_make_dirty(node);
+ return result;
+}
+
+/* handle carry COP_EXTENT operation. */
+static int carry_extent(carry_op * op /* operation to perform */ ,
+ carry_level * doing /* queue of operations @op
+ * is part of */ ,
+ carry_level * todo /* queue where new operations
+ * are accumulated */ )
+{
+ znode *node;
+ carry_insert_data cdata;
+ coord_t coord;
+ reiser4_item_data data;
+ carry_op *delete_dummy;
+ carry_op *insert_extent;
+ int result;
+ carry_plugin_info info;
+
+ assert("nikita-1751", op != NULL);
+ assert("nikita-1752", todo != NULL);
+ assert("nikita-1753", op->op == COP_EXTENT);
+
+ /* extent insertion overview:
+
+ extents live on the TWIG LEVEL, which is level one above the leaf
+ one. This complicates extent insertion logic somewhat: it may
+	   happen (and is going to happen all the time) that in logical key
+ ordering extent has to be placed between items I1 and I2, located
+ at the leaf level, but I1 and I2 are in the same formatted leaf
+ node N1. To insert extent one has to
+
+ (1) reach node N1 and shift data between N1, its neighbors and
+ possibly newly allocated nodes until I1 and I2 fall into different
+ nodes. Since I1 and I2 are still neighboring items in logical key
+	   order, they will necessarily be the utmost items in their respective
+ nodes.
+
+ (2) After this new extent item is inserted into node on the twig
+ level.
+
+ Fortunately this process can reuse almost all code from standard
+ insertion procedure (viz. make_space() and insert_paste_common()),
+ due to the following observation: make_space() only shifts data up
+ to and excluding or including insertion point. It never
+ "over-moves" through insertion point. Thus, one can use
+	   make_space() to perform step (1). All that is required for this is to
+ instruct free_space_shortage() to keep make_space() shifting data
+ until insertion point is at the node border.
+
+ */
+
+ /* perform common functionality of insert and paste. */
+ result = insert_paste_common(op, doing, todo, &cdata, &coord, &data);
+ if (result != 0)
+ return result;
+
+ node = op->u.extent.d->coord->node;
+ assert("nikita-1754", node != NULL);
+ assert("nikita-1755", node_plugin_by_node(node) != NULL);
+ assert("nikita-1700", coord_wrt(op->u.extent.d->coord) != COORD_INSIDE);
+
+ /* NOTE-NIKITA add some checks here. Not assertions, -EIO. Check that
+ extent fits between items. */
+
+ info.doing = doing;
+ info.todo = todo;
+
+ /* there is another complication due to placement of extents on the
+ twig level: extents are "rigid" in the sense that key-range
+	   occupied by an extent cannot grow indefinitely to the right as it
+	   can for the formatted leaf nodes. Because of this, when search finds
+	   two adjacent extents on the twig level, it has to "drill" down to
+	   the leaf level, creating a new node. Here we are removing this node.
+ */
+ if (node_is_empty(node)) {
+ delete_dummy = node_post_carry(&info, COP_DELETE, node, 1);
+ if (IS_ERR(delete_dummy))
+ return PTR_ERR(delete_dummy);
+ delete_dummy->u.delete.child = NULL;
+ delete_dummy->u.delete.flags = DELETE_RETAIN_EMPTY;
+ ZF_SET(node, JNODE_HEARD_BANSHEE);
+ }
+
+ /* proceed with inserting extent item into parent. We are definitely
+ inserting rather than pasting if we get that far. */
+ insert_extent = node_post_carry(&info, COP_INSERT, node, 1);
+ if (IS_ERR(insert_extent))
+ /* @delete_dummy will be automatically destroyed on the level
+ exiting */
+ return PTR_ERR(insert_extent);
+ /* NOTE-NIKITA insertion by key is simplest option here. Another
+ possibility is to insert on the left or right of already existing
+ item.
+ */
+ insert_extent->u.insert.type = COPT_KEY;
+ insert_extent->u.insert.d = op->u.extent.d;
+ assert("nikita-1719", op->u.extent.d->key != NULL);
+ insert_extent->u.insert.d->data->arg = op->u.extent.d->coord;
+ insert_extent->u.insert.flags =
+ znode_get_tree(node)->carry.new_extent_flags;
+
+ /*
+ * if carry was asked to track lock handle we should actually track
+ * lock handle on the twig node rather than on the leaf where
+ * operation was started from. Transfer tracked lock handle.
+ */
+ if (doing->track_type) {
+ assert("nikita-3242", doing->tracked != NULL);
+ assert("nikita-3244", todo->tracked == NULL);
+ todo->tracked = doing->tracked;
+ todo->track_type = CARRY_TRACK_NODE;
+ doing->tracked = NULL;
+ doing->track_type = 0;
+ }
+
+ return 0;
+}
+
+/* update key in @parent between pointers to @left and @right.
+
+ Find coords of @left and @right and update delimiting key between them.
+ This is helper function called by carry_update(). Finds position of
+ internal item involved. Updates item key. Updates delimiting keys of child
+ nodes involved.
+*/
+static int update_delimiting_key(znode * parent /* node key is updated
+ * in */ ,
+ znode * left /* child of @parent */ ,
+ znode * right /* child of @parent */ ,
+ carry_level * doing /* current carry
+ * level */ ,
+ carry_level * todo /* parent carry
+ * level */ ,
+ const char **error_msg /* place to
+ * store error
+ * message */ )
+{
+ coord_t left_pos;
+ coord_t right_pos;
+ int result;
+ reiser4_key ldkey;
+ carry_plugin_info info;
+
+ assert("nikita-1177", right != NULL);
+	/* find position of the right child in the parent */
+ result = find_child_ptr(parent, right, &right_pos);
+ if (result != NS_FOUND) {
+ *error_msg = "Cannot find position of right child";
+ return result;
+ }
+
+ if ((left != NULL) && !coord_is_leftmost_unit(&right_pos)) {
+ /* find position of the left child in a parent */
+ result = find_child_ptr(parent, left, &left_pos);
+ if (result != NS_FOUND) {
+ *error_msg = "Cannot find position of left child";
+ return result;
+ }
+ assert("nikita-1355", left_pos.node != NULL);
+ } else
+ left_pos.node = NULL;
+
+ /* check that they are separated by exactly one key and are basically
+ sane */
+ if (REISER4_DEBUG) {
+ if ((left_pos.node != NULL)
+ && !coord_is_existing_unit(&left_pos)) {
+ *error_msg = "Left child is bastard";
+ return RETERR(-EIO);
+ }
+ if (!coord_is_existing_unit(&right_pos)) {
+ *error_msg = "Right child is bastard";
+ return RETERR(-EIO);
+ }
+ if (left_pos.node != NULL &&
+ !coord_are_neighbors(&left_pos, &right_pos)) {
+ *error_msg = "Children are not direct siblings";
+ return RETERR(-EIO);
+ }
+ }
+ *error_msg = NULL;
+
+ info.doing = doing;
+ info.todo = todo;
+
+ /*
+ * If child node is not empty, new key of internal item is a key of
+ * leftmost item in the child node. If the child is empty, take its
+ * right delimiting key as a new key of the internal item. Precise key
+ * in the latter case is not important per se, because the child (and
+ * the internal item) are going to be killed shortly anyway, but we
+ * have to preserve correct order of keys in the parent node.
+ */
+
+ if (!ZF_ISSET(right, JNODE_HEARD_BANSHEE))
+ leftmost_key_in_node(right, &ldkey);
+ else {
+ read_lock_dk(znode_get_tree(parent));
+ ldkey = *znode_get_rd_key(right);
+ read_unlock_dk(znode_get_tree(parent));
+ }
+ node_plugin_by_node(parent)->update_item_key(&right_pos, &ldkey, &info);
+ doing->restartable = 0;
+ znode_make_dirty(parent);
+ return 0;
+}
+
+/* implements COP_UPDATE operation
+
+ Update delimiting keys.
+
+*/
+static int carry_update(carry_op * op /* operation to be performed */ ,
+ carry_level * doing /* current carry level */ ,
+ carry_level * todo/* next carry level */)
+{
+ int result;
+ carry_node *missing UNUSED_ARG;
+ znode *left;
+ znode *right;
+ carry_node *lchild;
+ carry_node *rchild;
+ const char *error_msg;
+ reiser4_tree *tree;
+
+ /*
+ * This operation is called to update key of internal item. This is
+	 * necessary when carry shifted or cut data on the child
+ * level. Arguments of this operation are:
+ *
+ * @right --- child node. Operation should update key of internal
+ * item pointing to @right.
+ *
+ * @left --- left neighbor of @right. This parameter is optional.
+ */
+
+ assert("nikita-902", op != NULL);
+ assert("nikita-903", todo != NULL);
+ assert("nikita-904", op->op == COP_UPDATE);
+
+ lchild = op->u.update.left;
+ rchild = op->node;
+
+ if (lchild != NULL) {
+ assert("nikita-1001", lchild->parent);
+ assert("nikita-1003", !lchild->left);
+ left = reiser4_carry_real(lchild);
+ } else
+ left = NULL;
+
+ tree = znode_get_tree(rchild->node);
+ read_lock_tree();
+ right = znode_parent(rchild->node);
+ read_unlock_tree();
+
+ if (right != NULL) {
+ result = update_delimiting_key(right,
+ lchild ? lchild->node : NULL,
+ rchild->node,
+ doing, todo, &error_msg);
+ } else {
+ error_msg = "Cannot find node to update key in";
+ result = RETERR(-EIO);
+ }
+ /* operation will be reposted to the next level by the
+ ->update_item_key() method of node plugin, if necessary. */
+
+ if (result != 0) {
+ warning("nikita-999", "Error updating delimiting key: %s (%i)",
+ error_msg ? : "", result);
+ }
+ return result;
+}
+
+/* move items from @node during carry */
+static int carry_shift_data(sideof side /* in what direction to move data */ ,
+ coord_t *insert_coord /* coord where new item
+ * is to be inserted */,
+ znode * node /* node which data are moved from */ ,
+ carry_level * doing /* active carry queue */ ,
+ carry_level * todo /* carry queue where new
+ * operations are to be put
+ * in */ ,
+ unsigned int including_insert_coord_p
+ /* true if @insertion_coord can be moved */ )
+{
+ int result;
+ znode *source;
+ carry_plugin_info info;
+ node_plugin *nplug;
+
+ source = insert_coord->node;
+
+ info.doing = doing;
+ info.todo = todo;
+
+ nplug = node_plugin_by_node(node);
+ result = nplug->shift(insert_coord, node,
+ (side == LEFT_SIDE) ? SHIFT_LEFT : SHIFT_RIGHT, 0,
+ (int)including_insert_coord_p, &info);
+ /* the only error ->shift() method of node plugin can return is
+ -ENOMEM due to carry node/operation allocation. */
+ assert("nikita-915", result >= 0 || result == -ENOMEM);
+ if (result > 0) {
+ /*
+ * if some number of bytes was actually shifted, mark nodes
+ * dirty, and carry level as non-restartable.
+ */
+ doing->restartable = 0;
+ znode_make_dirty(source);
+ znode_make_dirty(node);
+ }
+
+ assert("nikita-2077", coord_check(insert_coord));
+ return 0;
+}
+
+typedef carry_node *(*carry_iterator) (carry_node * node);
+static carry_node *find_dir_carry(carry_node * node, carry_level * level,
+ carry_iterator iterator);
+
+static carry_node *pool_level_list_prev(carry_node *node)
+{
+ return list_entry(node->header.level_linkage.prev, carry_node, header.level_linkage);
+}
+
+/* look for the left neighbor of given carry node in a carry queue.
+
+ This is used by find_left_neighbor(), but I am not sure that this
+ really gives any advantage. More statistics required.
+
+*/
+carry_node *find_left_carry(carry_node * node /* node to find left neighbor
+ * of */ ,
+ carry_level * level/* level to scan */)
+{
+ return find_dir_carry(node, level,
+ (carry_iterator) pool_level_list_prev);
+}
+
+static carry_node *pool_level_list_next(carry_node *node)
+{
+ return list_entry(node->header.level_linkage.next, carry_node, header.level_linkage);
+}
+
+/* look for the right neighbor of given carry node in a
+ carry queue.
+
+ This is used by find_right_neighbor(), but I am not sure that this
+ really gives any advantage. More statistics required.
+
+*/
+carry_node *find_right_carry(carry_node * node /* node to find right neighbor
+ * of */ ,
+ carry_level * level/* level to scan */)
+{
+ return find_dir_carry(node, level,
+ (carry_iterator) pool_level_list_next);
+}
+
+/* look for the left or right neighbor of given carry node in a carry
+ queue.
+
+ Helper function used by find_{left|right}_carry().
+*/
+static carry_node *find_dir_carry(carry_node * node /* node to start
+ * scanning from */ ,
+ carry_level * level /* level to scan */ ,
+ carry_iterator iterator /* operation to
+ * move to the
+ * next node */)
+{
+ carry_node *neighbor;
+
+ assert("nikita-1059", node != NULL);
+ assert("nikita-1060", level != NULL);
+
+	/* scan the list of carry nodes on this level in the given direction,
+	   skipping all carry nodes referencing the same znode. */
+ neighbor = node;
+ while (1) {
+ neighbor = iterator(neighbor);
+ if (carry_node_end(level, neighbor))
+ /* list head is reached */
+ return NULL;
+ if (reiser4_carry_real(neighbor) != reiser4_carry_real(node))
+ return neighbor;
+ }
+}
+
+/*
+ * Memory reservation estimation.
+ *
+ * Carry process proceeds through tree levels upwards. Carry assumes that it
+ * takes tree in consistent state (e.g., that search tree invariants hold),
+ * and leaves tree consistent after it finishes. This means that when some
+ * error occurs carry cannot simply return if there are pending carry
+ * operations. Generic solution for this problem is carry-undo either as
+ * transaction manager feature (requiring checkpoints and isolation), or
+ * through some carry specific mechanism.
+ *
+ * Our current approach is to panic if carry hits an error while tree is
+ * inconsistent. Unfortunately -ENOMEM can easily be triggered. To work around
+ * this "memory reservation" mechanism was added.
+ *
+ * Memory reservation is implemented by perthread-pages.diff patch from
+ * core-patches. Its API is defined in <linux/gfp.h>
+ *
+ * int perthread_pages_reserve(int nrpages, gfp_t gfp);
+ * void perthread_pages_release(int nrpages);
+ * int perthread_pages_count(void);
+ *
+ * carry estimates its worst case memory requirements on entry, reserves
+ * enough memory, and releases unused pages before returning.
+ *
+ * Code below estimates worst case memory requirements for a given carry
+ * queue. This is done by summing worst case memory requirements for each
+ * operation in the queue.
+ *
+ */
+
+/*
+ * Memory requirements of many operations depend on the tree
+ * height. For example, item insertion requires new node to be inserted at
+ * each tree level in the worst case. What tree height should be used for
+ * estimation? Current tree height is wrong, because tree height can change
+ * between the time when estimation was done and the time when operation is
+ * actually performed. Maximal possible tree height (REISER4_MAX_ZTREE_HEIGHT)
+ * is also not desirable, because it would lead to huge over-estimation
+ * all the time. A plausible solution is "capped tree height": if the current tree
+ * height is less than some TREE_HEIGHT_CAP constant, capped tree height is
+ * TREE_HEIGHT_CAP, otherwise it's current tree height. Idea behind this is
+ * that if tree height is TREE_HEIGHT_CAP or larger, it's extremely unlikely
+ * to be increased even more during short interval of time.
+ */
+#define TREE_HEIGHT_CAP (5)
+
+/* return capped tree height for the @tree. See comment above. */
+static int cap_tree_height(reiser4_tree * tree)
+{
+ return max_t(int, tree->height, TREE_HEIGHT_CAP);
+}
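+
+/* For example, with TREE_HEIGHT_CAP of 5 a tree of height 3 is estimated as
+   if it were 5 levels tall, while a tree of height 7 is estimated with its
+   real height of 7. */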
+
+/* return capped tree height for the current tree. */
+static int capped_height(reiser4_tree *tree)
+{
+ return cap_tree_height(tree);
+}
+
+/* return number of pages required to store given number of bytes */
+static int bytes_to_pages(int bytes)
+{
+ return (bytes + PAGE_SIZE - 1) >> PAGE_SHIFT;
+}
+
+/* how many pages are required to allocate znodes during item insertion. */
+static int carry_estimate_znodes(reiser4_tree *tree)
+{
+ /*
+	 * Note that we have a problem here: there is no way to
+	 * reserve pages specifically for a given slab. This means that
+	 * these pages can be hijacked for some other purpose.
+ */
+
+ /* in the worst case we need 3 new znode on each tree level */
+ return bytes_to_pages(capped_height(tree) * sizeof(znode) * 3);
+}
+
+/*
+ * how many pages are required to load bitmaps. One bitmap per level.
+ */
+static int carry_estimate_bitmaps(reiser4_tree *tree)
+{
+ if (reiser4_is_set(reiser4_get_current_sb(), REISER4_DONT_LOAD_BITMAP)) {
+ int bytes;
+
+ bytes = capped_height(tree) * (0 + /* bnode should be added, but
+ * it is private to bitmap.c,
+ * skip for now. */
+ 2 * sizeof(jnode));
+ /* working and commit jnodes */
+ return bytes_to_pages(bytes) + 2; /* and their contents */
+ } else
+ /* bitmaps were pre-loaded during mount */
+ return 0;
+}
+
+/* worst case item insertion memory requirements */
+static int carry_estimate_insert(carry_op *op, carry_level *level,
+ reiser4_tree *tree)
+{
+ return carry_estimate_bitmaps(tree)
+ + carry_estimate_znodes(tree)
+ + 1 /* new atom */
+ + capped_height(tree) /* new block on each level */
+ + 1 /* possibly extra new block at the leaf level */
+ + 3; /* loading of leaves into memory */
+}
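+
+/* Illustrative figure (all numbers are rough): with a capped height of 5,
+   bitmaps pre-loaded at mount (so carry_estimate_bitmaps() contributes
+   nothing) and a page or two covering the 15 worst-case new znodes, a single
+   insertion is budgeted as 1 (atom) + 5 (new block per level) + 1 (extra
+   leaf block) + 3 (loading leaves) plus the znode pages, i.e. on the order
+   of a dozen pages. */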
+
+/* worst case item deletion memory requirements */
+static int carry_estimate_delete(carry_op *op, carry_level *level,
+ reiser4_tree *tree)
+{
+ return carry_estimate_bitmaps(tree)
+ + carry_estimate_znodes(tree)
+ + 1 /* new atom */
+ + 3; /* loading of leaves into memory */
+}
+
+/* worst case tree cut memory requirements */
+static int carry_estimate_cut(carry_op *op, carry_level *level,
+ reiser4_tree *tree)
+{
+ return carry_estimate_bitmaps(tree)
+ + carry_estimate_znodes(tree)
+ + 1 /* new atom */
+ + 3;/* loading of leaves into memory */
+}
+
+/* worst case memory requirements of pasting into item */
+static int carry_estimate_paste(carry_op * op, carry_level * level,
+ reiser4_tree *tree)
+{
+ return carry_estimate_bitmaps(tree)
+ + carry_estimate_znodes(tree)
+ + 1 /* new atom */
+ + capped_height(tree) /* new block on each level */
+ + 1 /* possibly extra new block at the leaf level */
+ + 3; /* loading of leaves into memory */
+}
+
+/* worst case memory requirements of extent insertion */
+static int carry_estimate_extent(carry_op *op, carry_level *level,
+ reiser4_tree *tree)
+{
+ return carry_estimate_insert(op, level, tree) /* insert extent */
+ + carry_estimate_delete(op, level, tree); /* kill leaf */
+}
+
+/* worst case memory requirements of key update */
+static int carry_estimate_update(carry_op *op, carry_level *level,
+ reiser4_tree *tree)
+{
+ return 0;
+}
+
+/* worst case memory requirements of flow insertion */
+static int carry_estimate_insert_flow(carry_op *op, carry_level *level,
+ reiser4_tree *tree)
+{
+ int newnodes;
+
+ newnodes = min(bytes_to_pages(op->u.insert_flow.flow->length),
+ CARRY_FLOW_NEW_NODES_LIMIT);
+ /*
+ * roughly estimate insert_flow as a sequence of insertions.
+ */
+ return newnodes * carry_estimate_insert(op, level, tree);
+}
+
+/* This is dispatch table for carry operations. It can be trivially
+ abstracted into useful plugin: tunable balancing policy is a good
+ thing. */
+carry_op_handler op_dispatch_table[COP_LAST_OP] = {
+ [COP_INSERT] = {
+ .cop_handler = carry_insert,
+ .cop_estimate = carry_estimate_insert}
+ ,
+ [COP_DELETE] = {
+ .cop_handler = carry_delete,
+ .cop_estimate = carry_estimate_delete}
+ ,
+ [COP_CUT] = {
+ .cop_handler = carry_cut,
+ .cop_estimate = carry_estimate_cut}
+ ,
+ [COP_PASTE] = {
+ .cop_handler = carry_paste,
+ .cop_estimate = carry_estimate_paste}
+ ,
+ [COP_EXTENT] = {
+ .cop_handler = carry_extent,
+ .cop_estimate = carry_estimate_extent}
+ ,
+ [COP_UPDATE] = {
+ .cop_handler = carry_update,
+ .cop_estimate = carry_estimate_update}
+ ,
+ [COP_INSERT_FLOW] = {
+ .cop_handler = carry_insert_flow,
+ .cop_estimate = carry_estimate_insert_flow}
+};
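+
+/* The generic carry machinery is expected to dispatch operations through
+   this table, roughly as
+
+	result = op_dispatch_table[op->op].cop_handler(op, doing, todo);
+
+   carry_paste() above uses the same table to restart a failed paste as a
+   COP_INSERT. */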
+
+/* Make Linus happy.
+ Local variables:
+ c-indentation-style: "K&R"
+ mode-name: "LC"
+ c-basic-offset: 8
+ tab-width: 8
+ fill-column: 120
+ scroll-step: 1
+ End:
+*/
diff -urN --no-dereference linux-5.10.2.orig/fs/reiser4/carry_ops.h linux-5.10.2/fs/reiser4/carry_ops.h
--- linux-5.10.2.orig/fs/reiser4/carry_ops.h 1970-01-01 01:00:00.000000000 +0100
+++ linux-5.10.2/fs/reiser4/carry_ops.h 2020-12-23 16:07:46.114813070 +0100
@@ -0,0 +1,45 @@
+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
+ reiser4/README */
+
+/* implementation of carry operations. See carry_ops.c for details. */
+
+#if !defined(__CARRY_OPS_H__)
+#define __CARRY_OPS_H__
+
+#include "forward.h"
+#include "znode.h"
+#include "carry.h"
+
+/* carry operation handlers */
+typedef struct carry_op_handler {
+ /* perform operation */
+ int (*cop_handler)(carry_op *op, carry_level *doing,
+ carry_level *todo);
+ /* estimate memory requirements for @op */
+ int (*cop_estimate)(carry_op *op, carry_level *level,
+ reiser4_tree *tree);
+} carry_op_handler;
+
+/* This is dispatch table for carry operations. It can be trivially
+ abstracted into useful plugin: tunable balancing policy is a good
+ thing. */
+extern carry_op_handler op_dispatch_table[COP_LAST_OP];
+
+unsigned int space_needed(const znode * node, const coord_t *coord,
+ const reiser4_item_data * data, int inserting);
+extern carry_node *find_left_carry(carry_node * node, carry_level * level);
+extern carry_node *find_right_carry(carry_node * node, carry_level * level);
+
+/* __CARRY_OPS_H__ */
+#endif
+
+/* Make Linus happy.
+ Local variables:
+ c-indentation-style: "K&R"
+ mode-name: "LC"
+ c-basic-offset: 8
+ tab-width: 8
+ fill-column: 120
+ scroll-step: 1
+ End:
+*/
diff -urN --no-dereference linux-5.10.2.orig/fs/reiser4/checksum.c linux-5.10.2/fs/reiser4/checksum.c
--- linux-5.10.2.orig/fs/reiser4/checksum.c 1970-01-01 01:00:00.000000000 +0100
+++ linux-5.10.2/fs/reiser4/checksum.c 2020-12-23 16:07:46.114813070 +0100
@@ -0,0 +1,33 @@
+#include <linux/err.h>
+#include "debug.h"
+#include "checksum.h"
+
+int reiser4_init_csum_tfm(struct crypto_shash **tfm)
+{
+ struct crypto_shash *new_tfm;
+
+ new_tfm = crypto_alloc_shash("crc32c", 0, 0);
+ if (IS_ERR(new_tfm)) {
+ warning("intelfx-81", "Could not load crc32c driver");
+ return PTR_ERR(new_tfm);
+ }
+
+ *tfm = new_tfm;
+ return 0;
+}
+
+void reiser4_done_csum_tfm(struct crypto_shash *tfm)
+{
+ crypto_free_shash(tfm);
+}
+
+/*
+ Local variables:
+ c-indentation-style: "K&R"
+ mode-name: "LC"
+ c-basic-offset: 8
+ tab-width: 8
+ fill-column: 120
+ scroll-step: 1
+ End:
+*/
diff -urN --no-dereference linux-5.10.2.orig/fs/reiser4/checksum.h linux-5.10.2/fs/reiser4/checksum.h
--- linux-5.10.2.orig/fs/reiser4/checksum.h 1970-01-01 01:00:00.000000000 +0100
+++ linux-5.10.2/fs/reiser4/checksum.h 2020-12-23 16:07:46.114813070 +0100
@@ -0,0 +1,38 @@
+#ifndef __CHECKSUM__
+#define __CHECKSUM__
+
+#include <crypto/hash.h>
+
+int reiser4_init_csum_tfm(struct crypto_shash **tfm);
+void reiser4_done_csum_tfm(struct crypto_shash *tfm);
+static inline u32 reiser4_crc32c(struct crypto_shash *tfm,
+ u32 crc, const void *address,
+ unsigned int length)
+{
+ struct {
+ struct shash_desc shash;
+ char ctx[4];
+ } desc;
+ int err;
+
+ desc.shash.tfm = tfm;
+ *(u32 *)desc.ctx = crc;
+
+ err = crypto_shash_update(&desc.shash, address, length);
+ BUG_ON(err);
+ return *(u32 *)desc.ctx;
+}
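+
+/* Typical use (sketch; the seed value and buffer names are illustrative):
+
+	u32 crc = reiser4_crc32c(tfm, ~0u, buf, len);
+
+   where @tfm was obtained earlier via reiser4_init_csum_tfm(). */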
+
+#endif /* __CHECKSUM__ */
+
+/*
+ Local variables:
+ c-indentation-style: "K&R"
+ mode-name: "LC"
+ c-basic-offset: 8
+ tab-width: 8
+ fill-column: 120
+ scroll-step: 1
+ End:
+*/
+
diff -urN --no-dereference linux-5.10.2.orig/fs/reiser4/context.c linux-5.10.2/fs/reiser4/context.c
--- linux-5.10.2.orig/fs/reiser4/context.c 1970-01-01 01:00:00.000000000 +0100
+++ linux-5.10.2/fs/reiser4/context.c 2020-12-23 16:08:55.164816614 +0100
@@ -0,0 +1,403 @@
+/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
+
+/* Manipulation of reiser4_context */
+
+/*
+ * global context used during system call. Variable of this type is allocated
+ * on the stack at the beginning of the reiser4 part of the system call and
+ * pointer to it is stored in the current->fs_context. This allows us to avoid
+ * passing pointer to current transaction and current lockstack (both in
+ * one-to-one mapping with threads) all over the call chain.
+ *
+ * It's kind of like those global variables the prof used to tell you not to
+ * use in CS1, except thread specific.;-) Nikita, this was a good idea.
+ *
+ * In some situations it is desirable to have ability to enter reiser4_context
+ * more than once for the same thread (nested contexts). For example, there
+ * are some functions that can be called either directly from VFS/VM or from
+ * already active reiser4 context (->writepage, for example).
+ *
+ * In such situations "child" context acts like dummy: all activity is
+ * actually performed in the top level context, and get_current_context()
+ * always returns top level context.
+ * Of course, reiser4_init_context()/reiser4_done_context() have to be properly
+ * nested anyway.
+ * Note that there is an important difference between reiser4 uses
+ * ->fs_context and the way other file systems use it. Other file systems
+ * (ext3 and reiserfs) use ->fs_context only for the duration of _transaction_
+ * (this is why ->fs_context was initially called ->journal_info). This means,
+ * that when ext3 or reiserfs finds that ->fs_context is not NULL on the entry
+ * to the file system, they assume that some transaction is already underway,
+ * and usually bail out, because starting nested transaction would most likely
+ * lead to the deadlock. This gives false positives with reiser4, because we
+ * set ->fs_context before starting transaction.
+ */
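+
+/*
+ * Illustrative usage of the interfaces defined below:
+ *
+ *	reiser4_context *ctx = reiser4_init_context(sb);
+ *	if (IS_ERR(ctx))
+ *		return PTR_ERR(ctx);
+ *	(reiser4 work, possibly re-entering reiser4_init_context())
+ *	reiser4_exit_context(ctx);
+ *
+ * A nested reiser4_init_context() call for the same superblock returns the
+ * already active top-level context with ->nr_children incremented, and the
+ * matching reiser4_exit_context() merely decrements that counter.
+ */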
+
+#include "debug.h"
+#include "super.h"
+#include "context.h"
+#include "vfs_ops.h" /* for reiser4_throttle_write() */
+#include "plugin/volume/volume.h" /* for METADATA_SUBVOL_ID */
+
+#include <linux/writeback.h> /* for current_is_pdflush() */
+#include <linux/hardirq.h>
+
+/************************ context brick info ************************/
+
+static struct kmem_cache *cbi_slab = NULL;
+
+int ctx_brick_info_init_static(void)
+{
+ assert("edward-1978", cbi_slab == NULL);
+
+ cbi_slab = kmem_cache_create("ctx_brick_info",
+ sizeof(struct ctx_brick_info),
+ 0,
+ SLAB_HWCACHE_ALIGN |
+ SLAB_RECLAIM_ACCOUNT,
+ NULL);
+ if (cbi_slab == NULL)
+ return RETERR(-ENOMEM);
+ return 0;
+}
+
+void ctx_brick_info_done_static(void)
+{
+ destroy_reiser4_cache(&cbi_slab);
+}
+
+struct ctx_brick_info *alloc_context_brick_info(void)
+{
+ return kmem_cache_alloc(cbi_slab, reiser4_ctx_gfp_mask_get());
+}
+
+void free_context_brick_info(struct ctx_brick_info *cbi)
+{
+ assert("edward-1979", cbi != NULL);
+
+ kmem_cache_free(cbi_slab, cbi);
+}
+
+struct ctx_brick_info *find_context_brick_info(reiser4_context *ctx,
+ u32 brick_id)
+{
+ struct rb_root *root = &ctx->bricks_info;
+ struct rb_node *node = root->rb_node;
+
+ while (node) {
+ struct ctx_brick_info *cbi =
+ rb_entry(node, struct ctx_brick_info, node);
+
+ if (cbi->brick_id > brick_id)
+ node = node->rb_left;
+ else if (cbi->brick_id < brick_id)
+ node = node->rb_right;
+ else
+ return cbi;
+ }
+ return NULL;
+}
+
+int insert_context_brick_info(reiser4_context *ctx,
+ struct ctx_brick_info *this)
+{
+ struct rb_root *root = &ctx->bricks_info;
+ struct rb_node *parent = NULL;
+ struct rb_node **new = &(root->rb_node);
+
+ while (*new) {
+ struct ctx_brick_info *cbi;
+
+ cbi = rb_entry(*new, struct ctx_brick_info, node);
+ parent = *new;
+
+ if (this->brick_id < cbi->brick_id)
+ new = &((*new)->rb_left);
+ else if (this->brick_id > cbi->brick_id)
+ new = &((*new)->rb_right);
+ else
+ return -EEXIST;
+ }
+ rb_link_node(&this->node, parent, new);
+ rb_insert_color(&this->node, root);
+
+ return 0;
+}
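+
+/* A hypothetical caller needing per-brick info would look it up first and
+   allocate only on a miss (sketch, error handling elided):
+
+	cbi = find_context_brick_info(ctx, brick_id);
+	if (cbi == NULL) {
+		cbi = alloc_context_brick_info();
+		init_context_brick_info(cbi, brick_id);
+		insert_context_brick_info(ctx, cbi);
+	}
+*/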
+
+static void done_bricks_info(reiser4_context *ctx)
+{
+ struct rb_root *root;
+
+ root = &ctx->bricks_info;
+
+ /*
+ * remove pre-allocated info
+ */
+ rb_erase(&ctx->mcbi.node, root);
+ RB_CLEAR_NODE(&ctx->mcbi.node);
+
+ while (!RB_EMPTY_ROOT(root)) {
+ struct rb_node *node;
+ struct ctx_brick_info *cbi;
+
+ node = rb_first(root);
+ cbi = rb_entry(node, struct ctx_brick_info, node);
+
+ assert("edward-1980", cbi->grabbed_blocks == 0);
+
+ rb_erase(&cbi->node, root);
+ RB_CLEAR_NODE(&cbi->node);
+ free_context_brick_info(cbi);
+ }
+}
+
+static void _reiser4_init_context(reiser4_context *context,
+ struct super_block *super)
+{
+ context->super = super;
+ context->magic = context_magic;
+ context->outer = current->journal_info;
+ current->journal_info = (void *)context;
+ context->nr_children = 0;
+ context->gfp_mask = GFP_KERNEL;
+ /*
+ * init set of per-brick info and populate it
+	 * with pre-allocated item for meta-data brick
+ */
+ context->bricks_info = RB_ROOT;
+ init_context_brick_info(&context->mcbi, METADATA_SUBVOL_ID);
+ insert_context_brick_info(context, &context->mcbi);
+
+ init_lock_stack(&context->stack);
+
+ reiser4_txn_begin(context);
+
+ /* initialize head of tap list */
+ INIT_LIST_HEAD(&context->taps);
+#if REISER4_DEBUG
+ context->task = current;
+#endif
+ grab_space_enable();
+}
+
+/**
+ * initialize context and bind it to the current thread
+ * This function should be called at the beginning of reiser4 part of syscall.
+ */
+reiser4_context *reiser4_init_context(struct super_block *super)
+{
+ reiser4_context *context;
+
+ assert("nikita-2662", !in_interrupt() && !in_irq());
+ assert("nikita-3357", super != NULL);
+ assert("nikita-3358", super->s_op == NULL || is_reiser4_super(super));
+
+ context = get_current_context_check();
+ if (context && context->super == super) {
+ context = (reiser4_context *) current->journal_info;
+ context->nr_children++;
+ return context;
+ }
+ context = kzalloc(sizeof(*context), GFP_KERNEL);
+ if (context == NULL)
+ return ERR_PTR(RETERR(-ENOMEM));
+ _reiser4_init_context(context, super);
+ return context;
+}
+
+/**
+ * This is used in scan_mgr which is called with spinlock held and in
+ * reiser4_fill_super magic.
+ */
+void init_stack_context(reiser4_context *context, struct super_block *super)
+{
+ assert("nikita-2662", !in_interrupt() && !in_irq());
+ assert("nikita-3357", super != NULL);
+ assert("nikita-3358", super->s_op == NULL || is_reiser4_super(super));
+ assert("vs-12", !is_in_reiser4_context());
+
+ memset(context, 0, sizeof(*context));
+ context->on_stack = 1;
+ _reiser4_init_context(context, super);
+}
+
+/* cast lock stack embedded into reiser4 context up to its container */
+reiser4_context *get_context_by_lock_stack(lock_stack * owner)
+{
+ return container_of(owner, reiser4_context, stack);
+}
+
+/* true if there is already _any_ reiser4 context for the current thread */
+int is_in_reiser4_context(void)
+{
+ reiser4_context *ctx;
+
+ ctx = current->journal_info;
+ return ctx != NULL && ((unsigned long)ctx->magic) == context_magic;
+}
+
+/*
+ * call balance dirty pages for the current context.
+ *
+ * File system is expected to call balance_dirty_pages_ratelimited() whenever
+ * it dirties a page. reiser4 does this for unformatted nodes (that is, during
+ * write---this covers vast majority of all dirty traffic), but we cannot do
+ * this immediately when formatted node is dirtied, because long term lock is
+ * usually held at that time. To work around this, dirtying of formatted node
+ * simply increases ->nr_marked_dirty counter in the current reiser4
+ * context. When we are about to leave this context,
+ * balance_dirty_pages_ratelimited() is called, if necessary.
+ *
+ * This introduces another problem: sometimes we do not want to run
+ * balance_dirty_pages_ratelimited() when leaving a context, for example
+ * because some important lock (like ->i_mutex on the parent directory) is
+ * held. To achieve this, ->nobalance flag can be set in the current context.
+ */
+static void reiser4_throttle_write_at(reiser4_context *context)
+{
+ reiser4_super_info_data *sbinfo = get_super_private(context->super);
+
+ /*
+ * call balance_dirty_pages_ratelimited() to process formatted nodes
+ * dirtied during this system call. Do that only if we are not in mount
+ * and there were nodes dirtied in this context and we are not in
+ * writepage (to avoid deadlock) and not in pdflush
+ */
+ if (sbinfo != NULL && sbinfo->fake != NULL &&
+ context->nr_marked_dirty != 0 &&
+ !(current->flags & PF_MEMALLOC) &&
+ !context->flush_bd_task)
+ reiser4_throttle_write(sbinfo->fake);
+}
+
+/* release resources associated with context.
+
+ This function should be called at the end of "session" with reiser4,
+ typically just before leaving reiser4 driver back to VFS.
+
+   This is a good place to put some debugging consistency checks, e.g. that
+   the thread has released all locks and closed its transcrash, etc.
+
+*/
+static void reiser4_done_context(reiser4_context * context)
+ /* context being released */
+{
+ assert("nikita-860", context != NULL);
+ assert("nikita-859", context->magic == context_magic);
+ assert("vs-646", (reiser4_context *) current->journal_info == context);
+ assert("zam-686", !in_interrupt() && !in_irq());
+
+ /* only do anything when leaving top-level reiser4 context. All nested
+ * contexts are just dummies. */
+ if (context->nr_children == 0) {
+ assert("jmacd-673", context->trans == NULL);
+ assert("jmacd-1002", lock_stack_isclean(&context->stack));
+ assert("nikita-1936", reiser4_no_counters_are_held());
+ assert("nikita-2626", list_empty_careful(reiser4_taps_list()));
+ assert("zam-1004", ergo(get_super_private(context->super),
+ get_super_private(context->super)->delete_mutex_owner !=
+ current));
+ /*
+ * release all grabbed but as yet unused blocks
+ */
+ all_grabbed2free();
+ /*
+ * synchronize against longterm_unlock_znode():
+ * wake_up_requestor() wakes up requestors without holding
+ * zlock (otherwise they will immediately bump into that lock
+ * after wake up on another CPU). To work around (rare)
+ * situation where requestor has been woken up asynchronously
+ * and managed to run until completion (and destroy its
+ * context and lock stack) before wake_up_requestor() called
+ * wake_up() on it, wake_up_requestor() synchronize on lock
+ * stack spin lock. It has actually been observed that spin
+ * lock _was_ locked at this point, because
+ * wake_up_requestor() took interrupt.
+ */
+ spin_lock_stack(&context->stack);
+ spin_unlock_stack(&context->stack);
+
+ assert("zam-684", context->nr_children == 0);
+ /*
+ * restore original ->fs_context value
+ */
+ current->journal_info = context->outer;
+ done_bricks_info(context);
+ if (context->on_stack == 0)
+ kfree(context);
+ } else {
+ context->nr_children--;
+#if REISER4_DEBUG
+ assert("zam-685", context->nr_children >= 0);
+#endif
+ }
+}
+
+/*
+ * Exit a reiser4 context. Call reiser4_throttle_write_at() if necessary.
+ * Close the transaction. Call reiser4_done_context() to do context related
+ * book-keeping.
+ */
+void reiser4_exit_context(reiser4_context * context)
+{
+ assert("nikita-3021", reiser4_schedulable());
+
+ if (context->nr_children == 0) {
+ if (!context->nobalance)
+ reiser4_throttle_write_at(context);
+
+		/* If the filesystem is mounted with -o sync or -o dirsync,
+		   commit the transaction. FIXME: TXNH_DONT_COMMIT is used to
+		   avoid committing on exit_context when an inode semaphore is
+		   held, and to have ktxnmgrd do the commit instead, to get
+		   better concurrent filesystem access. But when one mounts
+		   with -o sync, one cares more about reliability than about
+		   performance. So, for now we have this simple mount -o sync
+		   support. */
+ if (context->super->s_flags & (SB_SYNCHRONOUS | SB_DIRSYNC)) {
+ txn_atom *atom;
+
+ atom = get_current_atom_locked_nocheck();
+ if (atom) {
+ atom->flags |= ATOM_FORCE_COMMIT;
+ context->trans->flags &= ~TXNH_DONT_COMMIT;
+ spin_unlock_atom(atom);
+ }
+ }
+ reiser4_txn_end(context);
+ }
+ reiser4_done_context(context);
+}
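+
+/*
+ * Illustrative sketch (not compiled): the typical shape of a reiser4
+ * "session" as seen from a VFS entry point. Everything here except
+ * reiser4_init_context()/reiser4_exit_context() is hypothetical and stands
+ * for whatever tree operation the entry point performs:
+ *
+ *     reiser4_context *ctx;
+ *     int ret;
+ *
+ *     ctx = reiser4_init_context(inode->i_sb);
+ *     if (IS_ERR(ctx))
+ *             return PTR_ERR(ctx);
+ *     ret = do_some_tree_operation(inode);
+ *     reiser4_exit_context(ctx);
+ *     return ret;
+ */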
+
+void reiser4_ctx_gfp_mask_set(void)
+{
+ reiser4_context *ctx;
+
+ ctx = get_current_context();
+ if (ctx->entd == 0 &&
+ list_empty(&ctx->stack.locks) &&
+ ctx->trans->atom == NULL)
+ ctx->gfp_mask = GFP_KERNEL;
+ else
+ ctx->gfp_mask = GFP_NOFS;
+}
+
+void reiser4_ctx_gfp_mask_force(gfp_t mask)
+{
+ reiser4_context *ctx;
+ ctx = get_current_context();
+
+ assert("edward-1454", ctx != NULL);
+
+ ctx->gfp_mask = mask;
+}
+
+/*
+ * Local variables:
+ * c-indentation-style: "K&R"
+ * mode-name: "LC"
+ * c-basic-offset: 8
+ * tab-width: 8
+ * fill-column: 120
+ * scroll-step: 1
+ * End:
+ */
diff -urN --no-dereference linux-5.10.2.orig/fs/reiser4/context.h linux-5.10.2/fs/reiser4/context.h
--- linux-5.10.2.orig/fs/reiser4/context.h 1970-01-01 01:00:00.000000000 +0100
+++ linux-5.10.2/fs/reiser4/context.h 2020-12-23 16:08:55.164816614 +0100
@@ -0,0 +1,277 @@
+/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by
+ * reiser4/README */
+
+/* Reiser4 context. See context.c for details. */
+
+#if !defined( __REISER4_CONTEXT_H__ )
+#define __REISER4_CONTEXT_H__
+
+#include "forward.h"
+#include "debug.h"
+#include "dformat.h"
+#include "tap.h"
+#include "lock.h"
+
+#include <linux/types.h> /* for __u?? */
+#include <linux/fs.h> /* for struct super_block */
+#include <linux/spinlock.h>
+#include <linux/sched.h> /* for struct task_struct */
+#include <linux/rbtree.h>
+
+/*
+ * Info specific for a child (stack element)
+ */
+struct ctx_stack_info {
+ struct ctx_stack_info *next; /* pointer to the next element
+ in the stack */
+ reiser4_subvol *data_subv; /* This is a hint for update_extents(s).
+ Set by set_current_data_subvol().
+ Check by validate_data_reservation().
+ Unset by clear_current_data_subvol()
+ as soon as it is not needed.
+ */
+ /* put here other info specific for reiser4 context nesting level */
+};
+
+/*
+ * Brick-specific part of context
+ */
+struct ctx_brick_info {
+ struct rb_node node;
+ u32 brick_id; /* key */
+ reiser4_block_nr grabbed_blocks;
+};
+
+/* reiser4 per-thread context */
+struct reiser4_context {
+ /* magic constant. For identification of reiser4 contexts. */
+ __u32 magic;
+
+ /* current lock stack. See lock.[ch]. This is where list of all
+ locks taken by current thread is kept. This is also used in
+ deadlock detection. */
+ lock_stack stack;
+
+ /* current transcrash. */
+ txn_handle *trans;
+ /* transaction handle embedded into reiser4_context. ->trans points
+ * here by default. */
+ txn_handle trans_in_ctx;
+
+ /* super block we are working with */
+ struct super_block *super;
+
+ /* parent fs activation */
+ struct fs_activation *outer;
+
+ /* brick-specific parts of the context for all the bricks which
+ participate in the transaction. Sorted by internal brick ID */
+ struct rb_root bricks_info;
+ struct ctx_brick_info mcbi; /* pre-allocated meta-brick info */
+
+ /* list of taps currently monitored. See tap.c */
+ struct list_head taps;
+
+ /* grabbing space is enabled */
+ unsigned int grab_enabled:1;
+	/* should be set when we are writing dirty nodes to disk in jnode_flush
+	 * or reiser4_write_logs() */
+ unsigned int writeout_mode:1;
+ /* true, if current thread is an ent thread */
+ unsigned int entd:1;
+	/* true, if balance_dirty_pages() should not be run when leaving this
+	 * context. This is used to avoid a lengthy balance_dirty_pages()
+	 * operation when holding some important resource, like directory
+	 * ->i_mutex */
+ unsigned int nobalance:1;
+	/* this bit is used in reiser4_done_context() to decide whether the
+	   context was kmalloc-ed and has to be kfree-ed */
+ unsigned int on_stack:1;
+ /* file system is read-only */
+ unsigned int ro:1;
+ /* replacement of PF_FLUSHER */
+ unsigned int flush_bd_task:1;
+
+ /* count non-trivial jnode_set_dirty() calls */
+ unsigned long nr_marked_dirty;
+	/*
+	 * reiser4_writeback_inodes() calls (via generic_writeback_sb_inodes())
+	 * reiser4_writepages_dispatch() for each of the dirty inodes.
+	 * reiser4_writepages_dispatch() captures pages. When the number of
+	 * pages captured in one reiser4_writeback_inodes() call reaches some
+	 * threshold, some atoms get flushed.
+	 */
+ int nr_captured;
+ int nr_children; /* number of child contexts */
+ struct page *locked_page; /* page that should be unlocked in
+ * reiser4_dirty_inode() before taking
+ * a longterm lock (to not violate
+ * reiser4 lock ordering) */
+#if REISER4_DEBUG
+ reiser4_lock_cnt_info locks; /* debugging information about reiser4
+ locks held by the current thread */
+ struct task_struct *task; /* so we can easily find owner of the stack */
+ struct list_head flushers_link; /* list of all threads doing
+ flush currently */
+ err_site err; /* information about last error encountered by reiser4 */
+#endif
+ void *vp;
+ gfp_t gfp_mask;
+};
+
+extern reiser4_context *get_context_by_lock_stack(lock_stack *);
+extern int ctx_brick_info_init_static(void);
+extern void ctx_brick_info_done_static(void);
+extern int ctx_stack_info_init_static(void);
+extern void ctx_stack_info_done_static(void);
+extern reiser4_subvol *get_current_data_subvol(void);
+extern void set_current_data_subvol(reiser4_subvol *subv);
+extern void clear_current_data_subvol(void);
+extern struct ctx_brick_info *find_context_brick_info(reiser4_context *ctx,
+ u32 brick_id);
+extern int insert_context_brick_info(reiser4_context *ctx,
+ struct ctx_brick_info *data);
+extern struct ctx_brick_info *alloc_context_brick_info(void);
+extern void free_context_brick_info(struct ctx_brick_info *cbi);
+
+static inline struct ctx_brick_info *
+context_meta_brick_info(reiser4_context *ctx)
+{
+ return &ctx->mcbi;
+}
+
+static inline void init_context_brick_info(struct ctx_brick_info *cbi,
+ u32 brick_id)
+{
+ memset(cbi, 0, sizeof(*cbi));
+ RB_CLEAR_NODE(&cbi->node);
+ cbi->brick_id = brick_id;
+}
+
+/* Debugging helpers. */
+#if REISER4_DEBUG
+extern void print_contexts(void);
+#endif
+
+#define current_blocksize reiser4_get_current_sb()->s_blocksize
+#define current_blocksize_bits reiser4_get_current_sb()->s_blocksize_bits
+#define current_tree(subvol_id) (&(current_origin(subvol_id)->tree))
+
+extern reiser4_context *reiser4_init_context(struct super_block *);
+extern void init_stack_context(reiser4_context *, struct super_block *);
+extern void reiser4_exit_context(reiser4_context *);
+
+/* magic constant we store in reiser4_context allocated on the stack. Used to
+   catch accesses to stale or uninitialized contexts. */
+#define context_magic ((__u32) 0x4b1b5d0b)
+
+extern int is_in_reiser4_context(void);
+
+/*
+ * return reiser4_context for the thread @tsk
+ */
+static inline reiser4_context *get_context(const struct task_struct *tsk)
+{
+ assert("vs-1682",
+ ((reiser4_context *) tsk->journal_info)->magic == context_magic);
+ return (reiser4_context *) tsk->journal_info;
+}
+
+/*
+ * return reiser4 context of the current thread, or NULL if there is none.
+ */
+static inline reiser4_context *get_current_context_check(void)
+{
+ if (is_in_reiser4_context())
+ return get_context(current);
+ else
+ return NULL;
+}
+
+static inline reiser4_context *get_current_context(void); /* __attribute__((const)); */
+
+/* return context associated with current thread */
+static inline reiser4_context *get_current_context(void)
+{
+ return get_context(current);
+}
+
+static inline gfp_t reiser4_ctx_gfp_mask_get(void)
+{
+ reiser4_context *ctx;
+
+ ctx = get_current_context_check();
+ return (ctx == NULL) ? GFP_KERNEL : ctx->gfp_mask;
+}
+
+void reiser4_ctx_gfp_mask_set(void);
+void reiser4_ctx_gfp_mask_force(gfp_t mask);
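+
+/*
+ * Illustrative sketch (not compiled): reiser4_ctx_gfp_mask_get() is meant to
+ * be passed to memory allocators, so that allocations made while an atom or
+ * long-term locks are held use GFP_NOFS rather than GFP_KERNEL. The helper
+ * below is hypothetical:
+ *
+ *     static void *alloc_scratch_buffer(size_t size)
+ *     {
+ *             return kmalloc(size, reiser4_ctx_gfp_mask_get());
+ *     }
+ */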
+
+/*
+ * true if current thread is in the write-out mode. Thread enters write-out
+ * mode during jnode_flush and reiser4_write_logs().
+ */
+static inline int is_writeout_mode(void)
+{
+ return get_current_context()->writeout_mode;
+}
+
+/*
+ * enter write-out mode
+ */
+static inline void writeout_mode_enable(void)
+{
+ assert("zam-941", !get_current_context()->writeout_mode);
+ get_current_context()->writeout_mode = 1;
+}
+
+/*
+ * leave write-out mode
+ */
+static inline void writeout_mode_disable(void)
+{
+ assert("zam-942", get_current_context()->writeout_mode);
+ get_current_context()->writeout_mode = 0;
+}
+
+static inline void grab_space_enable(void)
+{
+ get_current_context()->grab_enabled = 1;
+}
+
+static inline void grab_space_disable(void)
+{
+ get_current_context()->grab_enabled = 0;
+}
+
+static inline void grab_space_set_enabled(int enabled)
+{
+ get_current_context()->grab_enabled = enabled;
+}
+
+static inline int is_grab_enabled(reiser4_context * ctx)
+{
+ return ctx->grab_enabled;
+}
+
+/* mark transaction handle in @ctx as TXNH_DONT_COMMIT, so that no commit or
+ * flush would be performed when it is closed. This is necessary when handle
+ * has to be closed under some coarse semaphore, like i_mutex of
+ * directory. Commit will be performed by ktxnmgrd. */
+static inline void context_set_commit_async(reiser4_context * context)
+{
+ context->nobalance = 1;
+ context->trans->flags |= TXNH_DONT_COMMIT;
+}
+
+/* __REISER4_CONTEXT_H__ */
+#endif
+
+/* Make Linus happy.
+ Local variables:
+ c-indentation-style: "K&R"
+ mode-name: "LC"
+ c-basic-offset: 8
+ tab-width: 8
+ fill-column: 120
+ scroll-step: 1
+ End:
+*/
diff -urN --no-dereference linux-5.10.2.orig/fs/reiser4/coord.c linux-5.10.2/fs/reiser4/coord.c
--- linux-5.10.2.orig/fs/reiser4/coord.c 1970-01-01 01:00:00.000000000 +0100
+++ linux-5.10.2/fs/reiser4/coord.c 2020-12-23 16:07:46.115813085 +0100
@@ -0,0 +1,931 @@
+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
+ reiser4/README */
+
+#include "forward.h"
+#include "debug.h"
+#include "dformat.h"
+#include "tree.h"
+#include "plugin/item/item.h"
+#include "znode.h"
+#include "coord.h"
+
+/* Internal constructor. */
+static inline void
+coord_init_values(coord_t *coord, const znode * node, pos_in_node_t item_pos,
+ pos_in_node_t unit_pos, between_enum between)
+{
+ coord->node = (znode *) node;
+ coord_set_item_pos(coord, item_pos);
+ coord->unit_pos = unit_pos;
+ coord->between = between;
+ ON_DEBUG(coord->plug_v = 0);
+ ON_DEBUG(coord->body_v = 0);
+
+ /*ON_TRACE (TRACE_COORDS, "init coord %p node %p: %u %u %s\n", coord,
+ node, item_pos, unit_pos, coord_tween_tostring (between)); */
+}
+
+/* after shifting of node content, coord previously set properly may become
+ invalid, try to "normalize" it. */
+void coord_normalize(coord_t *coord)
+{
+ znode *node;
+
+ node = coord->node;
+ assert("vs-683", node);
+
+ coord_clear_iplug(coord);
+
+ if (node_is_empty(node)) {
+ coord_init_first_unit(coord, node);
+ } else if ((coord->between == AFTER_ITEM)
+ || (coord->between == AFTER_UNIT)) {
+ return;
+ } else if (coord->item_pos == coord_num_items(coord)
+ && coord->between == BEFORE_ITEM) {
+ coord_dec_item_pos(coord);
+ coord->between = AFTER_ITEM;
+ } else if (coord->unit_pos == coord_num_units(coord)
+ && coord->between == BEFORE_UNIT) {
+ coord->unit_pos--;
+ coord->between = AFTER_UNIT;
+ } else if (coord->item_pos == coord_num_items(coord)
+ && coord->unit_pos == 0 && coord->between == BEFORE_UNIT) {
+ coord_dec_item_pos(coord);
+ coord->unit_pos = 0;
+ coord->between = AFTER_ITEM;
+ }
+}
+
+/* Copy a coordinate. */
+void coord_dup(coord_t *coord, const coord_t *old_coord)
+{
+ assert("jmacd-9800", coord_check(old_coord));
+ coord_dup_nocheck(coord, old_coord);
+}
+
+/* Copy a coordinate without check. Useful when old_coord->node is not
+ loaded. As in cbk_tree_lookup -> connect_znode -> connect_one_side */
+void coord_dup_nocheck(coord_t *coord, const coord_t *old_coord)
+{
+ coord->node = old_coord->node;
+ coord_set_item_pos(coord, old_coord->item_pos);
+ coord->unit_pos = old_coord->unit_pos;
+ coord->between = old_coord->between;
+ coord->iplugid = old_coord->iplugid;
+ ON_DEBUG(coord->plug_v = old_coord->plug_v);
+ ON_DEBUG(coord->body_v = old_coord->body_v);
+}
+
+/* Initialize an invalid coordinate. */
+void coord_init_invalid(coord_t *coord, const znode * node)
+{
+ coord_init_values(coord, node, 0, 0, INVALID_COORD);
+}
+
+void coord_init_first_unit_nocheck(coord_t *coord, const znode * node)
+{
+ coord_init_values(coord, node, 0, 0, AT_UNIT);
+}
+
+/* Initialize a coordinate to point at the first unit of the first item. If the
+ node is empty, it is positioned at the EMPTY_NODE. */
+void coord_init_first_unit(coord_t *coord, const znode * node)
+{
+ int is_empty = node_is_empty(node);
+
+ coord_init_values(coord, node, 0, 0, (is_empty ? EMPTY_NODE : AT_UNIT));
+
+ assert("jmacd-9801", coord_check(coord));
+}
+
+/* Initialize a coordinate to point at the last unit of the last item. If the
+ node is empty, it is positioned at the EMPTY_NODE. */
+void coord_init_last_unit(coord_t *coord, const znode * node)
+{
+ int is_empty = node_is_empty(node);
+
+ coord_init_values(coord, node,
+ (is_empty ? 0 : node_num_items(node) - 1), 0,
+ (is_empty ? EMPTY_NODE : AT_UNIT));
+ if (!is_empty)
+ coord->unit_pos = coord_last_unit_pos(coord);
+ assert("jmacd-9802", coord_check(coord));
+}
+
+/* Initialize a coordinate to before the first item. If the node is empty, it is
+ positioned at the EMPTY_NODE. */
+void coord_init_before_first_item(coord_t *coord, const znode * node)
+{
+ int is_empty = node_is_empty(node);
+
+ coord_init_values(coord, node, 0, 0,
+ (is_empty ? EMPTY_NODE : BEFORE_UNIT));
+
+ assert("jmacd-9803", coord_check(coord));
+}
+
+/* Initialize a coordinate to after the last item. If the node is empty, it is
+ positioned at the EMPTY_NODE. */
+void coord_init_after_last_item(coord_t *coord, const znode * node)
+{
+ int is_empty = node_is_empty(node);
+
+ coord_init_values(coord, node,
+ (is_empty ? 0 : node_num_items(node) - 1), 0,
+ (is_empty ? EMPTY_NODE : AFTER_ITEM));
+
+ assert("jmacd-9804", coord_check(coord));
+}
+
+/* Initialize a coordinate to after last unit in the item. Coord must be set
+ already to existing item */
+void coord_init_after_item_end(coord_t *coord)
+{
+ coord->between = AFTER_UNIT;
+ coord->unit_pos = coord_last_unit_pos(coord);
+}
+
+/* Initialize a coordinate to before the item. Coord must be set already to
+ existing item */
+void coord_init_before_item(coord_t *coord)
+{
+ coord->unit_pos = 0;
+ coord->between = BEFORE_ITEM;
+}
+
+/* Initialize a coordinate to after the item. Coord must be set already to
+ existing item */
+void coord_init_after_item(coord_t *coord)
+{
+ coord->unit_pos = 0;
+ coord->between = AFTER_ITEM;
+}
+
+/* Initialize a coordinate with zeros. Used in places where init_coord was
+   used and it was not clear how the coordinate should actually be
+   initialized */
+void coord_init_zero(coord_t *coord)
+{
+ memset(coord, 0, sizeof(*coord));
+}
+
+/* Return the number of units at the present item.
+ Asserts coord_is_existing_item(). */
+unsigned coord_num_units(const coord_t *coord)
+{
+ assert("jmacd-9806", coord_is_existing_item(coord));
+
+ return item_plugin_by_coord(coord)->b.nr_units(coord);
+}
+
+/* Returns true if the coord was initialized by coord_init_invalid(). */
+/* Audited by: green(2002.06.15) */
+int coord_is_invalid(const coord_t *coord)
+{
+ return coord->between == INVALID_COORD;
+}
+
+/* Returns true if the coordinate is positioned at an existing item, not before
+ or after an item. It may be placed at, before, or after any unit within the
+ item, whether existing or not. */
+int coord_is_existing_item(const coord_t *coord)
+{
+ switch (coord->between) {
+ case EMPTY_NODE:
+ case BEFORE_ITEM:
+ case AFTER_ITEM:
+ case INVALID_COORD:
+ return 0;
+
+ case BEFORE_UNIT:
+ case AT_UNIT:
+ case AFTER_UNIT:
+ return coord->item_pos < coord_num_items(coord);
+ }
+
+ impossible("jmacd-9900", "unreachable coord: %p", coord);
+ return 0;
+}
+
+/* Returns true if the coordinate is positioned at an existing unit, not before
+ or after a unit. */
+/* Audited by: green(2002.06.15) */
+int coord_is_existing_unit(const coord_t *coord)
+{
+ switch (coord->between) {
+ case EMPTY_NODE:
+ case BEFORE_UNIT:
+ case AFTER_UNIT:
+ case BEFORE_ITEM:
+ case AFTER_ITEM:
+ case INVALID_COORD:
+ return 0;
+
+ case AT_UNIT:
+ return (coord->item_pos < coord_num_items(coord)
+ && coord->unit_pos < coord_num_units(coord));
+ }
+
+ impossible("jmacd-9902", "unreachable");
+ return 0;
+}
+
+/* Returns true if the coordinate is positioned at the first unit of the first
+ item. Not true for empty nodes nor coordinates positioned before the first
+ item. */
+/* Audited by: green(2002.06.15) */
+int coord_is_leftmost_unit(const coord_t *coord)
+{
+ return (coord->between == AT_UNIT && coord->item_pos == 0
+ && coord->unit_pos == 0);
+}
+
+#if REISER4_DEBUG
+/* For assertions only, checks for a valid coordinate. */
+int coord_check(const coord_t *coord)
+{
+ if (coord->node == NULL)
+ return 0;
+ if (znode_above_root(coord->node))
+ return 1;
+
+ switch (coord->between) {
+ default:
+ case INVALID_COORD:
+ return 0;
+ case EMPTY_NODE:
+ if (!node_is_empty(coord->node))
+ return 0;
+ return coord->item_pos == 0 && coord->unit_pos == 0;
+
+ case BEFORE_UNIT:
+ case AFTER_UNIT:
+ if (node_is_empty(coord->node) && (coord->item_pos == 0)
+ && (coord->unit_pos == 0))
+ return 1;
+ case AT_UNIT:
+ break;
+ case AFTER_ITEM:
+ case BEFORE_ITEM:
+ /* before/after item should not set unit_pos. */
+ if (coord->unit_pos != 0)
+ return 0;
+ break;
+ }
+
+ if (coord->item_pos >= node_num_items(coord->node))
+ return 0;
+
+ /* FIXME-VS: we are going to check unit_pos. This makes no sense when
+ between is set either AFTER_ITEM or BEFORE_ITEM */
+ if (coord->between == AFTER_ITEM || coord->between == BEFORE_ITEM)
+ return 1;
+
+ if (coord_is_iplug_set(coord) &&
+ coord->unit_pos >
+ item_plugin_by_coord(coord)->b.nr_units(coord) - 1)
+ return 0;
+ return 1;
+}
+#endif
+
+/* Adjust coordinate boundaries based on the number of items prior to
+   coord_next/prev. Returns 1 if the new position does not exist. */
+static int coord_adjust_items(coord_t *coord, unsigned items, int is_next)
+{
+ /* If the node is invalid, leave it. */
+ if (coord->between == INVALID_COORD)
+ return 1;
+
+ /* If the node is empty, set it appropriately. */
+ if (items == 0) {
+ coord->between = EMPTY_NODE;
+ coord_set_item_pos(coord, 0);
+ coord->unit_pos = 0;
+ return 1;
+ }
+
+ /* If it was empty and it no longer is, set to BEFORE/AFTER_ITEM. */
+ if (coord->between == EMPTY_NODE) {
+ coord->between = (is_next ? BEFORE_ITEM : AFTER_ITEM);
+ coord_set_item_pos(coord, 0);
+ coord->unit_pos = 0;
+ return 0;
+ }
+
+	/* If the item_pos is out-of-range, set it appropriately. */
+ if (coord->item_pos >= items) {
+ coord->between = AFTER_ITEM;
+ coord_set_item_pos(coord, items - 1);
+ coord->unit_pos = 0;
+ /* If is_next, return 1 (can't go any further). */
+ return is_next;
+ }
+
+ return 0;
+}
+
+/**
+ * Advances the coordinate by one unit to the right in a node.
+ * If the node is empty, no change. If @coord is the rightmost unit
+ * in the node, advances to AFTER THE LAST ITEM. Returns 0 if the
+ * new position is an existing unit.
+ */
+int coord_next_unit(coord_t *coord)
+{
+ unsigned items = coord_num_items(coord);
+
+ if (coord_adjust_items(coord, items, 1) == 1)
+ return 1;
+
+ switch (coord->between) {
+ case BEFORE_UNIT:
+ /* Now it is positioned at the same unit. */
+ coord->between = AT_UNIT;
+ return 0;
+
+ case AFTER_UNIT:
+ case AT_UNIT:
+ /* If it was at or after a unit and there are more units in this
+ item, advance to the next one. */
+ if (coord->unit_pos < coord_last_unit_pos(coord)) {
+ coord->unit_pos += 1;
+ coord->between = AT_UNIT;
+ return 0;
+ }
+
+ /* Otherwise, it is crossing an item boundary and treated as if
+ it was after the current item. */
+ coord->between = AFTER_ITEM;
+ coord->unit_pos = 0;
+ /* FALLTHROUGH */
+
+ case AFTER_ITEM:
+ /* Check for end-of-node. */
+ if (coord->item_pos == items - 1)
+ return 1;
+
+ coord_inc_item_pos(coord);
+ coord->unit_pos = 0;
+ coord->between = AT_UNIT;
+ return 0;
+
+ case BEFORE_ITEM:
+ /* The adjust_items checks ensure that we are valid here. */
+ coord->unit_pos = 0;
+ coord->between = AT_UNIT;
+ return 0;
+
+ case INVALID_COORD:
+ case EMPTY_NODE:
+ /* Handled in coord_adjust_items(). */
+ break;
+ }
+
+ impossible("jmacd-9902", "unreachable");
+ return 0;
+}
+
+/* Advances the coordinate by one item to the right. If empty, no change. If
+ coord_is_rightmost_unit, advances to AFTER THE LAST ITEM. Returns 0 if new
+ position is an existing item. */
+int coord_next_item(coord_t *coord)
+{
+ unsigned items = coord_num_items(coord);
+
+ if (coord_adjust_items(coord, items, 1) == 1)
+ return 1;
+
+ switch (coord->between) {
+ case AFTER_UNIT:
+ case AT_UNIT:
+ case BEFORE_UNIT:
+ case AFTER_ITEM:
+ /* Check for end-of-node. */
+ if (coord->item_pos == items - 1) {
+ coord->between = AFTER_ITEM;
+ coord->unit_pos = 0;
+ coord_clear_iplug(coord);
+ return 1;
+ }
+
+ /* Anywhere in an item, go to the next one. */
+ coord->between = AT_UNIT;
+ coord_inc_item_pos(coord);
+ coord->unit_pos = 0;
+ return 0;
+
+ case BEFORE_ITEM:
+ /* The out-of-range check ensures that we are valid here. */
+ coord->unit_pos = 0;
+ coord->between = AT_UNIT;
+ return 0;
+ case INVALID_COORD:
+ case EMPTY_NODE:
+ /* Handled in coord_adjust_items(). */
+ break;
+ }
+
+ impossible("jmacd-9903", "unreachable");
+ return 0;
+}
+
+/* Advances the coordinate by one unit to the left. If empty, no change. If
+ coord_is_leftmost_unit, advances to BEFORE THE FIRST ITEM. Returns 0 if new
+ position is an existing unit. */
+int coord_prev_unit(coord_t *coord)
+{
+ unsigned items = coord_num_items(coord);
+
+ if (coord_adjust_items(coord, items, 0) == 1)
+ return 1;
+
+ switch (coord->between) {
+ case AT_UNIT:
+ case BEFORE_UNIT:
+ if (coord->unit_pos > 0) {
+ coord->unit_pos -= 1;
+ coord->between = AT_UNIT;
+ return 0;
+ }
+
+ if (coord->item_pos == 0) {
+ coord->between = BEFORE_ITEM;
+ return 1;
+ }
+
+ coord_dec_item_pos(coord);
+ coord->unit_pos = coord_last_unit_pos(coord);
+ coord->between = AT_UNIT;
+ return 0;
+
+ case AFTER_UNIT:
+ /* What if unit_pos is out-of-range? */
+ assert("jmacd-5442",
+ coord->unit_pos <= coord_last_unit_pos(coord));
+ coord->between = AT_UNIT;
+ return 0;
+
+ case BEFORE_ITEM:
+ if (coord->item_pos == 0)
+ return 1;
+
+ coord_dec_item_pos(coord);
+ /* FALLTHROUGH */
+
+ case AFTER_ITEM:
+ coord->between = AT_UNIT;
+ coord->unit_pos = coord_last_unit_pos(coord);
+ return 0;
+
+ case INVALID_COORD:
+ case EMPTY_NODE:
+ break;
+ }
+
+ impossible("jmacd-9904", "unreachable");
+ return 0;
+}
+
+/* Advances the coordinate by one item to the left. If empty, no change. If
+ coord_is_leftmost_unit, advances to BEFORE THE FIRST ITEM. Returns 0 if new
+ position is an existing item. */
+int coord_prev_item(coord_t *coord)
+{
+ unsigned items = coord_num_items(coord);
+
+ if (coord_adjust_items(coord, items, 0) == 1)
+ return 1;
+
+ switch (coord->between) {
+ case AT_UNIT:
+ case AFTER_UNIT:
+ case BEFORE_UNIT:
+ case BEFORE_ITEM:
+
+ if (coord->item_pos == 0) {
+ coord->between = BEFORE_ITEM;
+ coord->unit_pos = 0;
+ return 1;
+ }
+
+ coord_dec_item_pos(coord);
+ coord->unit_pos = 0;
+ coord->between = AT_UNIT;
+ return 0;
+
+ case AFTER_ITEM:
+ coord->between = AT_UNIT;
+ coord->unit_pos = 0;
+ return 0;
+
+ case INVALID_COORD:
+ case EMPTY_NODE:
+ break;
+ }
+
+ impossible("jmacd-9905", "unreachable");
+ return 0;
+}
+
+/* Calls either coord_init_first_unit or coord_init_last_unit depending on
+ sideof argument. */
+void coord_init_sideof_unit(coord_t *coord, const znode * node, sideof dir)
+{
+ assert("jmacd-9821", dir == LEFT_SIDE || dir == RIGHT_SIDE);
+ if (dir == LEFT_SIDE) {
+ coord_init_first_unit(coord, node);
+ } else {
+ coord_init_last_unit(coord, node);
+ }
+}
+
+/* Calls either coord_is_before_leftmost or coord_is_after_rightmost depending
+ on sideof argument. */
+/* Audited by: green(2002.06.15) */
+int coord_is_after_sideof_unit(coord_t *coord, sideof dir)
+{
+ assert("jmacd-9822", dir == LEFT_SIDE || dir == RIGHT_SIDE);
+ if (dir == LEFT_SIDE) {
+ return coord_is_before_leftmost(coord);
+ } else {
+ return coord_is_after_rightmost(coord);
+ }
+}
+
+/* Calls either coord_next_unit or coord_prev_unit depending on sideof argument.
+ */
+/* Audited by: green(2002.06.15) */
+int coord_sideof_unit(coord_t *coord, sideof dir)
+{
+ assert("jmacd-9823", dir == LEFT_SIDE || dir == RIGHT_SIDE);
+ if (dir == LEFT_SIDE) {
+ return coord_prev_unit(coord);
+ } else {
+ return coord_next_unit(coord);
+ }
+}
+
+#if REISER4_DEBUG
+int coords_equal(const coord_t *c1, const coord_t *c2)
+{
+ assert("nikita-2840", c1 != NULL);
+ assert("nikita-2841", c2 != NULL);
+
+ return
+ c1->node == c2->node &&
+ c1->item_pos == c2->item_pos &&
+ c1->unit_pos == c2->unit_pos && c1->between == c2->between;
+}
+#endif /* REISER4_DEBUG */
+
+/* If coord_is_before_leftmost() return COORD_ON_THE_LEFT, if
+   coord_is_after_rightmost() return COORD_ON_THE_RIGHT, otherwise return
+   COORD_INSIDE. */
+/* Audited by: green(2002.06.15) */
+coord_wrt_node coord_wrt(const coord_t *coord)
+{
+ if (coord_is_before_leftmost(coord))
+ return COORD_ON_THE_LEFT;
+
+ if (coord_is_after_rightmost(coord))
+ return COORD_ON_THE_RIGHT;
+
+ return COORD_INSIDE;
+}
+
+/* Returns true if the coordinate is positioned after the last item or after the
+ last unit of the last item or it is an empty node. */
+/* Audited by: green(2002.06.15) */
+int coord_is_after_rightmost(const coord_t *coord)
+{
+ assert("jmacd-7313", coord_check(coord));
+
+ switch (coord->between) {
+ case INVALID_COORD:
+ case AT_UNIT:
+ case BEFORE_UNIT:
+ case BEFORE_ITEM:
+ return 0;
+
+ case EMPTY_NODE:
+ return 1;
+
+ case AFTER_ITEM:
+ return (coord->item_pos == node_num_items(coord->node) - 1);
+
+ case AFTER_UNIT:
+ return ((coord->item_pos == node_num_items(coord->node) - 1) &&
+ coord->unit_pos == coord_last_unit_pos(coord));
+ }
+
+ impossible("jmacd-9908", "unreachable");
+ return 0;
+}
+
+/* Returns true if the coordinate is positioned before the first item or it is
+ an empty node. */
+int coord_is_before_leftmost(const coord_t *coord)
+{
+ /* FIXME-VS: coord_check requires node to be loaded whereas it is not
+ necessary to check if coord is set before leftmost
+ assert ("jmacd-7313", coord_check (coord)); */
+ switch (coord->between) {
+ case INVALID_COORD:
+ case AT_UNIT:
+ case AFTER_ITEM:
+ case AFTER_UNIT:
+ return 0;
+
+ case EMPTY_NODE:
+ return 1;
+
+ case BEFORE_ITEM:
+ case BEFORE_UNIT:
+ return (coord->item_pos == 0) && (coord->unit_pos == 0);
+ }
+
+ impossible("jmacd-9908", "unreachable");
+ return 0;
+}
+
+/* Returns true if the coordinate is positioned after an item, before an item,
+   after the last unit of an item, before the first unit of an item, or at an
+   empty node. */
+/* Audited by: green(2002.06.15) */
+int coord_is_between_items(const coord_t *coord)
+{
+ assert("jmacd-7313", coord_check(coord));
+
+ switch (coord->between) {
+ case INVALID_COORD:
+ case AT_UNIT:
+ return 0;
+
+ case AFTER_ITEM:
+ case BEFORE_ITEM:
+ case EMPTY_NODE:
+ return 1;
+
+ case BEFORE_UNIT:
+ return coord->unit_pos == 0;
+
+ case AFTER_UNIT:
+ return coord->unit_pos == coord_last_unit_pos(coord);
+ }
+
+ impossible("jmacd-9908", "unreachable");
+ return 0;
+}
+
+#if REISER4_DEBUG
+/* Returns true if the coordinates are positioned at adjacent units, regardless
+ of before-after or item boundaries. */
+int coord_are_neighbors(coord_t *c1, coord_t *c2)
+{
+ coord_t *left;
+ coord_t *right;
+
+ assert("nikita-1241", c1 != NULL);
+ assert("nikita-1242", c2 != NULL);
+ assert("nikita-1243", c1->node == c2->node);
+ assert("nikita-1244", coord_is_existing_unit(c1));
+ assert("nikita-1245", coord_is_existing_unit(c2));
+
+ left = right = NULL;
+ switch (coord_compare(c1, c2)) {
+ case COORD_CMP_ON_LEFT:
+ left = c1;
+ right = c2;
+ break;
+ case COORD_CMP_ON_RIGHT:
+ left = c2;
+ right = c1;
+ break;
+ case COORD_CMP_SAME:
+ return 0;
+ default:
+ wrong_return_value("nikita-1246", "compare_coords()");
+ }
+ assert("vs-731", left && right);
+ if (left->item_pos == right->item_pos) {
+ return left->unit_pos + 1 == right->unit_pos;
+ } else if (left->item_pos + 1 == right->item_pos) {
+ return (left->unit_pos == coord_last_unit_pos(left))
+ && (right->unit_pos == 0);
+ } else {
+ return 0;
+ }
+}
+#endif /* REISER4_DEBUG */
+
+/* Assuming two coordinates are positioned in the same node, return
+ COORD_CMP_ON_RIGHT, COORD_CMP_ON_LEFT, or COORD_CMP_SAME depending on c1's
+ position relative to c2. */
+/* Audited by: green(2002.06.15) */
+coord_cmp coord_compare(coord_t *c1, coord_t *c2)
+{
+ assert("vs-209", c1->node == c2->node);
+ assert("vs-194", coord_is_existing_unit(c1)
+ && coord_is_existing_unit(c2));
+
+ if (c1->item_pos > c2->item_pos)
+ return COORD_CMP_ON_RIGHT;
+ if (c1->item_pos < c2->item_pos)
+ return COORD_CMP_ON_LEFT;
+ if (c1->unit_pos > c2->unit_pos)
+ return COORD_CMP_ON_RIGHT;
+ if (c1->unit_pos < c2->unit_pos)
+ return COORD_CMP_ON_LEFT;
+ return COORD_CMP_SAME;
+}
+
+/* If the coordinate is between items, shifts it to the right. Returns 0 on
+ success and non-zero if there is no position to the right. */
+int coord_set_to_right(coord_t *coord)
+{
+ unsigned items = coord_num_items(coord);
+
+ if (coord_adjust_items(coord, items, 1) == 1)
+ return 1;
+
+ switch (coord->between) {
+ case AT_UNIT:
+ return 0;
+
+ case BEFORE_ITEM:
+ case BEFORE_UNIT:
+ coord->between = AT_UNIT;
+ return 0;
+
+ case AFTER_UNIT:
+ if (coord->unit_pos < coord_last_unit_pos(coord)) {
+ coord->unit_pos += 1;
+ coord->between = AT_UNIT;
+ return 0;
+ } else {
+
+ coord->unit_pos = 0;
+
+ if (coord->item_pos == items - 1) {
+ coord->between = AFTER_ITEM;
+ return 1;
+ }
+
+ coord_inc_item_pos(coord);
+ coord->between = AT_UNIT;
+ return 0;
+ }
+
+ case AFTER_ITEM:
+ if (coord->item_pos == items - 1)
+ return 1;
+
+ coord_inc_item_pos(coord);
+ coord->unit_pos = 0;
+ coord->between = AT_UNIT;
+ return 0;
+
+ case EMPTY_NODE:
+ return 1;
+
+ case INVALID_COORD:
+ break;
+ }
+
+ impossible("jmacd-9920", "unreachable");
+ return 0;
+}
+
+/* If the coordinate is between items, shifts it to the left. Returns 0 on
+ success and non-zero if there is no position to the left. */
+int coord_set_to_left(coord_t *coord)
+{
+ unsigned items = coord_num_items(coord);
+
+ if (coord_adjust_items(coord, items, 0) == 1)
+ return 1;
+
+ switch (coord->between) {
+ case AT_UNIT:
+ return 0;
+
+ case AFTER_UNIT:
+ coord->between = AT_UNIT;
+ return 0;
+
+ case AFTER_ITEM:
+ coord->between = AT_UNIT;
+ coord->unit_pos = coord_last_unit_pos(coord);
+ return 0;
+
+ case BEFORE_UNIT:
+ if (coord->unit_pos > 0) {
+ coord->unit_pos -= 1;
+ coord->between = AT_UNIT;
+ return 0;
+ } else {
+
+ if (coord->item_pos == 0) {
+ coord->between = BEFORE_ITEM;
+ return 1;
+ }
+
+ coord->unit_pos = coord_last_unit_pos(coord);
+ coord_dec_item_pos(coord);
+ coord->between = AT_UNIT;
+ return 0;
+ }
+
+ case BEFORE_ITEM:
+ if (coord->item_pos == 0)
+ return 1;
+
+ coord_dec_item_pos(coord);
+ coord->unit_pos = coord_last_unit_pos(coord);
+ coord->between = AT_UNIT;
+ return 0;
+
+ case EMPTY_NODE:
+ return 1;
+
+ case INVALID_COORD:
+ break;
+ }
+
+ impossible("jmacd-9920", "unreachable");
+ return 0;
+}
+
+static const char *coord_tween_tostring(between_enum n)
+{
+ switch (n) {
+ case BEFORE_UNIT:
+ return "before unit";
+ case BEFORE_ITEM:
+ return "before item";
+ case AT_UNIT:
+ return "at unit";
+ case AFTER_UNIT:
+ return "after unit";
+ case AFTER_ITEM:
+ return "after item";
+ case EMPTY_NODE:
+ return "empty node";
+ case INVALID_COORD:
+ return "invalid";
+ default:
+ {
+ static char buf[30];
+
+ sprintf(buf, "unknown: %i", n);
+ return buf;
+ }
+ }
+}
+
+void print_coord(const char *mes, const coord_t *coord, int node)
+{
+ if (coord == NULL) {
+ printk("%s: null\n", mes);
+ return;
+ }
+ printk("%s: item_pos = %d, unit_pos %d, tween=%s, iplug=%d\n",
+ mes, coord->item_pos, coord->unit_pos,
+ coord_tween_tostring(coord->between), coord->iplugid);
+}
+
+int
+item_utmost_child_real_block(const coord_t *coord, sideof side,
+ reiser4_block_nr * blk)
+{
+ return item_plugin_by_coord(coord)->f.utmost_child_real_block(coord,
+ side,
+ blk);
+}
+
+int item_utmost_child(const coord_t *coord, sideof side, jnode ** child)
+{
+ return item_plugin_by_coord(coord)->f.utmost_child(coord, side, child);
+}
+
+/* @count bytes of flow @f got written, update correspondingly f->length,
+ f->data and f->key */
+void move_flow_forward(flow_t *f, unsigned count)
+{
+ if (f->data)
+ f->data += count;
+ f->length -= count;
+ set_key_offset(&f->key, get_key_offset(&f->key) + count);
+}
+
+/*
+ Local variables:
+ c-indentation-style: "K&R"
+ mode-name: "LC"
+ c-basic-offset: 8
+ tab-width: 8
+ fill-column: 80
+ scroll-step: 1
+ End:
+*/
diff -urN --no-dereference linux-5.10.2.orig/fs/reiser4/coord.h linux-5.10.2/fs/reiser4/coord.h
--- linux-5.10.2.orig/fs/reiser4/coord.h 1970-01-01 01:00:00.000000000 +0100
+++ linux-5.10.2/fs/reiser4/coord.h 2020-12-23 16:07:46.115813085 +0100
@@ -0,0 +1,399 @@
+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
+ reiser4/README */
+
+/* Coords */
+
+#if !defined(__REISER4_COORD_H__)
+#define __REISER4_COORD_H__
+
+#include "forward.h"
+#include "debug.h"
+#include "dformat.h"
+#include "key.h"
+
+/* insertions happen between coords in the tree, so we need some means
+ of specifying the sense of betweenness. */
+typedef enum {
+	BEFORE_UNIT, /* Note: init_coord() depends on this value being zero. */
+ AT_UNIT,
+ AFTER_UNIT,
+ BEFORE_ITEM,
+ AFTER_ITEM,
+ INVALID_COORD,
+ EMPTY_NODE,
+} between_enum;
+
+/* location of coord w.r.t. its node */
+typedef enum {
+ COORD_ON_THE_LEFT = -1,
+ COORD_ON_THE_RIGHT = +1,
+ COORD_INSIDE = 0
+} coord_wrt_node;
+
+typedef enum {
+ COORD_CMP_SAME = 0, COORD_CMP_ON_LEFT = -1, COORD_CMP_ON_RIGHT = +1
+} coord_cmp;
+
+struct coord {
+ /* node in a tree */
+ /* 0 */ znode *node;
+
+ /* position of item within node */
+ /* 4 */ pos_in_node_t item_pos;
+ /* position of unit within item */
+ /* 6 */ pos_in_node_t unit_pos;
+	/* optimization: the plugin of the item is stored in coord_t. Until
+	   this was implemented, item_plugin_by_coord() was a major CPU
+	   consumer. ->iplugid is invalidated (set to 0xff) on each
+	   modification of ->item_pos, and all such modifications are funneled
+	   through the coord_*_item_pos() functions below.
+	 */
+ /* 8 */ char iplugid;
+ /* position of coord w.r.t. to neighboring items and/or units.
+ Values are taken from &between_enum above.
+ */
+ /* 9 */ char between;
+ /* padding. It will be added by the compiler anyway to conform to the
+ * C language alignment requirements. We keep it here to be on the
+ * safe side and to have a clear picture of the memory layout of this
+ * structure. */
+ /* 10 */ __u16 pad;
+ /* 12 */ int offset;
+#if REISER4_DEBUG
+ unsigned long plug_v;
+ unsigned long body_v;
+#endif
+};
+
+#define INVALID_PLUGID ((char)((1 << 8) - 1))
+#define INVALID_OFFSET -1
+
+static inline void coord_clear_iplug(coord_t *coord)
+{
+ assert("nikita-2835", coord != NULL);
+ coord->iplugid = INVALID_PLUGID;
+ coord->offset = INVALID_OFFSET;
+}
+
+static inline int coord_is_iplug_set(const coord_t *coord)
+{
+ assert("nikita-2836", coord != NULL);
+ return coord->iplugid != INVALID_PLUGID;
+}
+
+static inline void coord_set_item_pos(coord_t *coord, pos_in_node_t pos)
+{
+ assert("nikita-2478", coord != NULL);
+ coord->item_pos = pos;
+ coord_clear_iplug(coord);
+}
+
+static inline void coord_dec_item_pos(coord_t *coord)
+{
+ assert("nikita-2480", coord != NULL);
+ --coord->item_pos;
+ coord_clear_iplug(coord);
+}
+
+static inline void coord_inc_item_pos(coord_t *coord)
+{
+ assert("nikita-2481", coord != NULL);
+ ++coord->item_pos;
+ coord_clear_iplug(coord);
+}
+
+static inline void coord_add_item_pos(coord_t *coord, int delta)
+{
+ assert("nikita-2482", coord != NULL);
+ coord->item_pos += delta;
+ coord_clear_iplug(coord);
+}
+
+static inline void coord_invalid_item_pos(coord_t *coord)
+{
+ assert("nikita-2832", coord != NULL);
+ coord->item_pos = (unsigned short)~0;
+ coord_clear_iplug(coord);
+}
+
+/* Reverse a direction. */
+static inline sideof sideof_reverse(sideof side)
+{
+ return side == LEFT_SIDE ? RIGHT_SIDE : LEFT_SIDE;
+}
+
+/* NOTE: There is a somewhat odd mixture of the following opposed terms:
+
+ "first" and "last"
+ "next" and "prev"
+ "before" and "after"
+ "leftmost" and "rightmost"
+
+ But I think the chosen names are decent the way they are.
+*/
+
+/* COORD INITIALIZERS */
+
+/* Initialize an invalid coordinate. */
+extern void coord_init_invalid(coord_t *coord, const znode * node);
+
+extern void coord_init_first_unit_nocheck(coord_t *coord, const znode * node);
+
+/* Initialize a coordinate to point at the first unit of the first item. If the
+ node is empty, it is positioned at the EMPTY_NODE. */
+extern void coord_init_first_unit(coord_t *coord, const znode * node);
+
+/* Initialize a coordinate to point at the last unit of the last item. If the
+ node is empty, it is positioned at the EMPTY_NODE. */
+extern void coord_init_last_unit(coord_t *coord, const znode * node);
+
+/* Initialize a coordinate to before the first item. If the node is empty, it is
+ positioned at the EMPTY_NODE. */
+extern void coord_init_before_first_item(coord_t *coord, const znode * node);
+
+/* Initialize a coordinate to after the last item. If the node is empty, it is
+ positioned at the EMPTY_NODE. */
+extern void coord_init_after_last_item(coord_t *coord, const znode * node);
+
+/* Initialize a coordinate to after last unit in the item. Coord must be set
+ already to existing item */
+void coord_init_after_item_end(coord_t *coord);
+
+/* Initialize a coordinate to before the item. Coord must be set already to
+ existing item */
+void coord_init_before_item(coord_t *);
+/* Initialize a coordinate to after the item. Coord must be set already to
+ existing item */
+void coord_init_after_item(coord_t *);
+
+/* Calls either coord_init_first_unit or coord_init_last_unit depending on
+ sideof argument. */
+extern void coord_init_sideof_unit(coord_t *coord, const znode * node,
+ sideof dir);
+
+/* Initialize a coordinate with zeros. Used in places where init_coord was
+   used and it was not clear how the coordinate should actually be initialized.
+   FIXME-VS: added by vs (2002, june, 8) */
+extern void coord_init_zero(coord_t *coord);
+
+/* COORD METHODS */
+
+/* after shifting of node content, coord previously set properly may become
+ invalid, try to "normalize" it. */
+void coord_normalize(coord_t *coord);
+
+/* Copy a coordinate. */
+extern void coord_dup(coord_t *coord, const coord_t *old_coord);
+
+/* Copy a coordinate without check. */
+void coord_dup_nocheck(coord_t *coord, const coord_t *old_coord);
+
+unsigned coord_num_units(const coord_t *coord);
+
+/* Return the last valid unit number at the present item (i.e.,
+ coord_num_units() - 1). */
+static inline unsigned coord_last_unit_pos(const coord_t *coord)
+{
+ return coord_num_units(coord) - 1;
+}
+
+#if REISER4_DEBUG
+/* For assertions only, checks for a valid coordinate. */
+extern int coord_check(const coord_t *coord);
+
+extern unsigned long znode_times_locked(const znode * z);
+
+static inline void coord_update_v(coord_t *coord)
+{
+ coord->plug_v = coord->body_v = znode_times_locked(coord->node);
+}
+#endif
+
+extern int coords_equal(const coord_t *c1, const coord_t *c2);
+
+extern void print_coord(const char *mes, const coord_t *coord, int print_node);
+
+/* If coord_is_before_leftmost() return COORD_ON_THE_LEFT, if
+   coord_is_after_rightmost() return COORD_ON_THE_RIGHT, otherwise return
+   COORD_INSIDE. */
+extern coord_wrt_node coord_wrt(const coord_t *coord);
+
+/* Returns true if the coordinates are positioned at adjacent units, regardless
+ of before-after or item boundaries. */
+extern int coord_are_neighbors(coord_t *c1, coord_t *c2);
+
+/* Assuming two coordinates are positioned in the same node, return
+   COORD_CMP_ON_RIGHT, COORD_CMP_ON_LEFT, or COORD_CMP_SAME depending on c1's
+   position relative to c2. */
+extern coord_cmp coord_compare(coord_t *c1, coord_t *c2);
+
+/* COORD PREDICATES */
+
+/* Returns true if the coord was initialized by coord_init_invalid(). */
+extern int coord_is_invalid(const coord_t *coord);
+
+/* Returns true if the coordinate is positioned at an existing item, not before
+ or after an item. It may be placed at, before, or after any unit within the
+ item, whether existing or not. If this is true you can call methods of the
+ item plugin. */
+extern int coord_is_existing_item(const coord_t *coord);
+
+/* Returns true if the coordinate is positioned after an item, before an item,
+   after the last unit of an item, before the first unit of an item, or at an
+   empty node. */
+extern int coord_is_between_items(const coord_t *coord);
+
+/* Returns true if the coordinate is positioned at an existing unit, not before
+ or after a unit. */
+extern int coord_is_existing_unit(const coord_t *coord);
+
+/* Returns true if the coordinate is positioned at an empty node. */
+extern int coord_is_empty(const coord_t *coord);
+
+/* Returns true if the coordinate is positioned at the first unit of the first
+ item. Not true for empty nodes nor coordinates positioned before the first
+ item. */
+extern int coord_is_leftmost_unit(const coord_t *coord);
+
+/* Returns true if the coordinate is positioned after the last item or after the
+ last unit of the last item or it is an empty node. */
+extern int coord_is_after_rightmost(const coord_t *coord);
+
+/* Returns true if the coordinate is positioned before the first item or it is
+ an empty node. */
+extern int coord_is_before_leftmost(const coord_t *coord);
+
+/* Calls either coord_is_before_leftmost or coord_is_after_rightmost depending
+ on sideof argument. */
+extern int coord_is_after_sideof_unit(coord_t *coord, sideof dir);
+
+/* COORD MODIFIERS */
+
+/* Advances the coordinate by one unit to the right. If empty, no change. If
+ coord_is_rightmost_unit, advances to AFTER THE LAST ITEM. Returns 0 if new
+ position is an existing unit. */
+extern int coord_next_unit(coord_t *coord);
+
+/* Advances the coordinate by one item to the right. If empty, no change. If
+ coord_is_rightmost_unit, advances to AFTER THE LAST ITEM. Returns 0 if new
+ position is an existing item. */
+extern int coord_next_item(coord_t *coord);
+
+/* Advances the coordinate by one unit to the left. If empty, no change. If
+ coord_is_leftmost_unit, advances to BEFORE THE FIRST ITEM. Returns 0 if new
+ position is an existing unit. */
+extern int coord_prev_unit(coord_t *coord);
+
+/* Advances the coordinate by one item to the left. If empty, no change. If
+ coord_is_leftmost_unit, advances to BEFORE THE FIRST ITEM. Returns 0 if new
+ position is an existing item. */
+extern int coord_prev_item(coord_t *coord);
+
+/* If the coordinate is between items, shifts it to the right. Returns 0 on
+ success and non-zero if there is no position to the right. */
+extern int coord_set_to_right(coord_t *coord);
+
+/* If the coordinate is between items, shifts it to the left. Returns 0 on
+ success and non-zero if there is no position to the left. */
+extern int coord_set_to_left(coord_t *coord);
+
+/* If the coordinate is at an existing unit, set to after that unit. Returns 0
+ on success and non-zero if the unit did not exist. */
+extern int coord_set_after_unit(coord_t *coord);
+
+/* Calls either coord_next_unit or coord_prev_unit depending on sideof
+ argument. */
+extern int coord_sideof_unit(coord_t *coord, sideof dir);
+
+/* iterate over all units in @node */
+#define for_all_units(coord, node) \
+ for (coord_init_before_first_item((coord), (node)) ; \
+ coord_next_unit(coord) == 0 ;)
+
+/* iterate over all items in @node */
+#define for_all_items(coord, node) \
+ for (coord_init_before_first_item((coord), (node)) ; \
+ coord_next_item(coord) == 0 ;)
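+
+/*
+ * Illustrative sketch (not compiled): walking every unit of a node with the
+ * iterator above. @node is assumed to be a loaded znode; process_unit() is
+ * hypothetical:
+ *
+ *     coord_t scan;
+ *
+ *     for_all_units(&scan, node) {
+ *             process_unit(&scan);    // positioned AT_UNIT on each unit
+ *     }
+ */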
+
+/* COORD/ITEM METHODS */
+
+extern int item_utmost_child_real_block(const coord_t *coord, sideof side,
+ reiser4_block_nr * blk);
+extern int item_utmost_child(const coord_t *coord, sideof side,
+ jnode ** child);
+
+/* a flow is a sequence of bytes being written to or read from the tree. The
+ tree will slice the flow into items while storing it into nodes, but all of
+ that is hidden from anything outside the tree. */
+
+struct flow {
+ reiser4_key key; /* key of start of flow's sequence of bytes */
+ loff_t length; /* length of flow's sequence of bytes */
+ char *data; /* start of flow's sequence of bytes */
+ int user; /* if 1 data is user space, 0 - kernel space */
+ rw_op op; /* NIKITA-FIXME-HANS: comment is where? */
+};
+
+void move_flow_forward(flow_t *f, unsigned count);
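+
+/*
+ * Illustrative sketch (not compiled): after a write-like operation has stored
+ * @written bytes of a flow into the tree, move_flow_forward() advances
+ * f->data (when set), shrinks f->length and bumps the offset in f->key, so
+ * that the flow now describes only the remaining bytes:
+ *
+ *     move_flow_forward(f, written);
+ */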
+
+/* &reiser4_item_data - description of data to be inserted or pasted
+
+ Q: articulate the reasons for the difference between this and flow.
+
+   A: Besides a flow we insert other things into the tree: stat data,
+   directory entries, etc. To insert them into the tree one has to provide
+   this structure. If one is going to insert a flow, one can use insert_flow(),
+   where this structure does not have to be created.
+*/
+struct reiser4_item_data {
+ /* actual data to be inserted. If NULL, ->create_item() will not
+ do xmemcpy itself, leaving this up to the caller. This can
+ save some amount of unnecessary memory copying, for example,
+ during insertion of stat data.
+
+ */
+ char *data;
+ /* 1 if 'char * data' contains pointer to user space and 0 if it is
+ kernel space */
+ int user;
+ /* amount of data we are going to insert or paste */
+ int length;
+ /* "Arg" is opaque data that is passed down to the
+ ->create_item() method of node layout, which in turn
+ hands it to the ->create_hook() of item being created. This
+ arg is currently used by:
+
+ . ->create_hook() of internal item
+ (fs/reiser4/plugin/item/internal.c:internal_create_hook()),
+ . ->paste() method of directory item.
+ . ->create_hook() of extent item
+
+	   For an internal item, this is the left "brother" of the new node
+	   being inserted, and it is used to add the new node into the sibling
+	   list after a pointer to it has just been inserted into the parent.
+
+	   While ->arg does look like a somewhat unnecessary complication, it
+	   actually saves a lot of headache in many places, because all data
+	   necessary to insert or paste new data into the tree are collected
+	   in one place, which eliminates a lot of extra argument passing and
+	   storing everywhere.
+
+ */
+ void *arg;
+ /* plugin of item we are inserting */
+ item_plugin *iplug;
+};
+
+/* __REISER4_COORD_H__ */
+#endif
+
+/* Make Linus happy.
+ Local variables:
+ c-indentation-style: "K&R"
+ mode-name: "LC"
+ c-basic-offset: 8
+ tab-width: 8
+ fill-column: 120
+ scroll-step: 1
+ End:
+*/
diff -urN --no-dereference linux-5.10.2.orig/fs/reiser4/debug.c linux-5.10.2/fs/reiser4/debug.c
--- linux-5.10.2.orig/fs/reiser4/debug.c 1970-01-01 01:00:00.000000000 +0100
+++ linux-5.10.2/fs/reiser4/debug.c 2020-12-23 16:07:46.115813085 +0100
@@ -0,0 +1,310 @@
+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
+ * reiser4/README */
+
+/* Debugging facilities. */
+
+/*
+ * This file contains generic debugging functions used by reiser4. Roughly,
+ * it covers the following:
+ *
+ * panicking: reiser4_do_panic(), reiser4_print_prefix().
+ *
+ * locking:
+ * reiser4_schedulable(), reiser4_lock_counters(), print_lock_counters(),
+ * reiser4_no_counters_are_held(), reiser4_commit_check_locks()
+ *
+ * error code monitoring (see comment before RETERR macro):
+ * reiser4_return_err(), reiser4_report_err().
+ *
+ * stack back-tracing: fill_backtrace()
+ *
+ * miscellaneous: reiser4_preempt_point(), call_on_each_assert(),
+ * reiser4_debugtrap().
+ *
+ */
+
+#include "reiser4.h"
+#include "context.h"
+#include "super.h"
+#include "txnmgr.h"
+#include "znode.h"
+
+#include <linux/sysfs.h>
+#include <linux/slab.h>
+#include <linux/types.h>
+#include <linux/fs.h>
+#include <linux/spinlock.h>
+#include <linux/kallsyms.h>
+#include <linux/vmalloc.h>
+#include <linux/ctype.h>
+#include <linux/sysctl.h>
+#include <linux/hardirq.h>
+#include <linux/sched/signal.h> /* signal_pending() */
+
+#if 0
+#if REISER4_DEBUG
+static void reiser4_report_err(void);
+#else
+#define reiser4_report_err() noop
+#endif
+#endif /* 0 */
+
+/*
+ * global buffer where message given to reiser4_panic is formatted.
+ */
+static char panic_buf[REISER4_PANIC_MSG_BUFFER_SIZE];
+
+/*
+ * lock protecting consistency of panic_buf under concurrent panics
+ */
+static DEFINE_SPINLOCK(panic_guard);
+
+/* Your best friend. Call it on each occasion. This is called by
+ fs/reiser4/debug.h:reiser4_panic(). */
+void reiser4_do_panic(const char *format/* format string */ , ... /* rest */)
+{
+ static int in_panic = 0;
+ va_list args;
+
+ /*
+ * check for recursive panic.
+ */
+ if (in_panic == 0) {
+ in_panic = 1;
+
+ spin_lock(&panic_guard);
+ va_start(args, format);
+ vsnprintf(panic_buf, sizeof(panic_buf), format, args);
+ va_end(args);
+ printk(KERN_EMERG "reiser4 panicked cowardly: %s", panic_buf);
+ spin_unlock(&panic_guard);
+
+ /*
+ * if kernel debugger is configured---drop in. Early dropping
+ * into kgdb is not always convenient, because panic message
+ * is not yet printed most of the times. But:
+ *
+ * (1) message can be extracted from printk_buf[]
+ * (declared static inside of printk()), and
+ *
+ * (2) sometimes serial/kgdb combo dies while printing
+ * long panic message, so it's more prudent to break into
+ * debugger earlier.
+ *
+ */
+ DEBUGON(1);
+ }
+ /* to make gcc happy about noreturn attribute */
+ panic("%s", panic_buf);
+}
+
+#if 0
+void
+reiser4_print_prefix(const char *level, int reperr, const char *mid,
+ const char *function, const char *file, int lineno)
+{
+ const char *comm;
+ int pid;
+
+ if (unlikely(in_interrupt() || in_irq())) {
+ comm = "interrupt";
+ pid = 0;
+ } else {
+ comm = current->comm;
+ pid = current->pid;
+ }
+ printk("%sreiser4[%.16s(%i)]: %s (%s:%i)[%s]:\n",
+ level, comm, pid, function, file, lineno, mid);
+ if (reperr)
+ reiser4_report_err();
+}
+#endif /* 0 */
+
+/* Preemption point: this should be called periodically during long running
+ operations (carry, allocate, and squeeze are best examples) */
+int reiser4_preempt_point(void)
+{
+ assert("nikita-3008", reiser4_schedulable());
+ cond_resched();
+ return signal_pending(current);
+}
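+
+/*
+ * Illustrative sketch (not compiled): long-running operations are expected to
+ * call reiser4_preempt_point() periodically and to bail out when it reports a
+ * pending signal. The loop below is hypothetical:
+ *
+ *     while (more_work_to_do()) {
+ *             do_one_step();
+ *             if (reiser4_preempt_point())
+ *                     return RETERR(-EINTR);
+ *     }
+ */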
+
+#if REISER4_DEBUG
+/* Debugging aid: return struct where information about locks taken by current
+ thread is accumulated. This can be used to formulate lock ordering
+ constraints and various assertions.
+
+*/
+reiser4_lock_cnt_info *reiser4_lock_counters(void)
+{
+ reiser4_context *ctx = get_current_context();
+ assert("jmacd-1123", ctx != NULL);
+ return &ctx->locks;
+}
+
+/*
+ * print human readable information about locks held by the reiser4 context.
+ */
+static void print_lock_counters(const char *prefix,
+ const reiser4_lock_cnt_info * info)
+{
+ printk("%s: jnode: %i, tree: %i (r:%i,w:%i), dk: %i (r:%i,w:%i)\n"
+ "jload: %i, "
+ "txnh: %i, atom: %i, stack: %i, txnmgr: %i, "
+ "ktxnmgrd: %i, fq: %i\n"
+ "inode: %i, "
+ "cbk_cache: %i (r:%i,w%i), "
+ "eflush: %i, "
+ "zlock: %i,\n"
+ "spin: %i, long: %i inode_sem: (r:%i,w:%i)\n"
+ "d: %i, x: %i, t: %i\n", prefix,
+ info->spin_locked_jnode,
+ info->rw_locked_tree, info->read_locked_tree,
+ info->write_locked_tree,
+ info->rw_locked_dk, info->read_locked_dk, info->write_locked_dk,
+ info->spin_locked_jload,
+ info->spin_locked_txnh,
+ info->spin_locked_atom, info->spin_locked_stack,
+ info->spin_locked_txnmgr, info->spin_locked_ktxnmgrd,
+ info->spin_locked_fq,
+ info->spin_locked_inode,
+ info->rw_locked_cbk_cache,
+ info->read_locked_cbk_cache,
+ info->write_locked_cbk_cache,
+ info->spin_locked_super_eflush,
+ info->spin_locked_zlock,
+ info->spin_locked,
+ info->long_term_locked_znode,
+ info->inode_sem_r, info->inode_sem_w,
+ info->d_refs, info->x_refs, info->t_refs);
+}
+
+/* check that no spinlocks are held */
+int reiser4_schedulable(void)
+{
+ if (get_current_context_check() != NULL) {
+ if (!LOCK_CNT_NIL(spin_locked)) {
+ print_lock_counters("in atomic", reiser4_lock_counters());
+ return 0;
+ }
+ }
+ might_sleep();
+ return 1;
+}
+/*
+ * return true, iff no locks are held.
+ */
+int reiser4_no_counters_are_held(void)
+{
+ reiser4_lock_cnt_info *counters;
+
+ counters = reiser4_lock_counters();
+ return
+ (counters->spin_locked_zlock == 0) &&
+ (counters->spin_locked_jnode == 0) &&
+ (counters->rw_locked_tree == 0) &&
+ (counters->read_locked_tree == 0) &&
+ (counters->write_locked_tree == 0) &&
+ (counters->rw_locked_dk == 0) &&
+ (counters->read_locked_dk == 0) &&
+ (counters->write_locked_dk == 0) &&
+ (counters->spin_locked_txnh == 0) &&
+ (counters->spin_locked_atom == 0) &&
+ (counters->spin_locked_stack == 0) &&
+ (counters->spin_locked_txnmgr == 0) &&
+ (counters->spin_locked_inode == 0) &&
+ (counters->spin_locked == 0) &&
+ (counters->long_term_locked_znode == 0) &&
+ (counters->inode_sem_r == 0) &&
+ (counters->inode_sem_w == 0) && (counters->d_refs == 0);
+}
+
+/*
+ * return true, iff transaction commit can be done under locks held by the
+ * current thread.
+ */
+int reiser4_commit_check_locks(void)
+{
+ reiser4_lock_cnt_info *counters;
+ int inode_sem_r;
+ int inode_sem_w;
+ int result;
+
+ /*
+ * inode's read/write semaphore is the only reiser4 lock that can be
+ * held during commit.
+ */
+
+ counters = reiser4_lock_counters();
+ inode_sem_r = counters->inode_sem_r;
+ inode_sem_w = counters->inode_sem_w;
+
+ counters->inode_sem_r = counters->inode_sem_w = 0;
+ result = reiser4_no_counters_are_held();
+ counters->inode_sem_r = inode_sem_r;
+ counters->inode_sem_w = inode_sem_w;
+ return result;
+}
+
+/*
+ * fill "error site" in the current reiser4 context. See comment before RETERR
+ * macro for more details.
+ */
+void reiser4_return_err(int code, const char *file, int line)
+{
+ if (code < 0 && is_in_reiser4_context()) {
+ reiser4_context *ctx = get_current_context();
+
+ if (ctx != NULL) {
+ ctx->err.code = code;
+ ctx->err.file = file;
+ ctx->err.line = line;
+ }
+ }
+}
+
+#if 0
+/*
+ * report error information recorded by reiser4_return_err().
+ */
+static void reiser4_report_err(void)
+{
+ reiser4_context *ctx = get_current_context_check();
+
+ if (ctx != NULL) {
+ if (ctx->err.code != 0) {
+ printk("code: %i at %s:%i\n",
+ ctx->err.code, ctx->err.file, ctx->err.line);
+ }
+ }
+}
+#endif /* 0 */
+
+#endif /* REISER4_DEBUG */
+
+#if KERNEL_DEBUGGER
+
+/*
+ * This function just drops into the kernel debugger. It is a convenient
+ * place to put a breakpoint in.
+ */
+void reiser4_debugtrap(void)
+{
+ /* do nothing. Put break point here. */
+#if defined(CONFIG_KGDB) && !defined(CONFIG_REISER4_FS_MODULE)
+ extern void kgdb_breakpoint(void);
+ //kgdb_breakpoint();
+ ;
+#endif
+}
+#endif
+
+/* Make Linus happy.
+ Local variables:
+ c-indentation-style: "K&R"
+ mode-name: "LC"
+ c-basic-offset: 8
+ tab-width: 8
+ fill-column: 120
+ End:
+*/
diff -urN --no-dereference linux-5.10.2.orig/fs/reiser4/debug.h linux-5.10.2/fs/reiser4/debug.h
--- linux-5.10.2.orig/fs/reiser4/debug.h 1970-01-01 01:00:00.000000000 +0100
+++ linux-5.10.2/fs/reiser4/debug.h 2020-12-23 16:07:46.115813085 +0100
@@ -0,0 +1,344 @@
+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
+ reiser4/README */
+
+/* Declarations of debug macros. */
+
+#if !defined(__FS_REISER4_DEBUG_H__)
+#define __FS_REISER4_DEBUG_H__
+
+#include "forward.h"
+#include "reiser4.h"
+
+/**
+ * generic function to produce formatted output, decorating it with
+ * whatever standard prefixes/postfixes we want. "fun" is the function
+ * that will actually be called; it can be printk, panic etc.
+ * This is for use by other debugging macros, not by users.
+ */
+#define DCALL(lev, fun, reperr, label, format, ...) \
+({ \
+ fun(lev "reiser4[%.16s(%i)]: %s (%s:%i)[%s]:\n" format "\n" , \
+ current->comm, current->pid, __FUNCTION__, \
+ __FILE__, __LINE__, label, ## __VA_ARGS__); \
+})
+
+/*
+ * cause kernel to crash
+ */
+#define reiser4_panic(mid, format, ...) \
+ DCALL("", reiser4_do_panic, 1, mid, format , ## __VA_ARGS__)
+
+/* print message with indication of current process, file, line and
+ function */
+#define reiser4_log(label, format, ...) \
+ DCALL(KERN_DEBUG, printk, 0, label, format , ## __VA_ARGS__)
+
+#define noop do {; } while (0)
+
+#if REISER4_DEBUG
+/* version of info that only actually prints anything when _d_ebugging
+ is on */
+#define dinfo(format, ...) printk(format , ## __VA_ARGS__)
+/* macro to catch logical errors. Put it into `default' clause of
+ switch() statement. */
+#define impossible(label, format, ...) \
+ reiser4_panic(label, "impossible: " format , ## __VA_ARGS__)
+/* assert assures that @cond is true. If it is not, reiser4_panic() is
+ called. Use this for checking logical consistency and _never_ call
+   this to check correctness of external data: disk blocks and user input. */
+#define assert(label, cond) \
+({ \
+ /* call_on_each_assert(); */ \
+ if (cond) { \
+ /* put negated check to avoid using !(cond) that would lose \
+ * warnings for things like assert(a = b); */ \
+ ; \
+ } else { \
+ DEBUGON(1); \
+ reiser4_panic(label, "assertion failed: %s", #cond); \
+ } \
+})
+
+/* like assertion, but @expr is evaluated even if REISER4_DEBUG is off. */
+#define check_me(label, expr) assert(label, (expr))
+
+#define ON_DEBUG(exp) exp
+
+extern int reiser4_schedulable(void);
+extern void call_on_each_assert(void);
+
+#else
+
+#define dinfo(format, args...) noop
+#define impossible(label, format, args...) noop
+#define assert(label, cond) noop
+#define check_me(label, expr) ((void) (expr))
+#define ON_DEBUG(exp)
+#define reiser4_schedulable() might_sleep()
+
+/* REISER4_DEBUG */
+#endif
+
+#if REISER4_DEBUG
+/* per-thread information about locks acquired by this thread. Used by lock
+ * ordering checks in spin_macros.h */
+typedef struct reiser4_lock_cnt_info {
+ int rw_locked_tree;
+ int read_locked_tree;
+ int write_locked_tree;
+
+ int rw_locked_dk;
+ int read_locked_dk;
+ int write_locked_dk;
+
+ int rw_locked_cbk_cache;
+ int read_locked_cbk_cache;
+ int write_locked_cbk_cache;
+
+ int spin_locked_zlock;
+ int spin_locked_jnode;
+ int spin_locked_jload;
+ int spin_locked_txnh;
+ int spin_locked_atom;
+ int spin_locked_stack;
+ int spin_locked_txnmgr;
+ int spin_locked_ktxnmgrd;
+ int spin_locked_fq;
+ int spin_locked_inode;
+ int spin_locked_super_eflush;
+ int spin_locked;
+ int long_term_locked_znode;
+
+ int inode_sem_r;
+ int inode_sem_w;
+
+ int d_refs;
+ int x_refs;
+ int t_refs;
+} reiser4_lock_cnt_info;
+
+extern struct reiser4_lock_cnt_info *reiser4_lock_counters(void);
+#define IN_CONTEXT(a, b) (is_in_reiser4_context() ? (a) : (b))
+
+/* increment lock-counter @counter, if present */
+#define LOCK_CNT_INC(counter) \
+ IN_CONTEXT(++(reiser4_lock_counters()->counter), 0)
+
+/* decrement lock-counter @counter, if present */
+#define LOCK_CNT_DEC(counter) \
+ IN_CONTEXT(--(reiser4_lock_counters()->counter), 0)
+
+/* check that lock-counter is zero. This is for use in assertions */
+#define LOCK_CNT_NIL(counter) \
+ IN_CONTEXT(reiser4_lock_counters()->counter == 0, 1)
+
+/* check that lock-counter is greater than zero. This is for use in
+ * assertions */
+#define LOCK_CNT_GTZ(counter) \
+ IN_CONTEXT(reiser4_lock_counters()->counter > 0, 1)
+#define LOCK_CNT_LT(counter,n) \
+ IN_CONTEXT(reiser4_lock_counters()->counter < n, 1)
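+
+/* Typical usage sketch (the lock and its owner below are illustrative; only
+ * the counter names are taken from reiser4_lock_cnt_info):
+ *
+ *	spin_lock(&node->guard);
+ *	LOCK_CNT_INC(spin_locked_jnode);
+ *	...
+ *	LOCK_CNT_DEC(spin_locked_jnode);
+ *	spin_unlock(&node->guard);
+ */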
+
+#else /* REISER4_DEBUG */
+
+/* no-op versions of the above */
+
+typedef struct reiser4_lock_cnt_info {
+} reiser4_lock_cnt_info;
+
+#define reiser4_lock_counters() ((reiser4_lock_cnt_info *)NULL)
+#define LOCK_CNT_INC(counter) noop
+#define LOCK_CNT_DEC(counter) noop
+#define LOCK_CNT_NIL(counter) (1)
+#define LOCK_CNT_GTZ(counter) (1)
+#define LOCK_CNT_LT(counter, n) (1)
+
+#endif /* REISER4_DEBUG */
+
+#define assert_spin_not_locked(lock) BUG_ON(0)
+#define assert_rw_write_locked(lock) BUG_ON(0)
+#define assert_rw_read_locked(lock) BUG_ON(0)
+#define assert_rw_locked(lock) BUG_ON(0)
+#define assert_rw_not_write_locked(lock) BUG_ON(0)
+#define assert_rw_not_read_locked(lock) BUG_ON(0)
+#define assert_rw_not_locked(lock) BUG_ON(0)
+
+/* flags controlling debugging behavior. They are set through the debug_flags=N
+ mount option. */
+typedef enum {
+ /* print a lot of information during panic. When this is on all jnodes
+ * are listed. This can be *very* large output. Usually you don't want
+ * this. Especially over serial line. */
+ REISER4_VERBOSE_PANIC = 0x00000001,
+ /* print a lot of information during umount */
+ REISER4_VERBOSE_UMOUNT = 0x00000002,
+ /* print gathered statistics on umount */
+ REISER4_STATS_ON_UMOUNT = 0x00000004,
+ /* check node consistency */
+ REISER4_CHECK_NODE = 0x00000008
+} reiser4_debug_flags;
+
+extern int is_in_reiser4_context(void);
+
+/*
+ * evaluate expression @e only when within reiser4 context
+ */
+#define ON_CONTEXT(e) do { \
+ if (is_in_reiser4_context()) { \
+ e; \
+ } } while (0)
+
+/*
+ * evaluate expression @e only when within reiser4_context and debugging is
+ * on.
+ */
+#define ON_DEBUG_CONTEXT(e) ON_DEBUG(ON_CONTEXT(e))
+
+/*
+ * complain about unexpected function result and crash. Used in "default"
+ * branches of switch statements and the like to assert that invalid results are
+ * not silently ignored.
+ */
+#define wrong_return_value(label, function) \
+ impossible(label, "wrong return value from " function)
+
+/* Issue different types of reiser4 messages to the console */
+#define warning(label, format, ...) \
+ DCALL(KERN_WARNING, \
+ printk, 1, label, "WARNING: " format , ## __VA_ARGS__)
+#define notice(label, format, ...) \
+ DCALL(KERN_NOTICE, \
+ printk, 1, label, "NOTICE: " format , ## __VA_ARGS__)
+
+/* mark not yet implemented functionality */
+#define not_yet(label, format, ...) \
+ reiser4_panic(label, "NOT YET IMPLEMENTED: " format , ## __VA_ARGS__)
+
+extern void reiser4_do_panic(const char *format, ...)
+ __attribute__ ((noreturn, format(printf, 1, 2)));
+
+extern int reiser4_preempt_point(void);
+extern void reiser4_print_stats(void);
+
+#if REISER4_DEBUG
+extern int reiser4_no_counters_are_held(void);
+extern int reiser4_commit_check_locks(void);
+#else
+#define reiser4_no_counters_are_held() (1)
+#define reiser4_commit_check_locks() (1)
+#endif
+
+/* true if @i is power-of-two. Useful for rate-limited warnings, etc. */
+#define IS_POW(i) \
+({ \
+ typeof(i) __i; \
+ \
+ __i = (i); \
+ !(__i & (__i - 1)); \
+})
+
+#define KERNEL_DEBUGGER (1)
+
+#if KERNEL_DEBUGGER
+
+extern void reiser4_debugtrap(void);
+
+/*
+ * Check condition @cond and drop into kernel debugger (kgdb) if it's true. If
+ * kgdb is not compiled in, do nothing.
+ */
+#define DEBUGON(cond) \
+({ \
+ if (unlikely(cond)) \
+ reiser4_debugtrap(); \
+})
+#else
+#define DEBUGON(cond) noop
+#endif
+
+/*
+ * Error code tracing facility. (Idea is borrowed from XFS code.)
+ *
+ * Suppose some strange and/or unexpected code is returned from some function
+ * (for example, write(2) returns -EEXIST). It is possible to place a
+ * breakpoint in the reiser4_write(), but it is too late here. How to find out
+ * in what particular place -EEXIST was generated first?
+ *
+ * In reiser4 all places where actual error codes are produced (that is,
+ * statements of the form
+ *
+ * return -EFOO; // (1), or
+ *
+ * result = -EFOO; // (2)
+ *
+ * are replaced with
+ *
+ * return RETERR(-EFOO); // (1a), and
+ *
+ * result = RETERR(-EFOO); // (2a) respectively
+ *
+ * The RETERR() macro records the error site (code, file and line) in
+ * reiser4_context. This information is printed in error and warning
+ * messages. Moreover, it's possible to put a conditional breakpoint in
+ * reiser4_return_err() (the low-level function called by RETERR() to do the
+ * actual work) to break into the debugger immediately when a particular
+ * error happens.
+ *
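+ * A minimal usage sketch (the callee name below is hypothetical):
+ *
+ *	result = some_reiser4_operation();
+ *	if (result != 0)
+ *		return RETERR(-EIO);
+ *
+ * With REISER4_DEBUG enabled, the file and line of this RETERR() call are
+ * recorded in the current context and reported in subsequent warnings.
+ *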
+ */
+
+#if REISER4_DEBUG
+
+/*
+ * data-type to store information about where an error happened ("error site").
+ */
+typedef struct err_site {
+ int code; /* error code */
+ const char *file; /* source file, filled by __FILE__ */
+ int line; /* source file line, filled by __LINE__ */
+} err_site;
+
+extern void reiser4_return_err(int code, const char *file, int line);
+
+/*
+ * fill &get_current_context()->err with error information.
+ */
+#define RETERR(code) \
+({ \
+ typeof(code) __code; \
+ \
+ __code = (code); \
+ reiser4_return_err(__code, __FILE__, __LINE__); \
+ __code; \
+})
+
+#else
+
+/*
+ * no-op versions of the above
+ */
+
+typedef struct err_site {
+} err_site;
+#define RETERR(code) code
+#endif
+
+#if REISER4_LARGE_KEY
+/*
+ * conditionally compile arguments only if REISER4_LARGE_KEY is on.
+ */
+#define ON_LARGE_KEY(...) __VA_ARGS__
+#else
+#define ON_LARGE_KEY(...)
+#endif
+
+/* __FS_REISER4_DEBUG_H__ */
+#endif
+
+/* Make Linus happy.
+ Local variables:
+ c-indentation-style: "K&R"
+ mode-name: "LC"
+ c-basic-offset: 8
+ tab-width: 8
+ fill-column: 120
+ End:
+*/
diff -urN --no-dereference linux-5.10.2.orig/fs/reiser4/dformat.h linux-5.10.2/fs/reiser4/dformat.h
--- linux-5.10.2.orig/fs/reiser4/dformat.h 1970-01-01 01:00:00.000000000 +0100
+++ linux-5.10.2/fs/reiser4/dformat.h 2020-12-23 16:07:46.115813085 +0100
@@ -0,0 +1,124 @@
+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
+ reiser4/README */
+
+/* Formats of on-disk data and conversion functions. */
+
+/* Put all item formats in the files describing the particular items.
+   Our model is that everything you need to do to add an item to reiser4
+   (except the changes to the plugin that uses the item, which go into
+   the file defining that plugin) goes into one file. */
+/* Data on disk are stored in little-endian format.
+ To declare fields of on-disk structures, use d8, d16, d32 and d64.
+   Use d??tocpu() and cputod??() to convert. */
+
+#if !defined(__FS_REISER4_DFORMAT_H__)
+#define __FS_REISER4_DFORMAT_H__
+
+#include "debug.h"
+
+#include <asm/byteorder.h>
+#include <asm/unaligned.h>
+#include <linux/types.h>
+
+typedef __u8 d8;
+typedef __le16 d16;
+typedef __le32 d32;
+typedef __le64 d64;
+
+#define PACKED __attribute__((packed))
+
+/* data-type for block number */
+typedef __u64 reiser4_block_nr;
+
+static_assert(sizeof(reiser4_block_nr) == 8);
+
+/* data-type for block number on disk, disk format */
+typedef __le64 reiser4_dblock_nr;
+
+/**
+ * disk_addr_eq - compare disk addresses
+ * @b1: pointer to block number to compare
+ * @b2: pointer to block number to compare
+ *
+ * Returns true if the disk addresses are the same
+ */
+static inline int disk_addr_eq(const reiser4_block_nr * b1,
+ const reiser4_block_nr * b2)
+{
+ assert("nikita-1033", b1 != NULL);
+ assert("nikita-1266", b2 != NULL);
+
+ return !memcmp(b1, b2, sizeof *b1);
+}
+
+/*
+ * Structure of master super block.
+ * Once written by the mkfs utility, the master super block never
+ * changes during its lifetime and doesn't participate in
+ * transactions.
+ */
+typedef struct reiser4_master_sb {
+ char magic[16]; /* "ReIsEr4" */
+ d16 dformat_pid; /* disk format plugin id (per subvolume) */
+ d16 blocksize; /* block size (per-volume) */
+ char uuid[16]; /* volume id (per volume) */
+ char label[16]; /* filesystem label (per volume) */
+ /* Reiser5 */
+	char sub_uuid[16];	/* subvolume's external id (per subvolume) */
+ d16 volume_pid; /* volume plugin id (per volume) */
+ d16 distrib_pid; /* distribution plugin id (per volume) */
+ d16 mirror_id; /* serial (ordered) number of the mirror
+ (0 for original subvolumes) */
+ d16 num_replicas; /* number of replicas of an original subvolume.
+ Original is a mirror with id=0, other mirrors
+ (if any) are called replicas */
+ char stripe_bits; /* logarithm of stripe size (per volume) */
+} reiser4_master_sb;
+
+static inline u16 master_get_block_size(reiser4_master_sb *master)
+{
+ return le16_to_cpu(get_unaligned(&master->blocksize));
+}
+
+static inline u16 master_get_dformat_pid(reiser4_master_sb *master)
+{
+ return le16_to_cpu(get_unaligned(&master->dformat_pid));
+}
+
+static inline u16 master_get_volume_pid(reiser4_master_sb *master)
+{
+ return le16_to_cpu(get_unaligned(&master->volume_pid));
+}
+
+static inline u16 master_get_distrib_pid(reiser4_master_sb *master)
+{
+ return le16_to_cpu(get_unaligned(&master->distrib_pid));
+}
+
+static inline u16 master_get_mirror_id(reiser4_master_sb *master)
+{
+ return le16_to_cpu(get_unaligned(&master->mirror_id));
+}
+
+static inline u16 master_get_num_replicas(reiser4_master_sb *master)
+{
+ return le16_to_cpu(get_unaligned(&master->num_replicas));
+}
+
+static inline char master_get_stripe_bits(reiser4_master_sb *master)
+{
+ return master->stripe_bits;
+}
+
+/* __FS_REISER4_DFORMAT_H__ */
+#endif
+
+/*
+ * Local variables:
+ * c-indentation-style: "K&R"
+ * mode-name: "LC"
+ * c-basic-offset: 8
+ * tab-width: 8
+ * fill-column: 79
+ * End:
+ */
diff -urN --no-dereference linux-5.10.2.orig/fs/reiser4/discard.c linux-5.10.2/fs/reiser4/discard.c
--- linux-5.10.2.orig/fs/reiser4/discard.c 1970-01-01 01:00:00.000000000 +0100
+++ linux-5.10.2/fs/reiser4/discard.c 2020-12-23 16:07:46.115813085 +0100
@@ -0,0 +1,182 @@
+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
+ * reiser4/README */
+
+/* TRIM/discard interoperation subsystem for reiser4. */
+
+/*
+ * This subsystem is responsible for populating an atom's ->discard_set and
+ * (later) converting it into a series of discard calls to the kernel.
+ *
+ * Discard is an in-kernel interface for notifying the storage
+ * hardware about blocks that are being logically freed by the filesystem.
+ * This is done via calling the blkdev_issue_discard() function. There are
+ * restrictions on block ranges: they should constitute at least one erase unit
+ * in length and be correspondingly aligned. Otherwise a discard request will
+ * be ignored.
+ *
+ * The erase unit size is kept in struct queue_limits as discard_granularity.
+ * The offset from the partition start to the first erase unit is kept in
+ * struct queue_limits as discard_alignment.
+ *
+ * At atom level, we record numbers of all blocks that happen to be deallocated
+ * during the transaction. Then we read the generated set, filter out any blocks
+ * that have since been allocated again and issue discards for everything still
+ * valid. This is what discard.[ch] is here for.
+ *
+ * However, simply iterating through the recorded extents is not enough:
+ * - if a single extent is smaller than the erase unit, then this particular
+ * extent won't be discarded even if it is surrounded by enough free blocks
+ * to constitute a whole erase unit;
+ * - we won't be able to merge small adjacent extents forming an extent long
+ * enough to be discarded.
+ *
+ * MECHANISM:
+ *
+ * During the transaction deallocated extents are recorded in atom's delete
+ * set. In reiser4, there are two methods to deallocate a block:
+ * 1. deferred deallocation, enabled by BA_DEFER flag to reiser4_dealloc_block().
+ * In this mode, blocks are stored to delete set instead of being marked free
+ * immediately. After committing the transaction, the delete set is "applied"
+ * by the block allocator and all these blocks are marked free in memory
+ * (see reiser4_post_write_back_hook()).
+ * Space management plugins also read the delete set to update on-disk
+ * allocation records (see reiser4_pre_commit_hook()).
+ * 2. immediate deallocation (the opposite).
+ * In this mode, blocks are marked free immediately. This is used by the
+ * journal subsystem to manage space used by the journal records, so these
+ * allocations are not visible to the space management plugins and never hit
+ * the disk.
+ *
+ * When discard is enabled, all immediate deallocations become deferred. This
+ * is OK because journal's allocations happen after reiser4_pre_commit_hook()
+ * where the on-disk space allocation records are updated. So, in this mode
+ * the atom's delete set becomes "the discard set" -- list of blocks that have
+ * to be considered for discarding.
+ *
+ * Discarding is performed before completing deferred deallocations, hence all
+ * extents in the discard set are still marked as allocated and cannot contain
+ * any data. Thus we can avoid any checks for blocks directly present in the
+ * discard set.
+ *
+ * For now, we don't perform "padding" of extents to erase unit boundaries.
+ * This means if extents are not aligned with the device's erase unit lattice,
+ * the partial erase units at head and tail of extents are truncated by kernel
+ * (in blkdev_issue_discard()).
+ *
+ * So, at commit time the following actions take place:
+ * - delete sets are merged to form the discard set;
+ * - elements of the discard set are sorted;
+ * - the discard set is iterated, joining any adjacent extents;
+ * - for each extent, a single call to blkdev_issue_discard() is done.
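+ *
+ * As a worked example (numbers are illustrative only): with a 4096-byte block
+ * size each block spans sb->s_blocksize >> 9 == 8 sectors, so an extent of
+ * 16 blocks starting at block 1000 is handed to blkdev_issue_discard() as
+ * 128 sectors starting at sector 8000.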
+ */
+
+#include "discard.h"
+#include "context.h"
+#include "debug.h"
+#include "txnmgr.h"
+#include "super.h"
+
+#include <linux/slab.h>
+#include <linux/fs.h>
+#include <linux/blkdev.h>
+
+static int __discard_extent(struct block_device *bdev, sector_t start,
+ sector_t len)
+{
+ assert("intelfx-21", bdev != NULL);
+
+ return blkdev_issue_discard(bdev, start, len, reiser4_ctx_gfp_mask_get(),
+ 0);
+}
+
+static int discard_extent(txn_atom *atom UNUSED_ARG,
+ const reiser4_block_nr* start,
+ const reiser4_block_nr* len,
+ __u32 subvol_id,
+ void *data UNUSED_ARG)
+{
+ struct super_block *sb = reiser4_get_current_sb();
+ reiser4_subvol *subv = super_origin(sb, subvol_id);
+ struct block_device *bdev = subv->bdev;
+ sector_t extent_start_sec, extent_len_sec;
+ const int sec_per_blk = sb->s_blocksize >> 9;
+
+ if (!subvol_is_set(subv, SUBVOL_IS_NONROT_DEVICE))
+ return 0;
+
+ /* we assume block = N * sector */
+ assert("intelfx-7", sec_per_blk > 0);
+
+ /* convert extent to sectors */
+ extent_start_sec = *start * sec_per_blk;
+ extent_len_sec = *len * sec_per_blk;
+
+ /* discard the extent, don't pad it to erase unit boundaries for now */
+ return __discard_extent(bdev, extent_start_sec, extent_len_sec);
+}
+
+int discard_atom(txn_atom *atom, struct list_head *processed_set)
+{
+ int ret;
+ struct list_head discard_set;
+
+ if (!reiser4_is_set(reiser4_get_current_sb(), REISER4_DISCARD)) {
+ spin_unlock_atom(atom);
+ return 0;
+ }
+
+ assert("intelfx-28", atom != NULL);
+ assert("intelfx-59", processed_set != NULL);
+
+ if (list_empty(&atom->discard.delete_set)) {
+ /* Nothing left to discard. */
+ spin_unlock_atom(atom);
+ return 0;
+ }
+
+ /* Take the delete sets from the atom in order to release atom spinlock. */
+ blocknr_list_init(&discard_set);
+ blocknr_list_merge(&atom->discard.delete_set, &discard_set);
+ spin_unlock_atom(atom);
+
+ /* Sort the discard list, joining adjacent and overlapping extents. */
+ blocknr_list_sort_and_join(&discard_set);
+
+ /* Perform actual dirty work. */
+ ret = blocknr_list_iterator(NULL, &discard_set, &discard_extent, NULL, 0);
+
+ /* Add processed extents to the temporary list. */
+ blocknr_list_merge(&discard_set, processed_set);
+
+ if (ret != 0) {
+ return ret;
+ }
+
+ /* Let's do this again for any new extents in the atom's discard set. */
+ return -E_REPEAT;
+}
+
+void discard_atom_post(txn_atom *atom, struct list_head *processed_set)
+{
+ assert("intelfx-60", atom != NULL);
+ assert("intelfx-61", processed_set != NULL);
+
+ if (!reiser4_is_set(reiser4_get_current_sb(), REISER4_DISCARD)) {
+ spin_unlock_atom(atom);
+ return;
+ }
+
+ blocknr_list_merge(processed_set, &atom->discard.delete_set);
+ spin_unlock_atom(atom);
+}
+
+/* Make Linus happy.
+ Local variables:
+ c-indentation-style: "K&R"
+ mode-name: "LC"
+ c-basic-offset: 8
+ tab-width: 8
+ fill-column: 120
+ scroll-step: 1
+ End:
+*/
diff -urN --no-dereference linux-5.10.2.orig/fs/reiser4/discard.h linux-5.10.2/fs/reiser4/discard.h
--- linux-5.10.2.orig/fs/reiser4/discard.h 1970-01-01 01:00:00.000000000 +0100
+++ linux-5.10.2/fs/reiser4/discard.h 2020-12-23 16:07:46.115813085 +0100
@@ -0,0 +1,42 @@
+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
+ * reiser4/README */
+
+/* TRIM/discard interoperation subsystem for reiser4. */
+
+#if !defined(__FS_REISER4_DISCARD_H__)
+#define __FS_REISER4_DISCARD_H__
+
+#include "forward.h"
+#include "dformat.h"
+
+/**
+ * Issue discard requests for all block extents recorded in @atom's delete sets,
+ * if discard is enabled. The extents processed are removed from the @atom's
+ * delete sets and stored in @processed_set.
+ *
+ * @atom must be locked on entry and is unlocked on exit.
+ * @processed_set must be initialized with blocknr_list_init().
+ */
+extern int discard_atom(txn_atom *atom, struct list_head *processed_set);
+
+/**
+ * Splices @processed_set back to @atom's delete set.
+ * Must be called after discard_atom() loop, using the same @processed_set.
+ *
+ * @atom must be locked on entry and is unlocked on exit.
+ * @processed_set must be the same as passed to discard_atom().
+ */
+extern void discard_atom_post(txn_atom *atom, struct list_head *processed_set);
+
+/* __FS_REISER4_DISCARD_H__ */
+#endif
+
+/* Make Linus happy.
+ Local variables:
+ c-indentation-style: "K&R"
+ mode-name: "LC"
+ c-basic-offset: 8
+ tab-width: 8
+ fill-column: 120
+ End:
+*/
diff -urN --no-dereference linux-5.10.2.orig/fs/reiser4/dscale.c linux-5.10.2/fs/reiser4/dscale.c
--- linux-5.10.2.orig/fs/reiser4/dscale.c 1970-01-01 01:00:00.000000000 +0100
+++ linux-5.10.2/fs/reiser4/dscale.c 2020-12-23 16:07:46.115813085 +0100
@@ -0,0 +1,192 @@
+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
+ * reiser4/README */
+
+/* Scalable on-disk integers */
+
+/*
+ * Various on-disk structures contain integer-like structures. Stat-data
+ * contain [yes, "data" is plural, check the dictionary] file size, link
+ * count; extent unit contains extent width etc. To accommodate the general
+ * case, enough space is reserved to keep the largest possible value: 64 bits
+ * in all cases above. But in the overwhelming majority of cases the numbers
+ * actually stored in these fields are comparatively small, and reserving
+ * 8 bytes is a waste of precious disk bandwidth.
+ *
+ * Scalable integers are one way to solve this problem. dscale_write()
+ * function stores __u64 value in the given area consuming from 1 to 9 bytes,
+ * depending on the magnitude of the value supplied. dscale_read() reads value
+ * previously stored by dscale_write().
+ *
+ * dscale_write() produces a format not entirely unlike UTF-8: the two highest
+ * bits of the first byte are used to store a "tag". One of 4 possible tag
+ * values is chosen depending on the number being encoded:
+ *
+ * 0 ... 0x3f => 0 [table 1]
+ * 0x40 ... 0x3fff => 1
+ * 0x4000 ... 0x3fffffff => 2
+ * 0x40000000 ... 0xffffffffffffffff => 3
+ *
+ * (see dscale_range() function)
+ *
+ * Values in the range 0x40000000 ... 0xffffffffffffffff require 8 full bytes
+ * to be stored, so in this case there is no place in the first byte to store
+ * tag. For such values tag is stored in an extra 9th byte.
+ *
+ * As the _highest_ bits are used for the test (which is natural), scaled
+ * integers are stored in BIG-ENDIAN format, in contrast with the rest of
+ * reiser4, which uses LITTLE-ENDIAN.
+ *
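+ * A worked example (illustrative): storing the value 0x1234. dscale_range()
+ * returns tag 1, so two bytes are used. dscale_write() copies the two
+ * low-order bytes of the big-endian representation (0x12 0x34) and ORs the
+ * tag into the first byte: 0x12 | (1 << 6) == 0x52, giving the on-disk bytes
+ * 0x52 0x34. dscale_read() finds tag 1 in 0x52, loads the __be16 value
+ * 0x5234, and cleartag() clears the two tag bits at offset 14, restoring
+ * 0x1234.
+ *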
+ */
+
+#include "debug.h"
+#include "dscale.h"
+
+/* return tag of scaled integer stored at @address */
+static int gettag(const unsigned char *address)
+{
+ /* tag is stored in two highest bits */
+ return (*address) >> 6;
+}
+
+/* Clear the tag embedded into @value. */
+static void cleartag(__u64 *value, int tag)
+{
+ /*
+ * W-w-what ?!
+ *
+ * Actually, this is rather simple: @value passed here was read by
+ * dscale_read(), converted from BIG-ENDIAN, and padded to __u64 by
+ * zeroes. Tag is still stored in the highest (arithmetically)
+ * non-zero bits of @value, but relative position of tag within __u64
+ * depends on @tag.
+ *
+	 * For example, if @tag is 0, it's stored in the 2 highest bits of the
+	 * lowest byte, and its offset (counting from the lowest bit) is
+	 * 8 - 2 == 6 bits.
+	 *
+	 * If tag is 1, it's stored in the two highest bits of the 2nd lowest
+	 * byte, and its offset is (2 * 8) - 2 == 14 bits.
+ *
+ * See table 1 above for details.
+ *
+ * All these cases are captured by the formula:
+ */
+ *value &= ~(3 << (((1 << tag) << 3) - 2));
+ /*
+ * That is, clear two (3 == 0t11) bits at the offset
+ *
+ * 8 * (2 ^ tag) - 2,
+ *
+ * that is, two highest bits of (2 ^ tag)-th byte of @value.
+ */
+}
+
+/* return tag for @value. See table 1 above for details. */
+static int dscale_range(__u64 value)
+{
+ if (value > 0x3fffffff)
+ return 3;
+ if (value > 0x3fff)
+ return 2;
+ if (value > 0x3f)
+ return 1;
+ return 0;
+}
+
+/* restore value stored at @address by dscale_write() and return number of
+ * bytes consumed */
+int dscale_read(unsigned char *address, __u64 *value)
+{
+ int tag;
+
+ /* read tag */
+ tag = gettag(address);
+ switch (tag) {
+ case 3:
+ /* In this case tag is stored in an extra byte, skip this byte
+ * and decode value stored in the next 8 bytes.*/
+ *value = __be64_to_cpu(get_unaligned((__be64 *)(address + 1)));
+ /* worst case: 8 bytes for value itself plus one byte for
+ * tag. */
+ return 9;
+ case 0:
+ *value = get_unaligned(address);
+ break;
+ case 1:
+ *value = __be16_to_cpu(get_unaligned((__be16 *)address));
+ break;
+ case 2:
+ *value = __be32_to_cpu(get_unaligned((__be32 *)address));
+ break;
+ default:
+ return RETERR(-EIO);
+ }
+ /* clear tag embedded into @value */
+ cleartag(value, tag);
+ /* number of bytes consumed is (2 ^ tag)---see table 1. */
+ return 1 << tag;
+}
+
+/* number of bytes consumed */
+int dscale_bytes_to_read(unsigned char *address)
+{
+ int tag;
+
+ tag = gettag(address);
+ switch (tag) {
+ case 0:
+ case 1:
+ case 2:
+ return 1 << tag;
+ case 3:
+ return 9;
+ default:
+ return RETERR(-EIO);
+ }
+}
+
+/* store @value at @address and return number of bytes consumed */
+int dscale_write(unsigned char *address, __u64 value)
+{
+ int tag;
+ int shift;
+ __be64 v;
+ unsigned char *valarr;
+
+ tag = dscale_range(value);
+ v = __cpu_to_be64(value);
+ valarr = (unsigned char *)&v;
+ shift = (tag == 3) ? 1 : 0;
+ memcpy(address + shift, valarr + sizeof v - (1 << tag), 1 << tag);
+ *address |= (tag << 6);
+ return shift + (1 << tag);
+}
+
+/* number of bytes required to store @value */
+int dscale_bytes_to_write(__u64 value)
+{
+ int bytes;
+
+ bytes = 1 << dscale_range(value);
+ if (bytes == 8)
+ ++bytes;
+ return bytes;
+}
+
+/* returns true if @value and @other require the same number of bytes to be
+ * stored. Used to detect when a data structure (like stat-data) has to be
+ * expanded or contracted. */
+int dscale_fit(__u64 value, __u64 other)
+{
+ return dscale_range(value) == dscale_range(other);
+}
+
+/* Make Linus happy.
+ Local variables:
+ c-indentation-style: "K&R"
+ mode-name: "LC"
+ c-basic-offset: 8
+ tab-width: 8
+ fill-column: 120
+ scroll-step: 1
+ End:
+*/
diff -urN --no-dereference linux-5.10.2.orig/fs/reiser4/dscale.h linux-5.10.2/fs/reiser4/dscale.h
--- linux-5.10.2.orig/fs/reiser4/dscale.h 1970-01-01 01:00:00.000000000 +0100
+++ linux-5.10.2/fs/reiser4/dscale.h 2020-12-23 16:07:46.115813085 +0100
@@ -0,0 +1,28 @@
+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
+ * reiser4/README */
+
+/* Scalable on-disk integers. See dscale.c for details. */
+
+#if !defined(__FS_REISER4_DSCALE_H__)
+#define __FS_REISER4_DSCALE_H__
+
+#include "dformat.h"
+
+extern int dscale_read(unsigned char *address, __u64 *value);
+extern int dscale_write(unsigned char *address, __u64 value);
+extern int dscale_bytes_to_read(unsigned char *address);
+extern int dscale_bytes_to_write(__u64 value);
+extern int dscale_fit(__u64 value, __u64 other);
+
+/* __FS_REISER4_DSCALE_H__ */
+#endif
+
+/* Make Linus happy.
+ Local variables:
+ c-indentation-style: "K&R"
+ mode-name: "LC"
+ c-basic-offset: 8
+ tab-width: 8
+ fill-column: 120
+ End:
+*/
diff -urN --no-dereference linux-5.10.2.orig/fs/reiser4/entd.c linux-5.10.2/fs/reiser4/entd.c
--- linux-5.10.2.orig/fs/reiser4/entd.c 1970-01-01 01:00:00.000000000 +0100
+++ linux-5.10.2/fs/reiser4/entd.c 2020-12-23 16:07:46.115813085 +0100
@@ -0,0 +1,361 @@
+/* Copyright 2003, 2004 by Hans Reiser, licensing governed by
+ * reiser4/README */
+
+/* Ent daemon. */
+
+#include "debug.h"
+#include "txnmgr.h"
+#include "tree.h"
+#include "entd.h"
+#include "super.h"
+#include "context.h"
+#include "reiser4.h"
+#include "vfs_ops.h"
+#include "page_cache.h"
+#include "inode.h"
+
+#include <linux/sched.h> /* struct task_struct */
+#include <linux/suspend.h>
+#include <linux/kernel.h>
+#include <linux/writeback.h>
+#include <linux/time.h> /* INITIAL_JIFFIES */
+#include <linux/backing-dev.h> /* bdi_write_congested */
+#include <linux/wait.h>
+#include <linux/kthread.h>
+#include <linux/freezer.h>
+
+#define DEF_PRIORITY 12
+#define MAX_ENTD_ITERS 10
+
+static void entd_flush(struct super_block *, struct wbq *);
+static int entd(void *arg);
+
+/*
+ * set ->comm field of the ent thread to make its state visible to user space
+ */
+#define entd_set_comm(state) \
+ snprintf(current->comm, sizeof(current->comm), \
+ "ent:%s%s", super->s_id, (state))
+
+/**
+ * reiser4_init_entd - initialize entd context and start kernel daemon
+ * @super: super block to start ent thread for
+ *
+ * Creates the entd context, starts the kernel thread and waits until it
+ * initializes.
+ */
+int reiser4_init_entd(struct super_block *super)
+{
+ entd_context *ctx;
+
+ assert("nikita-3104", super != NULL);
+
+ ctx = get_entd_context(super);
+
+ memset(ctx, 0, sizeof *ctx);
+ spin_lock_init(&ctx->guard);
+ init_waitqueue_head(&ctx->wait);
+#if REISER4_DEBUG
+ INIT_LIST_HEAD(&ctx->flushers_list);
+#endif
+ /* lists of writepage requests */
+ INIT_LIST_HEAD(&ctx->todo_list);
+ INIT_LIST_HEAD(&ctx->done_list);
+ /* start entd */
+ ctx->tsk = kthread_run(entd, super, "ent:%s", super->s_id);
+ if (IS_ERR(ctx->tsk))
+ return PTR_ERR(ctx->tsk);
+ return 0;
+}
+
+static void put_wbq(struct wbq *rq)
+{
+ iput(rq->mapping->host);
+ complete(&rq->completion);
+}
+
+/* ent should be locked */
+static struct wbq *__get_wbq(entd_context * ent)
+{
+ struct wbq *wbq;
+
+ if (list_empty(&ent->todo_list))
+ return NULL;
+
+ ent->nr_todo_reqs--;
+ wbq = list_entry(ent->todo_list.next, struct wbq, link);
+ list_del_init(&wbq->link);
+ return wbq;
+}
+
+/* ent thread function */
+static int entd(void *arg)
+{
+ struct super_block *super;
+ entd_context *ent;
+ int done = 0;
+
+ super = arg;
+ /* do_fork() just copies task_struct into the new
+ thread. ->fs_context shouldn't be copied of course. This shouldn't
+ be a problem for the rest of the code though.
+ */
+ current->journal_info = NULL;
+
+ ent = get_entd_context(super);
+
+ while (!done) {
+ try_to_freeze();
+
+ spin_lock(&ent->guard);
+ while (ent->nr_todo_reqs != 0) {
+ struct wbq *rq;
+
+ assert("", list_empty(&ent->done_list));
+
+ /* take request from the queue head */
+ rq = __get_wbq(ent);
+ assert("", rq != NULL);
+ ent->cur_request = rq;
+ spin_unlock(&ent->guard);
+
+ entd_set_comm("!");
+ entd_flush(super, rq);
+
+ put_wbq(rq);
+
+ /*
+ * wakeup all requestors and iput their inodes
+ */
+ spin_lock(&ent->guard);
+ while (!list_empty(&ent->done_list)) {
+ rq = list_entry(ent->done_list.next, struct wbq, link);
+ list_del_init(&rq->link);
+ ent->nr_done_reqs--;
+ spin_unlock(&ent->guard);
+ assert("", rq->written == 1);
+ put_wbq(rq);
+ spin_lock(&ent->guard);
+ }
+ }
+ spin_unlock(&ent->guard);
+
+ entd_set_comm(".");
+
+ {
+ DEFINE_WAIT(__wait);
+
+ do {
+ prepare_to_wait(&ent->wait, &__wait, TASK_INTERRUPTIBLE);
+ if (kthread_should_stop()) {
+ done = 1;
+ break;
+ }
+ if (ent->nr_todo_reqs != 0)
+ break;
+ schedule();
+ } while (0);
+ finish_wait(&ent->wait, &__wait);
+ }
+ }
+ BUG_ON(ent->nr_todo_reqs != 0);
+ return 0;
+}
+
+/**
+ * reiser4_done_entd - stop entd kernel thread
+ * @super: super block to stop ent thread for
+ *
+ * It is called on umount. Sends stop signal to entd and waits until it handles
+ * it.
+ */
+void reiser4_done_entd(struct super_block *super)
+{
+ entd_context *ent;
+
+ assert("nikita-3103", super != NULL);
+
+ ent = get_entd_context(super);
+ assert("zam-1055", ent->tsk != NULL);
+ kthread_stop(ent->tsk);
+}
+
+/* called at the beginning of jnode_flush to register flusher thread with ent
+ * daemon */
+void reiser4_enter_flush(struct super_block *super)
+{
+ entd_context *ent;
+
+ assert("zam-1029", super != NULL);
+ ent = get_entd_context(super);
+
+ assert("zam-1030", ent != NULL);
+
+ spin_lock(&ent->guard);
+ ent->flushers++;
+#if REISER4_DEBUG
+ list_add(&get_current_context()->flushers_link, &ent->flushers_list);
+#endif
+ spin_unlock(&ent->guard);
+}
+
+/* called at the end of jnode_flush */
+void reiser4_leave_flush(struct super_block *super)
+{
+ entd_context *ent;
+ int wake_up_ent;
+
+ assert("zam-1027", super != NULL);
+ ent = get_entd_context(super);
+
+ assert("zam-1028", ent != NULL);
+
+ spin_lock(&ent->guard);
+ ent->flushers--;
+ wake_up_ent = (ent->flushers == 0 && ent->nr_todo_reqs != 0);
+#if REISER4_DEBUG
+ list_del_init(&get_current_context()->flushers_link);
+#endif
+ spin_unlock(&ent->guard);
+ if (wake_up_ent)
+ wake_up_process(ent->tsk);
+}
+
+#define ENTD_CAPTURE_APAGE_BURST SWAP_CLUSTER_MAX
+
+static void entd_flush(struct super_block *super, struct wbq *rq)
+{
+ reiser4_context ctx;
+
+ init_stack_context(&ctx, super);
+ ctx.entd = 1;
+ ctx.gfp_mask = GFP_NOFS;
+
+ rq->wbc->range_start = page_offset(rq->page);
+ rq->wbc->range_end = rq->wbc->range_start +
+ (ENTD_CAPTURE_APAGE_BURST << PAGE_SHIFT);
+
+
+ rq->mapping->a_ops->writepages(rq->mapping, rq->wbc);
+
+ if (rq->wbc->nr_to_write > 0) {
+ long result;
+ struct bdi_writeback *wb;
+ struct wb_writeback_work work = {
+ .sb = super,
+ .sync_mode = WB_SYNC_NONE,
+ .nr_pages = LONG_MAX,
+ .range_cyclic = 0,
+ .reason = WB_REASON_VMSCAN,
+ };
+ rq->wbc->sync_mode = work.sync_mode,
+ rq->wbc->range_cyclic = work.range_cyclic,
+ rq->wbc->range_start = 0;
+ rq->wbc->range_end = LLONG_MAX;
+ /*
+ * we don't need to pin superblock for writeback:
+ * this is implicitly pinned by write_page_by_ent
+ * (via igrab), so that shutdown_super() will wait
+ * (on reiser4_put_super) for entd completion.
+ */
+ wb = &inode_to_bdi(rq->mapping->host)->wb;
+
+ spin_lock(&wb->list_lock);
+ result = generic_writeback_sb_inodes(super,
+ wb,
+ rq->wbc,
+ &work,
+ true);
+ spin_unlock(&wb->list_lock);
+ }
+ rq->wbc->nr_to_write = ENTD_CAPTURE_APAGE_BURST;
+
+ reiser4_writeout(super, rq->wbc);
+ context_set_commit_async(&ctx);
+ reiser4_exit_context(&ctx);
+}
+
+/**
+ * write_page_by_ent - ask entd thread to flush this page as part of slum
+ * @page: page to be written
+ * @wbc: writeback control passed to reiser4_writepage
+ *
+ * Creates a request, puts it on the entd list of requests, wakes up entd if
+ * necessary, and waits until entd completes the request.
+ */
+int write_page_by_ent(struct page *page, struct writeback_control *wbc)
+{
+ struct super_block *sb;
+ struct inode *inode;
+ entd_context *ent;
+ struct wbq rq;
+
+ assert("", PageLocked(page));
+ assert("", page->mapping != NULL);
+
+ sb = page->mapping->host->i_sb;
+ ent = get_entd_context(sb);
+ assert("", ent && ent->done == 0);
+
+ /*
+ * we are going to unlock page and ask ent thread to write the
+ * page. Re-dirty page before unlocking so that if ent thread fails to
+ * write it - it will remain dirty
+ */
+ set_page_dirty_notag(page);
+ account_page_redirty(page);
+
+ /*
+ * pin inode in memory, unlock page, entd_flush will iput. We can not
+	 * iput here because we can not allow delete_inode to be called here
+ */
+ inode = igrab(page->mapping->host);
+ unlock_page(page);
+ if (inode == NULL)
+ /* inode is getting freed */
+ return 0;
+
+ /* init wbq */
+ INIT_LIST_HEAD(&rq.link);
+ rq.magic = WBQ_MAGIC;
+ rq.wbc = wbc;
+ rq.page = page;
+ rq.mapping = inode->i_mapping;
+ rq.node = NULL;
+ rq.written = 0;
+ init_completion(&rq.completion);
+
+ /* add request to entd's list of writepage requests */
+ spin_lock(&ent->guard);
+ ent->nr_todo_reqs++;
+ list_add_tail(&rq.link, &ent->todo_list);
+ if (ent->nr_todo_reqs == 1)
+ wake_up_process(ent->tsk);
+
+ spin_unlock(&ent->guard);
+
+ /* wait until entd finishes */
+ wait_for_completion(&rq.completion);
+
+ if (rq.written)
+ /* Eventually ENTD has written the page to disk. */
+ return 0;
+ return 0;
+}
+
+int wbq_available(void)
+{
+ struct super_block *sb = reiser4_get_current_sb();
+ entd_context *ent = get_entd_context(sb);
+ return ent->nr_todo_reqs;
+}
+
+/*
+ * Local variables:
+ * c-indentation-style: "K&R"
+ * mode-name: "LC"
+ * c-basic-offset: 8
+ * tab-width: 8
+ * fill-column: 79
+ * End:
+ */
diff -urN --no-dereference linux-5.10.2.orig/fs/reiser4/entd.h linux-5.10.2/fs/reiser4/entd.h
--- linux-5.10.2.orig/fs/reiser4/entd.h 1970-01-01 01:00:00.000000000 +0100
+++ linux-5.10.2/fs/reiser4/entd.h 2020-12-23 16:07:46.115813085 +0100
@@ -0,0 +1,90 @@
+/* Copyright 2003 by Hans Reiser, licensing governed by reiser4/README */
+
+/* Ent daemon. */
+
+#ifndef __ENTD_H__
+#define __ENTD_H__
+
+#include "context.h"
+
+#include <linux/fs.h>
+#include <linux/completion.h>
+#include <linux/wait.h>
+#include <linux/spinlock.h>
+#include <linux/sched.h> /* for struct task_struct */
+
+#define WBQ_MAGIC 0x7876dc76
+
+/* write-back request. */
+struct wbq {
+ int magic;
+ struct list_head link; /* list head of this list is in entd context */
+ struct writeback_control *wbc;
+ struct page *page;
+ struct address_space *mapping;
+ struct completion completion;
+ jnode *node; /* set if ent thread captured requested page */
+ int written; /* set if ent thread wrote requested page */
+};
+
+/* ent-thread context. This is used to synchronize starting/stopping ent
+ * threads. */
+typedef struct entd_context {
+ /* wait queue that ent thread waits on for more work. It's
+ * signaled by write_page_by_ent(). */
+ wait_queue_head_t wait;
+ /* spinlock protecting other fields */
+ spinlock_t guard;
+ /* ent thread */
+ struct task_struct *tsk;
+ /* set to indicate that ent thread should leave. */
+ int done;
+ /* counter of active flushers */
+ int flushers;
+ /*
+ * when reiser4_writepage asks entd to write a page - it adds struct
+ * wbq to this list
+ */
+ struct list_head todo_list;
+ /* number of elements on the above list */
+ int nr_todo_reqs;
+
+ struct wbq *cur_request;
+ /*
+ * when entd writes a page it moves write-back request from todo_list
+ * to done_list. This list is used at the end of entd iteration to
+ * wakeup requestors and iput inodes.
+ */
+ struct list_head done_list;
+ /* number of elements on the above list */
+ int nr_done_reqs;
+
+#if REISER4_DEBUG
+ /* list of all active flushers */
+ struct list_head flushers_list;
+#endif
+} entd_context;
+
+extern int reiser4_init_entd(struct super_block *);
+extern void reiser4_done_entd(struct super_block *);
+
+extern void reiser4_enter_flush(struct super_block *);
+extern void reiser4_leave_flush(struct super_block *);
+
+extern int write_page_by_ent(struct page *, struct writeback_control *);
+extern int wbq_available(void);
+extern void ent_writes_page(struct super_block *, struct page *);
+
+extern jnode *get_jnode_by_wbq(struct super_block *, struct wbq *);
+/* __ENTD_H__ */
+#endif
+
+/* Make Linus happy.
+ Local variables:
+ c-indentation-style: "K&R"
+ mode-name: "LC"
+ c-basic-offset: 8
+ tab-width: 8
+ fill-column: 120
+ End:
+*/
diff -urN --no-dereference linux-5.10.2.orig/fs/reiser4/eottl.c linux-5.10.2/fs/reiser4/eottl.c
--- linux-5.10.2.orig/fs/reiser4/eottl.c 1970-01-01 01:00:00.000000000 +0100
+++ linux-5.10.2/fs/reiser4/eottl.c 2020-12-23 16:07:46.116813099 +0100
@@ -0,0 +1,514 @@
+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
+ reiser4/README */
+
+#include "forward.h"
+#include "debug.h"
+#include "key.h"
+#include "coord.h"
+#include "plugin/item/item.h"
+#include "plugin/node/node.h"
+#include "znode.h"
+#include "block_alloc.h"
+#include "tree_walk.h"
+#include "tree_mod.h"
+#include "carry.h"
+#include "tree.h"
+#include "super.h"
+
+#include <linux/types.h> /* for __u?? */
+
+/*
+ * Extents on the twig level (EOTTL) handling.
+ *
+ * EOTTL poses some problems for tree traversal, which are best explained
+ * by example.
+ *
+ * Suppose we have block B1 on the twig level with the following items:
+ *
+ * 0. internal item I0 with key (0:0:0:0) (locality, key-type, object-id,
+ * offset)
+ * 1. extent item E1 with key (1:4:100:0), having 10 blocks of 4k each
+ * 2. internal item I2 with key (10:0:0:0)
+ *
+ * We are trying to insert item with key (5:0:0:0). Lookup finds node B1, and
+ * then intra-node lookup is done. This lookup finishes on E1, because the
+ * key we are looking for is larger than the key of E1 and is smaller than the
+ * key of I2.
+ *
+ * Here search is stuck.
+ *
+ * After some thought it is clear what is wrong here: extents on the twig level
+ * break a basic property of the *search* tree (on the pretext that they
+ * restore the property of a balanced tree).
+ *
+ * Said property is the following: if in the internal node of the search tree
+ * we have [ ... Key1 Pointer Key2 ... ], then all data that are or will be
+ * keyed in the tree with a Key such that Key1 <= Key < Key2 are accessible
+ * through the Pointer.
+ *
+ * This is not true when Pointer is an Extent-Pointer, simply because an extent
+ * cannot expand indefinitely to the right to include any item with
+ *
+ * Key1 <= Key <= Key2.
+ *
+ * For example, our E1 extent is only responsible for the data with keys
+ *
+ * (1:4:100:0) <= key <= (1:4:100:0xffffffffffffffff), and
+ *
+ * so, key range
+ *
+ * ( (1:4:100:0xffffffffffffffff), (10:0:0:0) )
+ *
+ * is orphaned: there is no way to get there from the tree root.
+ *
+ * In other words, extent pointers are different than normal child pointers as
+ * far as search tree is concerned, and this creates such problems.
+ *
+ * A possible solution for this problem is to insert our item into the node
+ * pointed to by I2. There are some problems though:
+ *
+ * (1) I2 can be in a different node.
+ * (2) E1 can be immediately followed by another extent E2.
+ *
+ * (1) is solved by calling reiser4_get_right_neighbor() and accounting
+ * for locks/coords as necessary.
+ *
+ * (2) is more complex. Solution here is to insert new empty leaf node and
+ * insert internal item between E1 and E2 pointing to said leaf node. This is
+ * further complicated by possibility that E2 is in a different node, etc.
+ *
+ * Problems:
+ *
+ * (1) if there is an internal item I2 immediately on the right of an extent E1
+ * and we decided to insert a new item S1 into the node N2 pointed to by I2,
+ * then the key of S1 will be less than the smallest key in N2. Normally, the
+ * search checks that the key we are looking for is in the range of keys
+ * covered by the node it is being looked up in. To work around this
+ * situation, while preserving a useful consistency check, a new flag
+ * CBK_TRUST_DK was added to the cbk flags bitmask. This flag is automatically
+ * set on entrance to coord_by_key() and is only cleared when we are about to
+ * enter the situation described above.
+ *
+ * (2) If extent E1 is immediately followed by another extent E2 and we are
+ * searching for a key that is between E1 and E2, we only have to insert a new
+ * empty leaf node when coord_by_key was called for insertion, rather than just
+ * for lookup. To distinguish these cases, a new flag CBK_FOR_INSERT was added
+ * to the cbk flags bitmask. This flag is automatically set by coord_by_key calls
+ * performed by insert_by_key() and friends.
+ *
+ * (3) Insertion of new empty leaf node (possibly) requires balancing. In any
+ * case it requires modification of node content which is only possible under
+ * write lock. It may well happen that we only have read lock on the node where
+ * new internal pointer is to be inserted (common case: lookup of non-existent
+ * stat-data that falls between two extents). If only a read lock is held, tree
+ * traversal is restarted with lock_level modified so that next time we hit
+ * this problem, write lock will be held. Once we have write lock, balancing
+ * will be performed.
+ */
+
+/**
+ * is_next_item_internal - check whether next item is internal
+ * @coord: coordinate of extent item in twig node
+ * @key: search key
+ * @lh: twig node lock handle
+ *
+ * Looks at the unit next to @coord. If it is an internal one - 1 is returned,
+ * @coord is set to that unit. If that unit is in right neighbor, @lh is moved
+ * to that node, @coord is set to its first unit. If next item is not internal
+ * or does not exist then 0 is returned, @coord and @lh are left unchanged. 2
+ * is returned if search restart has to be done.
+ */
+static int
+is_next_item_internal(coord_t *coord, const reiser4_key * key,
+ lock_handle * lh)
+{
+ coord_t next;
+ lock_handle rn;
+ int result;
+ reiser4_tree *tree = znode_get_tree(coord->node);
+
+ coord_dup(&next, coord);
+ if (coord_next_unit(&next) == 0) {
+ /* next unit is in this node */
+ if (item_is_internal(&next)) {
+ coord_dup(coord, &next);
+ return 1;
+ }
+ assert("vs-3", item_is_extent(&next));
+ return 0;
+ }
+
+ /*
+ * next unit either does not exist or is in right neighbor. If it is in
+ * right neighbor we have to check right delimiting key because
+ * concurrent thread could get their first and insert item with a key
+	 * concurrent thread could get there first and insert an item with a key
+ */
+ read_lock_dk(tree);
+ result = keycmp(key, znode_get_rd_key(coord->node));
+ read_unlock_dk(tree);
+ assert("vs-6", result != EQUAL_TO);
+ if (result == GREATER_THAN)
+ return 2;
+
+ /* lock right neighbor */
+ init_lh(&rn);
+ result = reiser4_get_right_neighbor(&rn, coord->node,
+ znode_is_wlocked(coord->node) ?
+ ZNODE_WRITE_LOCK : ZNODE_READ_LOCK,
+ GN_CAN_USE_UPPER_LEVELS);
+ if (result == -E_NO_NEIGHBOR) {
+ /* we are on the rightmost edge of the tree */
+ done_lh(&rn);
+ return 0;
+ }
+
+ if (result) {
+ assert("vs-4", result < 0);
+ done_lh(&rn);
+ return result;
+ }
+
+ /*
+ * check whether concurrent thread managed to insert item with a key
+ * smaller than @key
+ */
+ read_lock_dk(tree);
+ result = keycmp(key, znode_get_ld_key(rn.node));
+ read_unlock_dk(tree);
+ assert("vs-6", result != EQUAL_TO);
+ if (result == GREATER_THAN) {
+ done_lh(&rn);
+ return 2;
+ }
+
+ result = zload(rn.node);
+ if (result) {
+ assert("vs-5", result < 0);
+ done_lh(&rn);
+ return result;
+ }
+
+ coord_init_first_unit(&next, rn.node);
+ if (item_is_internal(&next)) {
+ /*
+		 * next unit is in right neighbor and it is a unit of internal
+ * item. Unlock coord->node. Move @lh to right neighbor. @coord
+ * is set to the first unit of right neighbor.
+ */
+ coord_dup(coord, &next);
+ zrelse(rn.node);
+ done_lh(lh);
+ move_lh(lh, &rn);
+ return 1;
+ }
+
+ /*
+	 * next unit is a unit of extent item. Return without changing @lh and
+ * @coord.
+ */
+ assert("vs-6", item_is_extent(&next));
+ zrelse(rn.node);
+ done_lh(&rn);
+ return 0;
+}
+
+/**
+ * rd_key - calculate key of an item next to the given one
+ * @coord: position in a node
+ * @key: storage for result key
+ *
+ * @coord is set between items or after the last item in a node. Calculate key
+ * of item to the right of @coord.
+ */
+static reiser4_key *rd_key(const coord_t *coord, reiser4_key *key)
+{
+ coord_t dup;
+ reiser4_tree *tree;
+
+ assert("nikita-2281", coord_is_between_items(coord));
+
+ tree = znode_get_tree(coord->node);
+ coord_dup(&dup, coord);
+
+ if (coord_set_to_right(&dup) == 0)
+ /* next item is in this node. Return its key. */
+ unit_key_by_coord(&dup, key);
+ else {
+ /*
+ * next item either does not exist or is in right
+ * neighbor. Return znode's right delimiting key.
+ */
+ read_lock_dk(tree);
+ *key = *znode_get_rd_key(coord->node);
+ read_unlock_dk(tree);
+ }
+ return key;
+}
+
+/**
+ * add_empty_leaf - insert empty leaf between two extents
+ * @insert_coord: position in twig node between two extents
+ * @lh: twig node lock handle
+ * @key: left delimiting key of new node
+ * @rdkey: right delimiting key of new node
+ *
+ * Inserts empty leaf node between two extent items. It is necessary when we
+ * have to insert an item on leaf level between two extents (items on the twig
+ * level).
+ */
+static int
+add_empty_leaf(coord_t *insert_coord, lock_handle *lh,
+ const reiser4_key *key, const reiser4_key *rdkey)
+{
+ int result;
+ carry_pool *pool;
+ carry_level *todo;
+ reiser4_item_data *item;
+ carry_insert_data *cdata;
+ carry_op *op;
+ znode *node;
+ reiser4_tree *tree;
+
+ assert("vs-49827", znode_contains_key_lock(insert_coord->node, key));
+ tree = znode_get_tree(insert_coord->node);
+ node = reiser4_new_node(insert_coord->node, LEAF_LEVEL);
+ if (IS_ERR(node))
+ return PTR_ERR(node);
+
+ /* setup delimiting keys for node being inserted */
+ write_lock_dk(tree);
+ znode_set_ld_key(node, key);
+ znode_set_rd_key(node, rdkey);
+ ON_DEBUG(node->creator = current);
+ ON_DEBUG(node->first_key = *key);
+ write_unlock_dk(tree);
+
+ ZF_SET(node, JNODE_ORPHAN);
+
+ /*
+ * allocate carry_pool, 3 carry_level-s, reiser4_item_data and
+ * carry_insert_data
+ */
+ pool = init_carry_pool(sizeof(*pool) + 3 * sizeof(*todo) +
+ sizeof(*item) + sizeof(*cdata));
+ if (IS_ERR(pool))
+ return PTR_ERR(pool);
+ todo = (carry_level *) (pool + 1);
+ init_carry_level(todo, pool);
+
+ item = (reiser4_item_data *) (todo + 3);
+ cdata = (carry_insert_data *) (item + 1);
+
+ op = reiser4_post_carry(todo, COP_INSERT, insert_coord->node, 0);
+ if (!IS_ERR(op)) {
+ cdata->coord = insert_coord;
+ cdata->key = key;
+ cdata->data = item;
+ op->u.insert.d = cdata;
+ op->u.insert.type = COPT_ITEM_DATA;
+ build_child_ptr_data(node, item);
+ item->arg = NULL;
+ /* have @insert_coord to be set at inserted item after
+ insertion is done */
+ todo->track_type = CARRY_TRACK_CHANGE;
+ todo->tracked = lh;
+
+ result = reiser4_carry(todo, NULL);
+ if (result == 0) {
+ /*
+ * pin node in memory. This is necessary for
+ * znode_make_dirty() below.
+ */
+ result = zload(node);
+ if (result == 0) {
+ lock_handle local_lh;
+
+ /*
+ * if we inserted new child into tree we have
+ * to mark it dirty so that flush will be able
+ * to process it.
+ */
+ init_lh(&local_lh);
+ result = longterm_lock_znode(&local_lh, node,
+ ZNODE_WRITE_LOCK,
+ ZNODE_LOCK_LOPRI);
+ if (result == 0) {
+ znode_make_dirty(node);
+
+ /*
+ * when internal item pointing to @node
+ * was inserted into twig node
+ * create_hook_internal did not connect
+ * it properly because its right
+ * neighbor was not known. Do it
+ * here
+ */
+ write_lock_tree();
+ assert("nikita-3312",
+ znode_is_right_connected(node));
+ assert("nikita-2984",
+ node->right == NULL);
+ ZF_CLR(node, JNODE_RIGHT_CONNECTED);
+ write_unlock_tree();
+ result =
+ connect_znode(insert_coord, node);
+ ON_DEBUG(if (result == 0) check_dkeys(node););
+
+ done_lh(lh);
+ move_lh(lh, &local_lh);
+ assert("vs-1676", node_is_empty(node));
+ coord_init_first_unit(insert_coord,
+ node);
+ } else {
+ warning("nikita-3136",
+ "Cannot lock child");
+ }
+ done_lh(&local_lh);
+ zrelse(node);
+ }
+ }
+ } else
+ result = PTR_ERR(op);
+ zput(node);
+ done_carry_pool(pool);
+ return result;
+}
+
+/**
+ * handle_eottl - handle extent-on-the-twig-level cases in tree traversal
+ * @h: search handle
+ * @outcome: flag saying whether search has to restart or is done
+ *
+ * Handles search on twig level. If this function completes search itself then
+ * it returns 1. If search has to go one level down then 0 is returned. If
+ * error happens then LOOKUP_DONE is returned via @outcome and error code is
+ * saved in @h->result.
+ */
+int handle_eottl(cbk_handle *h, int *outcome)
+{
+ int result;
+ reiser4_key key;
+ coord_t *coord;
+
+ coord = h->coord;
+
+ if (h->level != TWIG_LEVEL ||
+ (coord_is_existing_item(coord) && item_is_internal(coord))) {
+ /* Continue to traverse tree downward. */
+ return 0;
+ }
+
+ /*
+ * make sure that @h->coord is set to twig node and that it is either
+ * set to extent item or after extent item
+ */
+ assert("vs-356", h->level == TWIG_LEVEL);
+ assert("vs-357", ({
+ coord_t lcoord;
+ coord_dup(&lcoord, coord);
+ check_me("vs-733", coord_set_to_left(&lcoord) == 0);
+ item_is_extent(&lcoord);
+ }
+ ));
+
+ if (*outcome == NS_FOUND) {
+ /* we have found desired key on twig level in extent item */
+ h->result = CBK_COORD_FOUND;
+ *outcome = LOOKUP_DONE;
+ return 1;
+ }
+
+ if (!(h->flags & CBK_FOR_INSERT)) {
+ /* tree traversal is not for insertion. Just return
+ CBK_COORD_NOTFOUND. */
+ h->result = CBK_COORD_NOTFOUND;
+ *outcome = LOOKUP_DONE;
+ return 1;
+ }
+
+ /* take a look at the item to the right of h -> coord */
+ result = is_next_item_internal(coord, h->key, h->active_lh);
+ if (unlikely(result < 0)) {
+ h->error = "get_right_neighbor failed";
+ h->result = result;
+ *outcome = LOOKUP_DONE;
+ return 1;
+ }
+ if (result == 0) {
+ /*
+ * item to the right is also an extent one. Allocate a new node
+ * and insert pointer to it after item h -> coord.
+ *
+ * This is a result of extents being located at the twig
+ * level. For explanation, see comment just above
+ * is_next_item_internal().
+ */
+ znode *loaded;
+
+ if (cbk_lock_mode(h->level, h) != ZNODE_WRITE_LOCK) {
+ /*
+ * we got node read locked, restart coord_by_key to
+ * have write lock on twig level
+ */
+ h->lock_level = TWIG_LEVEL;
+ h->lock_mode = ZNODE_WRITE_LOCK;
+ *outcome = LOOKUP_REST;
+ return 1;
+ }
+
+ loaded = coord->node;
+ result =
+ add_empty_leaf(coord, h->active_lh, h->key,
+ rd_key(coord, &key));
+ if (result) {
+ h->error = "could not add empty leaf";
+ h->result = result;
+ *outcome = LOOKUP_DONE;
+ return 1;
+ }
+ /* added empty leaf is locked (h->active_lh), its parent node
+ is unlocked, h->coord is set as EMPTY */
+ assert("vs-13", coord->between == EMPTY_NODE);
+ assert("vs-14", znode_is_write_locked(coord->node));
+ assert("vs-15",
+ WITH_DATA(coord->node, node_is_empty(coord->node)));
+ assert("vs-16", jnode_is_leaf(ZJNODE(coord->node)));
+ assert("vs-17", coord->node == h->active_lh->node);
+ *outcome = LOOKUP_DONE;
+ h->result = CBK_COORD_NOTFOUND;
+ return 1;
+ } else if (result == 1) {
+ /*
+ * this is special case mentioned in the comment on
+ * tree.h:cbk_flags. We have found internal item immediately on
+ * the right of extent, and we are going to insert new item
+ * there. Key of item we are going to insert is smaller than
+ * leftmost key in the node pointed to by said internal item
+ * (otherwise search wouldn't come to the extent in the first
+ * place).
+ *
+ * This is a result of extents being located at the twig
+ * level. For explanation, see comment just above
+ * is_next_item_internal().
+ */
+ h->flags &= ~CBK_TRUST_DK;
+ } else {
+ assert("vs-8", result == 2);
+ *outcome = LOOKUP_REST;
+ return 1;
+ }
+ assert("vs-362", WITH_DATA(coord->node, item_is_internal(coord)));
+ return 0;
+}
+
+/*
+ * Local variables:
+ * c-indentation-style: "K&R"
+ * mode-name: "LC"
+ * c-basic-offset: 8
+ * tab-width: 8
+ * fill-column: 120
+ * scroll-step: 1
+ * End:
+ */
diff -urN --no-dereference linux-5.10.2.orig/fs/reiser4/estimate.c linux-5.10.2/fs/reiser4/estimate.c
--- linux-5.10.2.orig/fs/reiser4/estimate.c 1970-01-01 01:00:00.000000000 +0100
+++ linux-5.10.2/fs/reiser4/estimate.c 2020-12-23 16:07:46.116813099 +0100
@@ -0,0 +1,172 @@
+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
+ reiser4/README */
+
+#include "debug.h"
+#include "dformat.h"
+#include "tree.h"
+#include "carry.h"
+#include "inode.h"
+#include "plugin/cluster.h"
+#include "plugin/item/ctail.h"
+
+/* This returns how many nodes might get dirty and added if @children
+   nodes are dirtied.
+
+   The number of internal nodes which will get dirty or get allocated we
+   estimate as roughly 10% (103/1024) of the children + 1 balancing.
+   1 balancing is 2 neighbours, 2 new blocks and the current block on the
+   leaf level, 2 neighbour nodes + the current (or 1 neighbour and 1 new and
+   the current) on the twig level, 2 neighbour nodes on upper levels and 1
+   for a new root. So 5 for the leaf level, 3 for the twig level, 2 on upper
+   levels + 1 for the root.
+
+   Do not count the current node of the lowest level here - this is overhead
+   only.
+
+   children is almost always 1 here. The exception is flow insertion.
+*/
+static reiser4_block_nr
+max_balance_overhead(reiser4_block_nr children, tree_level tree_height)
+{
+	reiser4_block_nr ten_percent;
+
+	ten_percent = ((103 * children) >> 10);
+
+	/* If we have too many balancings at a time, tree height can grow by
+	   more than 1. Assume that if tree_height is 5, it can grow by 1 only.
+	 */
+	return ((tree_height < 5 ? 5 : tree_height) * 2 + (4 + ten_percent));
+}
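+
+/* A worked example (the numbers follow directly from the code above): for a
+   single dirtied child and a tree of height 4, ten_percent = (103 * 1) >> 10
+   = 0, so max_balance_overhead(1, 4) = 5 * 2 + (4 + 0) = 14, and
+   calc_estimate_one_insert(4) below reserves 1 + 14 = 15 blocks. */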
+
+/* this returns maximal possible number of nodes which can be modified plus
+ number of new nodes which can be required to perform insertion of one item
+ into the tree */
+/* it is only called when tree height changes, or gets initialized */
+reiser4_block_nr calc_estimate_one_insert(tree_level height)
+{
+ return 1 + max_balance_overhead(1, height);
+}
+
+reiser4_block_nr estimate_one_insert_item(reiser4_tree * tree)
+{
+ return tree->estimate_one_insert;
+}
+
+/* this returns maximal possible number of nodes which can be modified plus
+ number of new nodes which can be required to perform insertion of one unit
+ into an item in the tree */
+reiser4_block_nr estimate_one_insert_into_item(reiser4_tree * tree)
+{
+ /* estimate insert into item just like item insertion */
+ return tree->estimate_one_insert;
+}
+
+reiser4_block_nr estimate_one_item_removal(reiser4_tree * tree)
+{
+	/* on item removal reiser4 does not try to pack nodes more compactly,
+	   so only one node may be dirtied on the leaf level */
+ return tree->estimate_one_insert;
+}
+
+/* on leaf level insert_flow may add CARRY_FLOW_NEW_NODES_LIMIT new nodes and
+ dirty 3 existing nodes (insert point and both its neighbors).
+ Max_balance_overhead should estimate number of blocks which may change/get
+ added on internal levels */
+reiser4_block_nr estimate_insert_flow(tree_level height)
+{
+ return 3 + CARRY_FLOW_NEW_NODES_LIMIT + max_balance_overhead(3 +
+ CARRY_FLOW_NEW_NODES_LIMIT,
+ height);
+}
+
+/* returns the maximal number of nodes that can be occupied by a disk cluster */
+static reiser4_block_nr estimate_cluster(struct inode *inode, int unprepped)
+{
+ int per_cluster;
+ per_cluster = (unprepped ? 1 : cluster_nrpages(inode));
+ return 3 + per_cluster +
+ max_balance_overhead(3 + per_cluster,
+ REISER4_MAX_ZTREE_HEIGHT);
+}
+
+/* how many nodes might get dirty and added
+ during insertion of a disk cluster */
+reiser4_block_nr estimate_insert_cluster(struct inode *inode)
+{
+ return estimate_cluster(inode, 1); /* 24 */
+}
+
+/* how many nodes might get dirty and added
+ during update of a (prepped or unprepped) disk cluster */
+reiser4_block_nr estimate_update_cluster(struct inode *inode)
+{
+ return estimate_cluster(inode, 0); /* 44, for 64K-cluster */
+}
+
+/* How many nodes occupied by a disk cluster might get dirty.
+   Note that this estimation is not precise (i.e. a disk cluster
+   can occupy more nodes).
+   Q: Why don't we use a precise estimation?
+   A: 1. Because a precise estimation is fairly bad: 65536 nodes
+   for a 64K logical cluster means 256M of dead space on
+   a partition.
+   2. It is a very rare case when a disk cluster occupies more
+   nodes than this estimation returns.
+*/
+reiser4_block_nr estimate_dirty_cluster(struct inode *inode)
+{
+ return cluster_nrpages(inode) + 4;
+}
+
+/**
+ * How many meta-data blocks are needed to write @count
+ * data pages to a striped file by extents.
+ */
+reiser4_block_nr estimate_write_stripe_meta(int count)
+{
+ reiser4_tree *tree = meta_subvol_tree();
+/*
+ * to write @count data pages to a file by extents we have to
+ * reserve disk space for:
+ *
+ * 1. find_file_item() may have to insert empty node to the tree
+ * (empty leaf node between two extent items). This requires:
+ * (a) 1 block for the leaf node;
+ * (b) number of formatted blocks which are necessary to perform
+ * insertion of an internal item into twig level.
+ *
+ * 2. for each written page: the number of blocks which might be
+ *    necessary to insert into or paste to an extent item.
+ *
+ * 3. stat data update.
+ */
+ return 1 /* (1a) */ +
+ 2 * estimate_one_insert_item(tree) /* (1b) + (3) */ +
+ count * estimate_one_insert_into_item(tree) /* (2) */;
+}
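+
+/* For illustration only: assuming the meta-data tree has height <= 5, so that
+   tree->estimate_one_insert was set to calc_estimate_one_insert() == 15,
+   writing count == 8 pages reserves 1 + 2 * 15 + 8 * 15 = 151 blocks. */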
+
+/**
+ * How many meta-data blocks are needed to perform one iteration
+ * by migrate_extent()
+ */
+reiser4_block_nr estimate_migration_iter(void)
+{
+ reiser4_tree *tree = meta_subvol_tree();
+ /*
+ * 1 + estimate_one_insert_item(tree) for do_split_extent(),
+ * other is estimation above without stat-data update - for
+ * do_migrate_extent()
+ */
+ return 2 + 3 * estimate_one_insert_item(tree);
+}
+
+/* Make Linus happy.
+ Local variables:
+ c-indentation-style: "K&R"
+ mode-name: "LC"
+ c-basic-offset: 8
+ tab-width: 8
+ fill-column: 120
+ scroll-step: 1
+ End:
+*/
diff -urN --no-dereference linux-5.10.2.orig/fs/reiser4/export_ops.c linux-5.10.2/fs/reiser4/export_ops.c
--- linux-5.10.2.orig/fs/reiser4/export_ops.c 1970-01-01 01:00:00.000000000 +0100
+++ linux-5.10.2/fs/reiser4/export_ops.c 2020-12-23 16:07:46.116813099 +0100
@@ -0,0 +1,325 @@
+/* Copyright 2005 by Hans Reiser, licensing governed by
+ * reiser4/README */
+
+#include "inode.h"
+#include "plugin/plugin.h"
+
+/*
+ * Supported file-handle types
+ */
+typedef enum {
+ FH_WITH_PARENT = 0x10, /* file handle with parent */
+ FH_WITHOUT_PARENT = 0x11 /* file handle without parent */
+} reiser4_fhtype;
+
+#define NFSERROR (255)
+
+/* initialize place-holder for object */
+static void object_on_wire_init(reiser4_object_on_wire *o)
+{
+ o->plugin = NULL;
+}
+
+/* finish with @o */
+static void object_on_wire_done(reiser4_object_on_wire *o)
+{
+ if (o->plugin != NULL)
+ o->plugin->wire.done(o);
+}
+
+/*
+ * read serialized object identity from @addr and store information about
+ * object in @obj. This is dual to encode_inode().
+ */
+static char *decode_inode(struct super_block *s, char *addr,
+ reiser4_object_on_wire * obj)
+{
+ file_plugin *fplug;
+
+	/* identifier of the object plugin is stored in the first two bytes,
+	 * followed by the plugin-specific encoding of the object identity */
+ fplug = file_plugin_by_disk_id((d16 *) addr);
+ if (fplug != NULL) {
+ addr += sizeof(d16);
+ obj->plugin = fplug;
+ assert("nikita-3520", fplug->wire.read != NULL);
+ /* plugin specific encoding of object identity. */
+ addr = fplug->wire.read(addr, obj);
+ } else
+ addr = ERR_PTR(RETERR(-EINVAL));
+ return addr;
+}
+
+static struct dentry *reiser4_get_dentry(struct super_block *super,
+ void *data);
+/**
+ * reiser4_decode_fh: decode on-wire object - helper function
+ * for fh_to_dentry, fh_to_parent export operations;
+ * @super: super block;
+ * @addr: on-wire object to be decoded;
+ *
+ * Returns dentry referring to the object being decoded.
+ */
+static struct dentry *reiser4_decode_fh(struct super_block * super,
+ char * addr)
+{
+ reiser4_object_on_wire object;
+
+ object_on_wire_init(&object);
+
+ addr = decode_inode(super, addr, &object);
+ if (!IS_ERR(addr)) {
+ struct dentry *d;
+ d = reiser4_get_dentry(super, &object);
+ if (d != NULL && !IS_ERR(d))
+ /* FIXME check for -ENOMEM */
+ reiser4_get_dentry_fsdata(d)->stateless = 1;
+ addr = (char *)d;
+ }
+ object_on_wire_done(&object);
+ return (void *)addr;
+}
+
+static struct dentry *reiser4_fh_to_dentry(struct super_block *sb,
+ struct fid *fid,
+ int fh_len, int fh_type)
+{
+ reiser4_context *ctx;
+ struct dentry *d;
+
+ assert("edward-1536",
+ fh_type == FH_WITH_PARENT || fh_type == FH_WITHOUT_PARENT);
+
+ ctx = reiser4_init_context(sb);
+ if (IS_ERR(ctx))
+ return (struct dentry *)ctx;
+
+ d = reiser4_decode_fh(sb, (char *)fid->raw);
+
+ reiser4_exit_context(ctx);
+ return d;
+}
+
+static struct dentry *reiser4_fh_to_parent(struct super_block *sb,
+ struct fid *fid,
+ int fh_len, int fh_type)
+{
+ char * addr;
+ struct dentry * d;
+ reiser4_context *ctx;
+ file_plugin *fplug;
+
+ if (fh_type == FH_WITHOUT_PARENT)
+ return NULL;
+ assert("edward-1537", fh_type == FH_WITH_PARENT);
+
+ ctx = reiser4_init_context(sb);
+ if (IS_ERR(ctx))
+ return (struct dentry *)ctx;
+ addr = (char *)fid->raw;
+ /* extract 2-bytes file plugin id */
+ fplug = file_plugin_by_disk_id((d16 *)addr);
+ if (fplug == NULL) {
+ d = ERR_PTR(RETERR(-EINVAL));
+ goto exit;
+ }
+ addr += sizeof(d16);
+ /* skip previously encoded object */
+ addr = fplug->wire.read(addr, NULL /* skip */);
+ if (IS_ERR(addr)) {
+ d = (struct dentry *)addr;
+ goto exit;
+ }
+	/* extract and decode parent object */
+ d = reiser4_decode_fh(sb, addr);
+ exit:
+ reiser4_exit_context(ctx);
+ return d;
+}
+
+/*
+ * Object serialization support.
+ *
+ * To support knfsd, the file system provides export_operations that are used
+ * to construct and interpret NFS file handles. As a generalization of this,
+ * reiser4 object plugins have serialization support: they provide methods to
+ * create an on-wire representation of the identity of a reiser4 object, and
+ * to re-create/locate an object given its on-wire identity.
+ */
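+
+/*
+ * The on-wire layout produced by encode_inode() (below) and consumed by
+ * decode_inode() (above), as far as this file is concerned, is:
+ *
+ *   +----------------+----------------------------------+
+ *   | d16 plugin id  | plugin-specific object identity  |
+ *   +----------------+----------------------------------+
+ *
+ * For FH_WITH_PARENT handles the parent's identity, encoded the same way,
+ * immediately follows the child's (see reiser4_encode_fh() and
+ * reiser4_fh_to_parent()).
+ */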
+
+/*
+ * return number of bytes that on-wire representation of @inode's identity
+ * consumes.
+ */
+static int encode_inode_size(struct inode *inode)
+{
+ assert("nikita-3514", inode != NULL);
+ assert("nikita-3515", inode_file_plugin(inode) != NULL);
+ assert("nikita-3516", inode_file_plugin(inode)->wire.size != NULL);
+
+ return inode_file_plugin(inode)->wire.size(inode) + sizeof(d16);
+}
+
+/*
+ * store on-wire representation of @inode's identity at the area beginning at
+ * @start.
+ */
+static char *encode_inode(struct inode *inode, char *start)
+{
+ assert("nikita-3517", inode != NULL);
+ assert("nikita-3518", inode_file_plugin(inode) != NULL);
+ assert("nikita-3519", inode_file_plugin(inode)->wire.write != NULL);
+
+ /*
+ * first, store two-byte identifier of object plugin, then
+ */
+ save_plugin_id(file_plugin_to_plugin(inode_file_plugin(inode)),
+ (d16 *) start);
+ start += sizeof(d16);
+ /*
+ * call plugin to serialize object's identity
+ */
+ return inode_file_plugin(inode)->wire.write(inode, start);
+}
+
+/**
+ * reiser4_encode_fh - encode_fh of export operations
+ * @inode: object to serialize;
+ * @fh: buffer to store the on-wire representation in;
+ * @lenp: on entry, capacity of @fh in 32-bit words; on exit, number of
+ *	  32-bit words actually used;
+ * @parent: parent object to serialize as well, or NULL;
+ *
+ * Returns the file handle type (FH_WITH_PARENT or FH_WITHOUT_PARENT), or
+ * NFSERROR (255) if the file handle cannot be stored.
+ */
+static int
+reiser4_encode_fh(struct inode *inode, __u32 *fh, int *lenp,
+ struct inode *parent)
+{
+ char *addr;
+ int need;
+ int delta;
+ int result;
+ bool need_parent;
+ reiser4_context *ctx;
+
+	/*
+	 * knfsd asks us to serialize @inode and, optionally, its
+	 * parent @parent (if it is non-NULL).
+	 *
+	 * encode_inode() and encode_inode_size() are used to build the
+	 * representation of the object and its parent. All the hard work is
+	 * done by object plugins.
+	 */
+ need_parent = (parent != NULL);
+ addr = (char *)fh;
+
+ need = encode_inode_size(inode);
+ if (need < 0)
+ return NFSERROR;
+ if (need_parent) {
+ delta = encode_inode_size(parent);
+ if (delta < 0)
+ return NFSERROR;
+ need += delta;
+ }
+
+ ctx = reiser4_init_context(inode->i_sb);
+ if (IS_ERR(ctx))
+ return PTR_ERR(ctx);
+
+ if (need <= sizeof(__u32) * (*lenp)) {
+ addr = encode_inode(inode, addr);
+ if (need_parent)
+ addr = encode_inode(parent, addr);
+
+ /* store in lenp number of 32bit words required for file
+ * handle. */
+ *lenp = (need + sizeof(__u32) - 1) >> 2;
+ result = need_parent ? FH_WITH_PARENT : FH_WITHOUT_PARENT;
+ } else
+		/* not enough space in the file handle */
+ result = NFSERROR;
+ reiser4_exit_context(ctx);
+ return result;
+}
+
+/**
+ * reiser4_get_dentry_parent - get_parent of export operations
+ * @child: dentry of the directory to find the parent of;
+ *
+ */
+static struct dentry *reiser4_get_dentry_parent(struct dentry *child)
+{
+ struct inode *dir;
+ dir_plugin *dplug;
+ struct dentry *result;
+ reiser4_context *ctx;
+
+ assert("nikita-3527", child != NULL);
+
+ dir = child->d_inode;
+ assert("nikita-3529", dir != NULL);
+
+ ctx = reiser4_init_context(dir->i_sb);
+ if (IS_ERR(ctx))
+ return (void *)ctx;
+
+ dplug = inode_dir_plugin(dir);
+ assert("nikita-3531", ergo(dplug != NULL, dplug->get_parent != NULL));
+
+ if (unlikely(dplug == NULL)) {
+ reiser4_exit_context(ctx);
+ return ERR_PTR(RETERR(-ENOTDIR));
+ }
+ result = dplug->get_parent(dir);
+ reiser4_exit_context(ctx);
+ return result;
+}
+
+/**
+ * reiser4_get_dentry - get_dentry of export operations
+ * @super: super block;
+ * @data: pointer to a reiser4_object_on_wire describing the object;
+ *
+ */
+static struct dentry *reiser4_get_dentry(struct super_block *super, void *data)
+{
+ reiser4_object_on_wire *o;
+
+ assert("nikita-3522", super != NULL);
+ assert("nikita-3523", data != NULL);
+ /*
+ * this is only supposed to be called by
+ *
+ * reiser4_decode_fh->find_exported_dentry
+ *
+ * so, reiser4_context should be here already.
+ */
+ assert("nikita-3526", is_in_reiser4_context());
+
+ o = (reiser4_object_on_wire *)data;
+ assert("nikita-3524", o->plugin != NULL);
+ assert("nikita-3525", o->plugin->wire.get != NULL);
+
+ return o->plugin->wire.get(super, o);
+}
+
+struct export_operations reiser4_export_operations = {
+ .encode_fh = reiser4_encode_fh,
+ .fh_to_dentry = reiser4_fh_to_dentry,
+ .fh_to_parent = reiser4_fh_to_parent,
+ .get_parent = reiser4_get_dentry_parent,
+};
+
+/*
+ * Local variables:
+ * c-indentation-style: "K&R"
+ * mode-name: "LC"
+ * c-basic-offset: 8
+ * tab-width: 8
+ * fill-column: 79
+ * End:
+ */
diff -urN --no-dereference linux-5.10.2.orig/fs/reiser4/flush.c linux-5.10.2/fs/reiser4/flush.c
--- linux-5.10.2.orig/fs/reiser4/flush.c 1970-01-01 01:00:00.000000000 +0100
+++ linux-5.10.2/fs/reiser4/flush.c 2020-12-23 16:07:46.116813099 +0100
@@ -0,0 +1,3794 @@
+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
+ reiser4/README */
+
+/* The design document for this file is at http://www.namesys.com/v4/v4.html. */
+
+#include "forward.h"
+#include "debug.h"
+#include "dformat.h"
+#include "key.h"
+#include "coord.h"
+#include "plugin/item/item.h"
+#include "plugin/plugin.h"
+#include "plugin/object.h"
+#include "txnmgr.h"
+#include "jnode.h"
+#include "znode.h"
+#include "block_alloc.h"
+#include "tree_walk.h"
+#include "carry.h"
+#include "tree.h"
+#include "vfs_ops.h"
+#include "inode.h"
+#include "page_cache.h"
+#include "wander.h"
+#include "super.h"
+#include "entd.h"
+#include "reiser4.h"
+#include "flush.h"
+#include "writeout.h"
+
+#include <asm/atomic.h>
+#include <linux/fs.h> /* for struct super_block */
+#include <linux/mm.h> /* for struct page */
+#include <linux/bio.h> /* for struct bio */
+#include <linux/pagemap.h>
+#include <linux/blkdev.h>
+
+/* IMPLEMENTATION NOTES */
+
+/* PARENT-FIRST: Some terminology: A parent-first traversal is a way of
+ assigning a total order to the nodes of the tree in which the parent is
+ placed before its children, which are ordered (recursively) in left-to-right
+ order. When we speak of a "parent-first preceder", it describes the node that
+ "came before in forward parent-first order". When we speak of a "parent-first
+ follower", it describes the node that "comes next in parent-first order"
+ (alternatively the node that "came before in reverse parent-first order").
+
+ The following pseudo-code prints the nodes of a tree in forward parent-first
+ order:
+
+ void parent_first (node)
+ {
+ print_node (node);
+ if (node->level > leaf) {
+ for (i = 0; i < num_children; i += 1) {
+ parent_first (node->child[i]);
+ }
+ }
+ }
+*/
+
+/* JUST WHAT ARE WE TRYING TO OPTIMIZE, HERE? The idea is to optimize block
+ allocation so that a left-to-right scan of the tree's data (i.e., the leaves
+ in left-to-right order) can be accomplished with sequential reads, which
+ results in reading nodes in their parent-first order. This is a
+ read-optimization aspect of the flush algorithm, and there is also a
+ write-optimization aspect, which is that we wish to make large sequential
+ writes to the disk by allocating or reallocating blocks so that they can be
+ written in sequence. Sometimes the read-optimization and write-optimization
+ goals conflict with each other, as we discuss in more detail below.
+*/
+
+/* STATE BITS: The flush code revolves around the state of the jnodes it covers.
+   Here are the relevant jnode->state bits and their relevance to flush:
+
+ JNODE_DIRTY: If a node is dirty, it must be flushed. But in order to be
+ written it must be allocated first. In order to be considered allocated,
+ the jnode must have exactly one of { JNODE_OVRWR, JNODE_RELOC } set. These
+ two bits are exclusive, and all dirtied jnodes eventually have one of these
+ bits set during each transaction.
+
+ JNODE_CREATED: The node was freshly created in its transaction and has no
+ previous block address, so it is unconditionally assigned to be relocated,
+ although this is mainly for code-convenience. It is not being 'relocated'
+ from anything, but in almost every regard it is treated as part of the
+ relocate set. The JNODE_CREATED bit remains set even after JNODE_RELOC is
+ set, so the actual relocate can be distinguished from the
+ created-and-allocated set easily: relocate-set members (belonging to the
+ preserve-set) have (JNODE_RELOC) set and created-set members which have no
+ previous location to preserve have (JNODE_RELOC | JNODE_CREATED) set.
+
+ JNODE_OVRWR: The node belongs to atom's overwrite set. The flush algorithm
+ made the decision to maintain the pre-existing location for this node and
+ it will be written to the wandered-log.
+
+ JNODE_RELOC: The flush algorithm made the decision to relocate this block
+ (if it was not created, see note above). A block with JNODE_RELOC set is
+ eligible for early-flushing and may be submitted during flush_empty_queues.
+ When the JNODE_RELOC bit is set on a znode, the parent node's internal item
+ is modified and the znode is rehashed.
+
+   JNODE_SQUEEZABLE: Before shifting everything left, the flush algorithm
+   scans the node and calls the plugin->f.squeeze() method for its items. In
+   this way we update disk clusters of cryptcompress objects. Also, if the
+   leftmost point found by the flush scan has this flag set (races with
+   write(), a rare case), the flush algorithm decides to pass it to
+   squalloc() in spite of its flushprepped status - for squeezing, not for
+   repeated allocation.
+
+ JNODE_FLUSH_QUEUED: This bit is set when a call to flush enters the jnode
+ into its flush queue. This means the jnode is not on any clean or dirty
+ list, instead it is moved to one of the flush queue (see flush_queue.h)
+ object private list. This prevents multiple concurrent flushes from
+ attempting to start flushing from the same node.
+
+ (DEAD STATE BIT) JNODE_FLUSH_BUSY: This bit was set during the bottom-up
+ squeeze-and-allocate on a node while its children are actively being
+ squeezed and allocated. This flag was created to avoid submitting a write
+ request for a node while its children are still being allocated and
+   squeezed. Then the flush queue was re-implemented to allow an unlimited
+   number of nodes to be queued. This flag support was commented out in the
+   source code because
+ we decided that there was no reason to submit queued nodes before
+ jnode_flush() finishes. However, current code calls fq_write() during a
+ slum traversal and may submit "busy nodes" to disk. Probably we can
+ re-enable the JNODE_FLUSH_BUSY bit support in future.
+
+ With these state bits, we describe a test used frequently in the code below,
+ jnode_is_flushprepped()(and the spin-lock-taking jnode_check_flushprepped()).
+ The test for "flushprepped" returns true if any of the following are true:
+
+ - The node is not dirty
+ - The node has JNODE_RELOC set
+ - The node has JNODE_OVRWR set
+
+   If either the node is not dirty or it has already been processed by flush
+   (and assigned JNODE_OVRWR or JNODE_RELOC), then it is prepped. If
+   jnode_is_flushprepped() returns false then flush still has work to do on
+   that node.
+*/
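+
+/* In code terms, the "flushprepped" test described above amounts to the
+   following sketch (the real jnode_is_flushprepped()/jnode_check_flushprepped()
+   helpers are defined elsewhere in reiser4; the _check_ variant additionally
+   takes the jnode spin-lock):
+
+   static inline int flushprepped_sketch(jnode * node)
+   {
+	return !JF_ISSET(node, JNODE_DIRTY) ||
+	       JF_ISSET(node, JNODE_RELOC) ||
+	       JF_ISSET(node, JNODE_OVRWR);
+   }
+*/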
+
+/* FLUSH_PREP_ONCE_PER_TRANSACTION: Within a single transaction a node is never
+ flushprepped twice (unless an explicit call to flush_unprep is made as
+ described in detail below). For example a node is dirtied, allocated, and
+ then early-flushed to disk and set clean. Before the transaction commits, the
+ page is dirtied again and, due to memory pressure, the node is flushed again.
+ The flush algorithm will not relocate the node to a new disk location, it
+ will simply write it to the same, previously relocated position again.
+*/
+
+/* THE BOTTOM-UP VS. TOP-DOWN ISSUE: This code implements a bottom-up algorithm
+ where we start at a leaf node and allocate in parent-first order by iterating
+ to the right. At each step of the iteration, we check for the right neighbor.
+ Before advancing to the right neighbor, we check if the current position and
+ the right neighbor share the same parent. If they do not share the same
+ parent, the parent is allocated before the right neighbor.
+
+   This process goes recursively up the tree and squeezes nodes level by level
+   as long as the right neighbor and the current position have different
+   parents, then it allocates the right-neighbors-with-different-parents on the
+   way back down. This process is described in more detail in
+   flush_squalloc_changed_ancestor and the recursive function
+   squalloc_one_changed_ancestor. But the purpose here is not so much to
+   discuss the specifics of the bottom-up approach as it is to contrast the
+   bottom-up and top-down approaches.
+
+ The top-down algorithm was implemented earlier (April-May 2002). In the
+ top-down approach, we find a starting point by scanning left along each level
+ past dirty nodes, then going up and repeating the process until the left node
+ and the parent node are clean. We then perform a parent-first traversal from
+ the starting point, which makes allocating in parent-first order trivial.
+ After one subtree has been allocated in this manner, we move to the right,
+ try moving upward, then repeat the parent-first traversal.
+
+ Both approaches have problems that need to be addressed. Both are
+ approximately the same amount of code, but the bottom-up approach has
+ advantages in the order it acquires locks which, at the very least, make it
+ the better approach. At first glance each one makes the other one look
+ simpler, so it is important to remember a few of the problems with each one.
+
+ Main problem with the top-down approach: When you encounter a clean child
+ during the parent-first traversal, what do you do? You would like to avoid
+ searching through a large tree of nodes just to find a few dirty leaves at
+ the bottom, and there is not an obvious solution. One of the advantages of
+ the top-down approach is that during the parent-first traversal you check
+ every child of a parent to see if it is dirty. In this way, the top-down
+ approach easily handles the main problem of the bottom-up approach:
+ unallocated children.
+
+ The unallocated children problem is that before writing a node to disk we
+   must make sure that all of its children are allocated. Otherwise, writing
+ the node means extra I/O because the node will have to be written again when
+ the child is finally allocated.
+
+ WE HAVE NOT YET ELIMINATED THE UNALLOCATED CHILDREN PROBLEM. Except for bugs,
+ this should not cause any file system corruption, it only degrades I/O
+ performance because a node may be written when it is sure to be written at
+ least one more time in the same transaction when the remaining children are
+ allocated. What follows is a description of how we will solve the problem.
+*/
+
+/* HANDLING UNALLOCATED CHILDREN: During flush we may allocate a parent node,
+ then proceeding in parent first order, allocate some of its left-children,
+ then encounter a clean child in the middle of the parent. We do not allocate
+ the clean child, but there may remain unallocated (dirty) children to the
+ right of the clean child. If we were to stop flushing at this moment and
+ write everything to disk, the parent might still contain unallocated
+ children.
+
+ We could try to allocate all the descendents of every node that we allocate,
+ but this is not necessary. Doing so could result in allocating the entire
+ tree: if the root node is allocated then every unallocated node would have to
+ be allocated before flushing. Actually, we do not have to write a node just
+ because we allocate it. It is possible to allocate but not write a node
+ during flush, when it still has unallocated children. However, this approach
+ is probably not optimal for the following reason.
+
+ The flush algorithm is designed to allocate nodes in parent-first order in an
+ attempt to optimize reads that occur in the same order. Thus we are
+ read-optimizing for a left-to-right scan through all the leaves in the
+ system, and we are hoping to write-optimize at the same time because those
+ nodes will be written together in batch. What happens, however, if we assign
+ a block number to a node in its read-optimized order but then avoid writing
+ it because it has unallocated children? In that situation, we lose out on the
+   write-optimization aspect because a node will have to be written again to
+   its location on the device, later, which likely means seeking back to that
+ location.
+
+ So there are tradeoffs. We can choose either:
+
+ A. Allocate all unallocated children to preserve both write-optimization and
+ read-optimization, but this is not always desirable because it may mean
+ having to allocate and flush very many nodes at once.
+
+ B. Defer writing nodes with unallocated children, keep their read-optimized
+ locations, but sacrifice write-optimization because those nodes will be
+ written again.
+
+ C. Defer writing nodes with unallocated children, but do not keep their
+ read-optimized locations. Instead, choose to write-optimize them later, when
+ they are written. To facilitate this, we "undo" the read-optimized allocation
+ that was given to the node so that later it can be write-optimized, thus
+ "unpreparing" the flush decision. This is a case where we disturb the
+ FLUSH_PREP_ONCE_PER_TRANSACTION rule described above. By a call to
+ flush_unprep() we will: if the node was wandered, unset the JNODE_OVRWR bit;
+ if the node was relocated, unset the JNODE_RELOC bit, non-deferred-deallocate
+ its block location, and set the JNODE_CREATED bit, effectively setting the
+ node back to an unallocated state.
+
+ We will take the following approach in v4.0: for twig nodes we will always
+ finish allocating unallocated children (A). For nodes with (level > TWIG)
+ we will defer writing and choose write-optimization (C).
+
+ To summarize, there are several parts to a solution that avoids the problem
+ with unallocated children:
+
+   FIXME-ZAM: None of these approaches has been implemented yet to eliminate
+   the "UNALLOCATED CHILDREN" problem, because an experiment showed that we
+   only get 1-2 nodes with unallocated children for thousands of written
+   nodes. The experiment was simple, like copying/deleting the linux kernel
+   sources. However, the problem can arise in more complex tests. I think we
+   could use jnode_io_hook to insert a check for unallocated children and see
+   what kind of problem we have.
+
+ 1. When flush reaches a stopping point (e.g. a clean node) it should continue
+ calling squeeze-and-allocate on any remaining unallocated children.
+ FIXME: Difficulty to implement: should be simple -- amounts to adding a while
+ loop to jnode_flush, see comments in that function.
+
+ 2. When flush reaches flush_empty_queue(), some of the (level > TWIG) nodes
+ may still have unallocated children. If the twig level has unallocated
+ children it is an assertion failure. If a higher-level node has unallocated
+ children, then it should be explicitly de-allocated by a call to
+ flush_unprep().
+ FIXME: Difficulty to implement: should be simple.
+
+ 3. (CPU-Optimization) Checking whether a node has unallocated children may
+ consume more CPU cycles than we would like, and it is possible (but medium
+ complexity) to optimize this somewhat in the case where large sub-trees are
+ flushed. The following observation helps: if both the left- and
+ right-neighbor of a node are processed by the flush algorithm then the node
+ itself is guaranteed to have all of its children allocated. However, the cost
+ of this check may not be so expensive after all: it is not needed for leaves
+ and flush can guarantee this property for twigs. That leaves only (level >
+ TWIG) nodes that have to be checked, so this optimization only helps if at
+ least three (level > TWIG) nodes are flushed in one pass, and the savings
+ will be very small unless there are many more (level > TWIG) nodes. But if
+ there are many (level > TWIG) nodes then the number of blocks being written
+ will be very large, so the savings may be insignificant. That said, the idea
+ is to maintain both the left and right edges of nodes that are processed in
+ flush. When flush_empty_queue() is called, a relatively simple test will
+ tell whether the (level > TWIG) node is on the edge. If it is on the edge,
+ the slow check is necessary, but if it is in the interior then it can be
+ assumed to have all of its children allocated. FIXME: medium complexity to
+ implement, but simple to verify given that we must have a slow check anyway.
+
+ 4. (Optional) This part is optional, not for v4.0--flush should work
+ independently of whether this option is used or not. Called RAPID_SCAN, the
+ idea is to amend the left-scan operation to take unallocated children into
+ account. Normally, the left-scan operation goes left as long as adjacent
+ nodes are dirty up until some large maximum value (FLUSH_SCAN_MAXNODES) at
+ which point it stops and begins flushing. But scan-left may stop at a
+ position where there are unallocated children to the left with the same
+ parent. When RAPID_SCAN is enabled, the ordinary scan-left operation stops
+ after FLUSH_RELOCATE_THRESHOLD, which is much smaller than
+   FLUSH_SCAN_MAXNODES, then proceeds with a rapid scan. The rapid scan skips
+ all the interior children of a node--if the leftmost child of a twig is
+ dirty, check its left neighbor (the rightmost child of the twig to the left).
+ If the left neighbor of the leftmost child is also dirty, then continue the
+ scan at the left twig and repeat. This option will cause flush to allocate
+ more twigs in a single pass, but it also has the potential to write many more
+ nodes than would otherwise be written without the RAPID_SCAN option.
+ RAPID_SCAN was partially implemented, code removed August 12, 2002 by JMACD.
+*/
+
+/* FLUSH CALLED ON NON-LEAF LEVEL. Most of our design considerations assume that
+ the starting point for flush is a leaf node, but actually the flush code
+ cares very little about whether or not this is true. It is possible that all
+ the leaf nodes are flushed and dirty parent nodes still remain, in which case
+ jnode_flush() is called on a non-leaf argument. Flush doesn't care--it treats
+ the argument node as if it were a leaf, even when it is not. This is a simple
+ approach, and there may be a more optimal policy but until a problem with
+ this approach is discovered, simplest is probably best.
+
+ NOTE: In this case, the ordering produced by flush is parent-first only if
+ you ignore the leaves. This is done as a matter of simplicity and there is
+ only one (shaky) justification. When an atom commits, it flushes all leaf
+ level nodes first, followed by twigs, and so on. With flushing done in this
+ order, if flush is eventually called on a non-leaf node it means that
+ (somehow) we reached a point where all leaves are clean and only internal
+   nodes need to be flushed. If that is the case, then it means there were no
+ leaves that were the parent-first preceder/follower of the parent. This is
+ expected to be a rare case, which is why we do nothing special about it.
+ However, memory pressure may pass an internal node to flush when there are
+ still dirty leaf nodes that need to be flushed, which could prove our
+ original assumptions "inoperative". If this needs to be fixed, then
+ scan_left/right should have special checks for the non-leaf levels. For
+ example, instead of passing from a node to the left neighbor, it should pass
+ from the node to the left neighbor's rightmost descendent (if dirty).
+
+*/
+
+/* UNIMPLEMENTED AS YET: REPACKING AND RESIZING. We walk the tree in 4MB-16MB
+ chunks, dirtying everything and putting it into a transaction. We tell the
+ allocator to allocate the blocks as far as possible towards one end of the
+ logical device--the left (starting) end of the device if we are walking from
+ left to right, the right end of the device if we are walking from right to
+ left. We then make passes in alternating directions, and as we do this the
+ device becomes sorted such that tree order and block number order fully
+ correlate.
+
+ Resizing is done by shifting everything either all the way to the left or all
+ the way to the right, and then reporting the last block.
+*/
+
+/* RELOCATE DECISIONS: The code makes a decision to relocate in several places.
+   This describes the policy from the highest level:
+
+ The FLUSH_RELOCATE_THRESHOLD parameter: If we count this many consecutive
+ nodes on the leaf level during flush-scan (right, left), then we
+ unconditionally decide to relocate leaf nodes.
+
+ Otherwise, there are two contexts in which we make a decision to relocate:
+
+ 1. The REVERSE PARENT-FIRST context: Implemented in reverse_allocate
+ During the initial stages of flush, after scan-right completes, we want to
+ ask the question: should we relocate this leaf node and thus dirty the parent
+ node. Then if the node is a leftmost child its parent is its own parent-first
+ preceder, thus we repeat the question at the next level up, and so on. In
+ these cases we are moving in the reverse-parent first direction.
+
+ There is another case which is considered the reverse direction, which comes
+ at the end of a twig in reverse_relocate_end_of_twig(). As we finish
+ processing a twig we may reach a point where there is a clean twig to the
+ right with a dirty leftmost child. In this case, we may wish to relocate the
+ child by testing if it should be relocated relative to its parent.
+
+ 2. The FORWARD PARENT-FIRST context: Testing for forward relocation is done
+ in allocate_znode. What distinguishes the forward parent-first case from the
+ reverse-parent first case is that the preceder has already been allocated in
+ the forward case, whereas in the reverse case we don't know what the preceder
+ is until we finish "going in reverse". That simplifies the forward case
+ considerably, and there we actually use the block allocator to determine
+ whether, e.g., a block closer to the preceder is available.
+*/
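+
+/* In pseudo-code, the unconditional leaf-level part of this policy reduces to
+   the following sketch (the per-brick decision is actually made by
+   __leaf_should_relocate() on the statistics gathered during flush-scan):
+
+   if (left_scan.count + right_scan.count >= FLUSH_RELOCATE_THRESHOLD)
+	   relocate dirty leaves unconditionally;
+   else
+	   decide per node, in reverse_allocate() and allocate_znode();
+*/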
+
+/* SQUEEZE_LEFT_EDGE: Unimplemented idea for future consideration. The idea is,
+ once we finish scan-left and find a starting point, if the parent's left
+ neighbor is dirty then squeeze the parent's left neighbor and the parent.
+ This may change the flush-starting-node's parent. Repeat until the child's
+ parent is stable. If the child is a leftmost child, repeat this left-edge
+ squeezing operation at the next level up. Note that we cannot allocate
+   extents during this or they will be out of parent-first order. There are
+   also some difficult coordinate maintenance issues. We can't do a tree search
+   to find coordinates again (because we hold locks), we have to determine them
+ from the two nodes being squeezed. Looks difficult, but has potential to
+ increase space utilization. */
+
+static struct kmem_cache *_fbi_slab = NULL;
+
+/* Flush-scan helper functions. */
+static void scan_init(flush_scan * scan, flush_pos_t *pos);
+static void scan_done(flush_scan * scan);
+
+/* Flush-scan algorithm. */
+static int scan_left(flush_scan *scan, flush_scan *right, jnode *node);
+static int scan_right(flush_scan *scan, jnode *node);
+static int do_scan(flush_scan * scan, flush_scan * other);
+static int scan_formatted(flush_scan * scan);
+static int lock_parent_and_scan_upper_level(flush_scan * scan, flush_scan * other);
+static int scan_by_coord(flush_scan * scan);
+
+/* Initial flush-point ancestor allocation. */
+static int alloc_pos_and_ancestors(flush_pos_t *pos);
+static int alloc_one_ancestor(const coord_t *coord, flush_pos_t *pos);
+static int find_preceder(const coord_t *coord_in, flush_brick_info *fbi);
+static struct flush_brick_info *find_fbi(const struct rb_root *root,
+ u32 brick_id);
+static void insert_fbi(struct rb_root *root, flush_brick_info *this);
+static flush_brick_info *grab_fbi(struct rb_root *infos,
+ flush_brick_info *mfbi, u32 brick_id);
+static void init_fbi(flush_brick_info *fbi, u32 subv_id);
+static void done_all_fbi(struct rb_root *infos, struct flush_brick_info *mfbi);
+
+/* Main flush algorithm.
+ Note on abbreviation: "squeeze and allocate" == "squalloc". */
+static int squalloc(flush_pos_t *pos);
+static int squalloc_upper_levels(flush_pos_t *, znode *, znode *);
+
+/* Flush squeeze implementation. */
+static int squeeze_right_non_twig(znode * left, znode * right);
+static int shift_one_internal_unit(znode * left, znode * right);
+
+/* Parent re-allocation policy */
+static int check_parent_for_realloc(jnode * node,
+ const coord_t *parent_coord,
+ flush_pos_t *pos);
+
+/* Flush allocate write-queueing functions: */
+static int allocate_znode(znode * node, const coord_t *parent_coord,
+ flush_pos_t *pos);
+static int lock_parent_and_allocate_znode(znode *, flush_pos_t *);
+
+/* Flush helper functions: */
+static int jnode_lock_parent_coord(jnode * node,
+ coord_t *coord,
+ lock_handle * parent_lh,
+ load_count * parent_zh,
+ znode_lock_mode mode, int try);
+static int neighbor_in_slum(znode * node, lock_handle * right_lock, sideof side,
+ znode_lock_mode mode, int check_dirty, int expected);
+static int znode_same_parents(znode * a, znode * b);
+
+static int znode_check_flushprepped(znode * node)
+{
+ return jnode_check_flushprepped(ZJNODE(node));
+}
+static void update_znode_dkeys(znode * left, znode * right);
+
+/* Flush position functions */
+static void pos_init(flush_pos_t *pos);
+static int pos_valid(flush_pos_t *pos);
+static void pos_done(flush_pos_t *pos);
+static int pos_stop(flush_pos_t *pos);
+
+/* check that @org is first jnode extent unit, if extent is unallocated,
+ * because all jnodes of unallocated extent are dirty and of the same atom. */
+#define checkchild(scan) \
+assert("nikita-3435", \
+ ergo(scan->direction == LEFT_SIDE && \
+ (scan->parent_coord.node->level == TWIG_LEVEL) && \
+ jnode_is_unformatted(scan->node) && \
+ extent_is_unallocated(&scan->parent_coord), \
+ extent_unit_index(&scan->parent_coord) == index_jnode(scan->node)))
+
+/* This flush_cnt variable is used to track the number of concurrent flush
+ operations, useful for debugging. It is initialized in txnmgr.c out of
+ laziness (because flush has no static initializer function...) */
+ON_DEBUG(atomic_t flush_cnt;)
+
+/* check fs backing device for write congestion */
+static int check_write_congestion(void)
+{
+ struct super_block *sb;
+ struct backing_dev_info *bdi;
+
+ sb = reiser4_get_current_sb();
+ bdi = inode_to_bdi(reiser4_get_super_fake(sb));
+ return bdi_write_congested(bdi);
+}
+
+/* conditionally write flush queue */
+static int write_prepped_nodes(flush_pos_t *pos)
+{
+ int ret;
+
+ assert("zam-831", pos);
+ assert("zam-832", pos->fq);
+
+ if (!(pos->flags & JNODE_FLUSH_WRITE_BLOCKS))
+ return 0;
+
+ if (check_write_congestion())
+ return 0;
+
+ ret = reiser4_write_fq(pos->fq, pos->nr_written,
+ WRITEOUT_SINGLE_STREAM | WRITEOUT_FOR_PAGE_RECLAIM);
+ return ret;
+}
+
+/* Properly release all flush position resources, then move the flush position
+   to the new locked node */
+static void move_flush_pos(flush_pos_t *pos, lock_handle * new_lock,
+ load_count * new_load, const coord_t *new_coord)
+{
+ assert("zam-857", new_lock->node == new_load->node);
+
+ if (new_coord) {
+ assert("zam-858", new_coord->node == new_lock->node);
+ coord_dup(&pos->coord, new_coord);
+ } else {
+ coord_init_first_unit(&pos->coord, new_lock->node);
+ }
+
+ if (pos->child) {
+ jput(pos->child);
+ pos->child = NULL;
+ }
+
+ move_load_count(&pos->load, new_load);
+ done_lh(&pos->lock);
+ move_lh(&pos->lock, new_lock);
+}
+
+/* delete an empty node whose link from the parent still exists. */
+static inline int delete_empty_node(znode *node)
+{
+ reiser4_key smallest_removed;
+
+ assert("zam-1019", node != NULL);
+ assert("zam-1020", node_is_empty(node));
+ assert("zam-1023", znode_is_wlocked(node));
+
+ return reiser4_delete_node(node, &smallest_removed, NULL, 1);
+}
+
+/* Prepare flush position for alloc_pos_and_ancestors() and squalloc() */
+static int prepare_flush_pos(flush_pos_t *pos, jnode *leftmost)
+{
+ int ret;
+ load_count load;
+ lock_handle lock;
+
+ init_lh(&lock);
+ init_load_count(&load);
+
+ if (jnode_is_znode(leftmost)) {
+ ret = longterm_lock_znode(&lock, JZNODE(leftmost),
+ ZNODE_WRITE_LOCK, ZNODE_LOCK_HIPRI);
+ if (ret)
+ return ret;
+
+ ret = incr_load_count_znode(&load, JZNODE(leftmost));
+ if (ret)
+ return ret;
+
+ pos->state =
+ (jnode_get_level(leftmost) ==
+ LEAF_LEVEL) ? POS_ON_LEAF : POS_ON_INTERNAL;
+ move_flush_pos(pos, &lock, &load, NULL);
+ } else {
+ coord_t parent_coord;
+ ret = jnode_lock_parent_coord(leftmost, &parent_coord, &lock,
+ &load, ZNODE_WRITE_LOCK, 0);
+ if (ret)
+ goto done;
+ if (!item_is_extent(&parent_coord)) {
+ /*
+ * file was converted to tail,
+ * @leftmost became HEARD_BANSHEE,
+ * we found internal item
+ */
+ ret = -EAGAIN;
+ goto done;
+ }
+ pos->state = POS_ON_EPOINT;
+ move_flush_pos(pos, &lock, &load, &parent_coord);
+ pos->child = jref(leftmost);
+
+ if (extent_is_unallocated(&parent_coord) &&
+ extent_unit_index(&parent_coord) != index_jnode(leftmost)) {
+			/* @leftmost is not the first child of its parent unit.
+			   This may happen because the long term lock of its
+			   parent node was released between scan_left and
+			   scan_right. For now, work around this by having
+			   flush repeat */
+ ret = -EAGAIN;
+ }
+ }
+
+done:
+ done_load_count(&load);
+ done_lh(&lock);
+ return ret;
+}
+
+/**
+ * Make a relocation decision for a brick based on statistics accumulated
+ * by the flush scan for that brick
+ */
+int leaf_should_relocate(flush_pos_t *pos, u32 subv_id)
+{
+ flush_brick_info *fbi;
+
+ fbi = find_fbi(&pos->bricks_info, subv_id);
+ return fbi == NULL ? 0 : __leaf_should_relocate(fbi);
+}
+
+/* TODO LIST (no particular order): */
+/* I have labelled most of the legitimate FIXME comments in this file with
+ letters to indicate which issue they relate to. There are a few miscellaneous
+ FIXMEs with specific names mentioned instead that need to be
+ inspected/resolved. */
+/* B. There is an issue described in reverse_allocate having to do with an
+   imprecise is_preceder? check on partially-dirty extents. The
+ code that sets preceder hints and computes the preceder is basically
+ untested. Careful testing needs to be done that preceder calculations are
+ done correctly, since if it doesn't affect correctness we will not catch this
+ stuff during regular testing. */
+/* C. EINVAL, E_DEADLOCK, E_NO_NEIGHBOR, ENOENT handling. It is unclear which of
+ these are considered expected but unlikely conditions. Flush currently
+ returns 0 (i.e., success but no progress, i.e., restart) whenever it receives
+ any of these in jnode_flush(). Many of the calls that may produce one of
+ these return values (i.e., longterm_lock_znode, reiser4_get_parent,
+ reiser4_get_neighbor, ...) check some of these values themselves and, for
+ instance, stop flushing instead of resulting in a restart. If any of these
+ results are true error conditions then flush will go into a busy-loop, as we
+ noticed during testing when a corrupt tree caused find_child_ptr to return
+ ENOENT. It needs careful thought and testing of corner conditions.
+*/
+/* D. Atomicity of flush_prep against deletion and flush concurrency. Suppose a
+ created block is assigned a block number then early-flushed to disk. It is
+ dirtied again and flush is called again. Concurrently, that block is deleted,
+ and the de-allocation of its block number does not need to be deferred, since
+ it is not part of the preserve set (i.e., it didn't exist before the
+ transaction). I think there may be a race condition where flush writes the
+ dirty, created block after the non-deferred deallocated block number is
+ re-allocated, making it possible to write deleted data on top of non-deleted
+   data. It's just a theory, but it needs to be thought out. */
+/* F. bio_alloc() failure is not handled gracefully. */
+/* G. Unallocated children. */
+/* H. Add a WANDERED_LIST to the atom to clarify the placement of wandered
+ blocks. */
+/* I. Rename flush-scan to scan-point, (flush-pos to flush-point?) */
+
+/* JNODE_FLUSH: MAIN ENTRY POINT */
+/* This is the main entry point for flushing a jnode and its dirty neighborhood
+ (dirty neighborhood is named "slum"). Jnode_flush() is called if reiser4 has
+   to write dirty blocks to disk; this happens when the Linux VM decides to
+   reduce the number of dirty pages or as part of a transaction commit.
+
+ Our objective here is to prep and flush the slum the jnode belongs to. We
+ want to squish the slum together, and allocate the nodes in it as we squish
+ because allocation of children affects squishing of parents.
+
+ The "argument" @node tells flush where to start. From there, flush finds the
+ left edge of the slum, and calls squalloc (in which nodes are squeezed and
+ allocated). To find a "better place" to start squalloc first we perform a
+ flush_scan.
+
+ Flush-scanning may be performed in both left and right directions, but for
+ different purposes. When scanning to the left, we are searching for a node
+ that precedes a sequence of parent-first-ordered nodes which we will then
+ flush in parent-first order. During flush-scanning, we also take the
+ opportunity to count the number of consecutive leaf nodes. If this number is
+ past some threshold (FLUSH_RELOCATE_THRESHOLD), then we make a decision to
+ reallocate leaf nodes (thus favoring write-optimization).
+
+ Since the flush argument node can be anywhere in a sequence of dirty leaves,
+ there may also be dirty nodes to the right of the argument. If the scan-left
+ operation does not count at least FLUSH_RELOCATE_THRESHOLD nodes then we
+ follow it with a right-scan operation to see whether there is, in fact,
+ enough nodes to meet the relocate threshold. Each right- and left-scan
+ operation uses a single flush_scan object.
+
+ After left-scan and possibly right-scan, we prepare a flush_position object
+ with the starting flush point or parent coordinate, which was determined
+ using scan-left.
+
+ Next we call the main flush routine, squalloc, which iterates along the leaf
+ level, squeezing and allocating nodes (and placing them into the flush
+ queue).
+
+ After squalloc returns we take extra steps to ensure that all the children
+ of the final twig node are allocated--this involves repeating squalloc
+ until we finish at a twig with no unallocated children.
+
+ Finally, we call flush_empty_queue to submit write-requests to disk. If we
+ encounter any above-twig nodes during flush_empty_queue that still have
+ unallocated children, we flush_unprep them.
+
+ Flush treats several "failure" cases as non-failures, essentially causing
+ them to start over. E_DEADLOCK is one example.
+ FIXME:(C) EINVAL, E_NO_NEIGHBOR, ENOENT: these should probably be handled
+ properly rather than restarting, but there are a bunch of cases to audit.
+
+   We process not more than one meta-data and one data subvolume in one
+   jnode_flush() call. This is optimal, because file bodies in the tree are
+   sorted by subvolume IDs due to the special key assignment policy (see the
+   ->build_body_key() method of the file plugin). We store a pointer to the
+   data subvolume in the flush position (field data_subv). We don't keep track
+   of the meta-data subvolume, as there is only a single one per asymmetric LV
+   (other LV types are not supported for now).
+*/
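+
+/* In outline, the steps described above map onto the code below roughly as
+   follows (a reading aid only, not a separate algorithm):
+
+   scan_left();                  find the left edge of the slum
+   scan_right();                 only if the relocate threshold was not reached
+   prepare_flush_pos();          lock the starting point
+   alloc_pos_and_ancestors();    set the preceder, (re)allocate ancestors
+   squalloc();                   squeeze and allocate rightwards
+   write_prepped_nodes();        submit prepped nodes via the flush queue
+*/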
+
+static int jnode_flush(jnode *node, long nr_to_write, long *nr_written,
+ flush_queue_t *fq, int flags)
+{
+ long ret = 0;
+ flush_scan *right_scan;
+ flush_scan *left_scan;
+ flush_pos_t *flush_pos;
+ struct super_block *sb;
+ reiser4_super_info_data *sbinfo;
+ jnode *leftmost_in_slum = NULL;
+
+ assert("jmacd-76619", lock_stack_isclean(get_current_lock_stack()));
+ assert("nikita-3022", reiser4_schedulable());
+ assert("nikita-3185",
+ get_current_super_private()->delete_mutex_owner != current);
+
+ /* allocate right_scan, left_scan and flush_pos */
+ right_scan =
+ kmalloc(2 * sizeof(*right_scan) + sizeof(*flush_pos),
+ reiser4_ctx_gfp_mask_get());
+ if (right_scan == NULL)
+ return RETERR(-ENOMEM);
+ left_scan = right_scan + 1;
+ flush_pos = (flush_pos_t *) (left_scan + 1);
+
+ sb = reiser4_get_current_sb();
+ sbinfo = get_super_private(sb);
+
+ /* Flush-concurrency debug code */
+#if REISER4_DEBUG
+ atomic_inc(&flush_cnt);
+#endif
+
+ reiser4_enter_flush(sb);
+
+ /* Initialize a flush position. */
+ pos_init(flush_pos);
+
+ flush_pos->nr_written = nr_written;
+ flush_pos->fq = fq;
+ flush_pos->flags = flags;
+ flush_pos->nr_to_write = nr_to_write;
+
+ scan_init(right_scan, flush_pos);
+ scan_init(left_scan, flush_pos);
+
+ /* First scan left and remember the leftmost scan position. If the
+ leftmost position is unformatted we remember its parent_coord. We
+ scan until counting FLUSH_SCAN_MAXNODES.
+
+ If starting @node is unformatted, at the beginning of left scan its
+ parent (twig level node, containing extent item) will be long term
+ locked and lock handle will be stored in the
+ @right_scan->parent_lock. This lock is used to start the rightward
+ scan without redoing the tree traversal (necessary to find parent)
+ and, hence, is kept during leftward scan. As a result, we have to
+ use try-lock when taking long term locks during the leftward scan.
+ */
+ ret = scan_left(left_scan, right_scan, node);
+ if (ret != 0)
+ goto failed;
+ leftmost_in_slum = jref(left_scan->node);
+ scan_done(left_scan);
+
+ /* Then possibly go right to decide if we will use a policy of
+ relocating leaves. This is only done if we did not scan past (and
+ count) enough nodes during the leftward scan. If we do scan right,
+ we only care to go far enough to establish that at least
+ FLUSH_RELOCATE_THRESHOLD number of nodes are being flushed. The scan
+ limit is the difference between left_scan.count and the threshold. */
+
+ right_scan->max_count = left_scan->max_count - left_scan->count;
+ /*
+ * scan right is inherently deadlock prone, because we are
+ * (potentially) holding a lock on the twig node at this moment.
+ * FIXME: this is incorrect comment: lock is not held
+ */
+ if (right_scan->max_count > 0) {
+ ret = scan_right(right_scan, node);
+ if (ret != 0)
+ goto failed;
+ }
+ /*
+ * Only the right-scan count is needed, release any rightward locks
+ * right away
+ */
+ scan_done(right_scan);
+
+	/* Funny business here. We set the 'point' in the flush_position
+ prior to starting squalloc regardless of whether the first point is
+ formatted or unformatted. Without this there would be an invariant,
+ in the rest of the code, that if the flush_position is unformatted
+ then flush_position->point is NULL and
+ flush_position->parent_{lock,coord} is set, and if the flush_position
+ is formatted then flush_position->point is non-NULL and no parent
+ info is set.
+
+ This seems lazy, but it makes the initial calls to
+	   reverse_allocate (which ask "is pos->point the leftmost
+ child of its parent") much easier because we know the first child
+ already. Nothing is broken by this, but the reasoning is subtle.
+ Holding an extra reference on a jnode during flush can cause us to
+ see nodes with HEARD_BANSHEE during squalloc, because nodes are not
+ removed from sibling lists until they have zero reference count.
+ Flush would never observe a HEARD_BANSHEE node on the left-edge of
+ flush, nodes are only deleted to the right. So if nothing is broken,
+ why fix it?
+
+ NOTE-NIKITA actually, flush can meet HEARD_BANSHEE node at any
+ point and in any moment, because of the concurrent file system
+ activity (for example, truncate). */
+
+ /* Check jnode state after flush_scan completed. Having a lock on this
+ node or its parent (in case of unformatted) helps us in case of
+ concurrent flushing. */
+ if (jnode_check_flushprepped(leftmost_in_slum)
+ && !jnode_convertible(leftmost_in_slum)) {
+ ret = 0;
+ goto failed;
+ }
+
+ /* Now setup flush_pos using scan_left's endpoint. */
+ ret = prepare_flush_pos(flush_pos, leftmost_in_slum);
+ if (ret)
+ goto failed;
+
+ if (znode_get_level(flush_pos->coord.node) == LEAF_LEVEL
+ && node_is_empty(flush_pos->coord.node)) {
+ znode *empty = flush_pos->coord.node;
+
+ assert("zam-1022", !ZF_ISSET(empty, JNODE_HEARD_BANSHEE));
+ ret = delete_empty_node(empty);
+ goto failed;
+ }
+
+ if (jnode_check_flushprepped(leftmost_in_slum)
+ && !jnode_convertible(leftmost_in_slum)) {
+ ret = 0;
+ goto failed;
+ }
+
+ /* Set pos->preceder and (re)allocate pos and its ancestors if it is
+ needed */
+ ret = alloc_pos_and_ancestors(flush_pos);
+ if (ret)
+ goto failed;
+
+ /* Do the main rightward-bottom-up squeeze and allocate loop. */
+ ret = squalloc(flush_pos);
+
+ pos_stop(flush_pos);
+ if (ret)
+ goto failed;
+
+ /* FIXME_NFQUCMPD: Here, handle the twig-special case for unallocated
+ children. First, the pos_stop() and pos_valid() routines should be
+ modified so that pos_stop() sets a flush_position->stop flag to 1
+ without releasing the current position immediately--instead release
+ it in pos_done(). This is a better implementation than the current
+ one anyway.
+
+ It is not clear that all fields of the flush_position should not be
+ released, but at the very least the parent_lock, parent_coord, and
+	   parent_load should remain held because they hold the last twig
+ when pos_stop() is called.
+
+ When we reach this point in the code, if the parent_coord is set to
+ after the last item then we know that flush reached the end of a twig
+ (and according to the new flush queueing design, we will return now).
+ If parent_coord is not past the last item, we should check if the
+ current twig has any unallocated children to the right (we are not
+ concerned with unallocated children to the left--in that case the
+ twig itself should not have been allocated). If the twig has
+ unallocated children to the right, set the parent_coord to that
+ position and then repeat the call to squalloc.
+
+ Testing for unallocated children may be defined in two ways: if any
+ internal item has a fake block number, it is unallocated; if any
+ extent item is unallocated then all of its children are unallocated.
+ But there is a more aggressive approach: if there are any dirty
+ children of the twig to the right of the current position, we may
+ wish to relocate those nodes now. Checking for potential relocation
+ is more expensive as it requires knowing whether there are any dirty
+ children that are not unallocated. The extent_needs_allocation should
+ be used after setting the correct preceder.
+
+ When we reach the end of a twig at this point in the code, if the
+ flush can continue (when the queue is ready) it will need some
+ information on the future starting point. That should be stored away
+ in the flush_handle using a seal, I believe. Holding a jref() on the
+ future starting point may break other code that deletes that node.
+ */
+
+ /* FIXME_NFQUCMPD: Also, we don't want to do any flushing when flush is
+ called above the twig level. If the VM calls flush above the twig
+ level, do nothing and return (but figure out why this happens). The
+ txnmgr should be modified to only flush its leaf-level dirty list.
+ This will do all the necessary squeeze and allocate steps but leave
+ unallocated branches and possibly unallocated twigs (when the twig's
+ leftmost child is not dirty). After flushing the leaf level, the
+ remaining unallocated nodes should be given write-optimized
+ locations. (Possibly, the remaining unallocated twigs should be
+ allocated just before their leftmost child.)
+ */
+
+ /* Any failure reaches this point. */
+failed:
+
+ switch (ret) {
+ case -E_REPEAT:
+ case -EINVAL:
+ case -E_DEADLOCK:
+ case -E_NO_NEIGHBOR:
+ case -ENOENT:
+ /* FIXME(C): Except for E_DEADLOCK, these should probably be
+ handled properly in each case. They already are handled in
+ many cases. */
+ /* Something bad happened, but difficult to avoid... Try again!
+ */
+ ret = 0;
+ }
+
+ if (leftmost_in_slum)
+ jput(leftmost_in_slum);
+
+ pos_done(flush_pos);
+ scan_done(left_scan);
+ scan_done(right_scan);
+ kfree(right_scan);
+
+ ON_DEBUG(atomic_dec(&flush_cnt));
+
+ reiser4_leave_flush(sb);
+
+ return ret;
+}
+
+/*
+ * The reiser4 flush subsystem can be switched into "rapid flush mode", which
+ * means that the flusher should submit all prepped nodes immediately, without
+ * keeping them in flush queues for a long time. The reason for rapid flush
+ * mode is to free memory as fast as possible.
+ */
+
+#if REISER4_USE_RAPID_FLUSH
+/**
+ * submit all prepped nodes if rapid flush mode is set,
+ * turn rapid flush mode off.
+ */
+static int rapid_flush(flush_pos_t *pos)
+{
+ if (!wbq_available())
+ return 0;
+
+ return write_prepped_nodes(pos);
+}
+#else
+#define rapid_flush(pos) (0)
+#endif /* REISER4_USE_RAPID_FLUSH */
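+
+/*
+ * rapid_flush() is called from the flush loops further below (squalloc() and
+ * handle_pos_on_formatted()) after each advance of the flush position: when
+ * wbq_available() says so, the nodes prepped so far are submitted right away
+ * via write_prepped_nodes().
+ */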
+
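+/*
+ * Pick the jnode that flush should start from: either @start, if it is not
+ * yet flushprepped, or the first dirty node on the atom's dirty lists that
+ * has not been prepped yet. Dirty nodes that are already prepped or under
+ * writeback are dealt with along the way (queued to @fq, moved to the
+ * writeback list, or made to wander). The returned jnode, if any, is left
+ * spin-locked; the caller is responsible for unlocking it.
+ */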
+static jnode *find_flush_start_jnode(jnode *start, txn_atom * atom,
+ flush_queue_t *fq, int *nr_queued,
+ int flags)
+{
+ jnode * node;
+
+ if (start != NULL) {
+ spin_lock_jnode(start);
+ if (!jnode_is_flushprepped(start)) {
+ assert("zam-1056", start->atom == atom);
+ node = start;
+ goto enter;
+ }
+ spin_unlock_jnode(start);
+ }
+ /*
+	 * In this loop we process all nodes that were already prepped (RELOC
+	 * or OVRWR) and then dirtied again. The atom spin lock is not released
+	 * until all dirty nodes are processed or a not yet prepped node is
+	 * found in the atom's dirty lists.
+ */
+ while ((node = find_first_dirty_jnode(atom, flags))) {
+ spin_lock_jnode(node);
+enter:
+ assert("zam-881", JF_ISSET(node, JNODE_DIRTY));
+ assert("zam-898", !JF_ISSET(node, JNODE_OVRWR));
+
+ if (JF_ISSET(node, JNODE_WRITEBACK)) {
+ /* move node to the end of atom's writeback list */
+ list_move_tail(&node->capture_link, ATOM_WB_LIST(atom));
+
+ /*
+			 * the jnode is not necessarily on a dirty list: if it
+			 * was dirtied while it was on a flush queue, it does
+			 * not get moved to a dirty list
+ */
+ ON_DEBUG(count_jnode(atom, node, NODE_LIST(node),
+ WB_LIST, 1));
+
+ } else if (jnode_is_znode(node)
+ && znode_above_root(JZNODE(node))) {
+ /*
+ * A special case for znode-above-root. The above-root
+ * (fake) znode is captured and dirtied when the tree
+ * height changes or when the root node is relocated.
+ * This causes atoms to fuse so that changes at the root
+ * are serialized. However, this node is never flushed.
+ * This special case used to be in lock.c to prevent the
+ * above-root node from ever being captured, but now
+ * that it is captured we simply prevent it from
+ * flushing. The log-writer code relies on this to
+ * properly log superblock modifications of the tree
+ * height.
+ */
+ jnode_make_wander_nolock(node);
+ } else if (JF_ISSET(node, JNODE_RELOC)) {
+ queue_jnode(fq, node);
+ ++(*nr_queued);
+ } else
+ break;
+
+ spin_unlock_jnode(node);
+ }
+ return node;
+}
+
+/**
+ * Flush some nodes of the current atom, usually a slum. Return -E_REPEAT if
+ * there are more nodes to flush; return 0 if the atom's dirty lists are empty
+ * (keeping the current atom locked); return other errors as they are.
+ */
+int flush_current_atom(int flags, long nr_to_write, long *nr_submitted,
+ txn_atom ** atom, jnode *start)
+{
+ reiser4_super_info_data *sinfo = get_current_super_private();
+ flush_queue_t *fq = NULL;
+ jnode *node;
+ int nr_queued;
+ int ret;
+
+ assert("zam-889", atom != NULL && *atom != NULL);
+ assert_spin_locked(&((*atom)->alock));
+ assert("zam-892", get_current_context()->trans->atom == *atom);
+
+ BUG_ON(sb_rdonly(get_current_context()->super));
+
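+	/* Note: the caller's nr_to_write value is overridden below; LONG_MAX
+	 * is what actually gets passed down to jnode_flush(). */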
+ nr_to_write = LONG_MAX;
+ while (1) {
+ ret = reiser4_fq_by_atom(*atom, &fq);
+ if (ret != -E_REPEAT)
+ break;
+ *atom = get_current_atom_locked();
+ }
+ if (ret)
+ return ret;
+
+ assert_spin_locked(&((*atom)->alock));
+ /*
+ * parallel flushers limit
+ */
+ if (sinfo->tmgr.atom_max_flushers != 0) {
+ while ((*atom)->nr_flushers >= sinfo->tmgr.atom_max_flushers) {
+ /*
+			 * A reiser4_atom_send_event() call is inside
+ * reiser4_fq_put_nolock() which is called when
+ * flush is finished and nr_flushers is decremented.
+ */
+ reiser4_atom_wait_event(*atom);
+ *atom = get_current_atom_locked();
+ }
+ }
+ /*
+	 * count ourselves as a flusher
+ */
+ (*atom)->nr_flushers++;
+
+ writeout_mode_enable();
+
+ nr_queued = 0;
+ node = find_flush_start_jnode(start, *atom, fq, &nr_queued, flags);
+
+ if (node == NULL) {
+ if (nr_queued == 0) {
+ (*atom)->nr_flushers--;
+ reiser4_fq_put_nolock(fq);
+ reiser4_atom_send_event(*atom);
+ /* current atom remains locked */
+ writeout_mode_disable();
+ return 0;
+ }
+ spin_unlock_atom(*atom);
+ } else {
+ jref(node);
+ assert("edward-2138", jnode_get_super(node) != NULL);
+ BUG_ON((*atom)->super != jnode_get_super(node));
+ spin_unlock_atom(*atom);
+ spin_unlock_jnode(node);
+ BUG_ON(nr_to_write == 0);
+ ret = jnode_flush(node, nr_to_write, nr_submitted, fq, flags);
+ jput(node);
+ }
+ ret = reiser4_write_fq(fq, nr_submitted,
+ WRITEOUT_SINGLE_STREAM | WRITEOUT_FOR_PAGE_RECLAIM);
+
+ *atom = get_current_atom_locked();
+ (*atom)->nr_flushers--;
+ reiser4_fq_put_nolock(fq);
+ reiser4_atom_send_event(*atom);
+ spin_unlock_atom(*atom);
+
+ writeout_mode_disable();
+
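+	/* A successful pass is reported as -E_REPEAT so that the caller keeps
+	 * calling us until the atom's dirty lists are empty (see the comment
+	 * above flush_current_atom()). */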
+ if (ret == 0)
+ ret = -E_REPEAT;
+
+ return ret;
+}
+
+/**
+ * This function calls txmod->reverse_should_realloc_formatted() to make
+ * a reverse-parent-first relocation decision and then, if the decision is
+ * to relocate, marks the parent dirty.
+ */
+static int check_parent_for_realloc(jnode * node,
+ const coord_t *parent_coord,
+ flush_pos_t *pos)
+{
+ int ret;
+ reiser4_subvol *subv = node->subvol;
+
+ if (!JF_ISSET(ZJNODE(parent_coord->node), JNODE_DIRTY)) {
+ txmod_plugin *txmod_plug = txmod_plugin_by_id(subv->txmod);
+
+ if (!txmod_plug->reverse_should_realloc_formatted)
+ return 0;
+ ret = txmod_plug->reverse_should_realloc_formatted(node,
+ parent_coord,
+ pos);
+ if (ret < 0)
+ return ret;
+ /*
+ * FIXME-ZAM: if parent is already relocated -
+ * we do not want to grab space, right?
+ */
+ if (ret == 1) {
+ int grabbed;
+
+ grabbed = ctx_subvol_grabbed(get_current_context(),
+ subv->id);
+ if (reiser4_grab_space_force((__u64) 1,
+ BA_RESERVED, subv) != 0)
+ reiser4_panic("umka-1250",
+ "No space left during flush.");
+ assert("jmacd-18923",
+ znode_is_write_locked(parent_coord->node));
+ znode_make_dirty(parent_coord->node);
+ grabbed2free_mark(grabbed, subv);
+ }
+ }
+ return 0;
+}
+
+/* INITIAL ALLOCATE ANCESTORS STEP (REVERSE PARENT-FIRST ALLOCATION BEFORE
+ FORWARD PARENT-FIRST LOOP BEGINS) */
+
+/* Get the leftmost child for given coord. */
+static int get_leftmost_child_of_unit(const coord_t *coord, jnode ** child)
+{
+ int ret;
+
+ ret = item_utmost_child(coord, LEFT_SIDE, child);
+
+ if (ret)
+ return ret;
+
+ if (IS_ERR(*child))
+ return PTR_ERR(*child);
+
+ return 0;
+}
+
+/* This step occurs after the left- and right-scans are completed, before
+ starting the forward parent-first traversal. Here we attempt to allocate
+ ancestors of the starting flush point, which means continuing in the reverse
+ parent-first direction to the parent, grandparent, and so on (as long as the
+ child is a leftmost child). This routine calls a recursive process,
+ alloc_one_ancestor, which does the real work, except there is special-case
+ handling here for the first ancestor, which may be a twig. At each level
+ (here and alloc_one_ancestor), we check for relocation and then, if the child
+ is a leftmost child, repeat at the next level. On the way back down (the
+ recursion), we allocate the ancestors in parent-first order. */
+static int alloc_pos_and_ancestors(flush_pos_t *pos)
+{
+ int ret = 0;
+ lock_handle plock;
+ load_count pload;
+ coord_t pcoord;
+
+ if (znode_check_flushprepped(pos->lock.node))
+ return 0;
+
+ coord_init_invalid(&pcoord, NULL);
+ init_lh(&plock);
+ init_load_count(&pload);
+
+ if (pos->state == POS_ON_EPOINT) {
+ /* a special case for pos on twig level, where we already have
+ a lock on parent node. */
+ /* The parent may not be dirty, in which case we should decide
+		   whether to relocate the child now. If the decision is made to
+ relocate the child, the parent is marked dirty. */
+ ret = check_parent_for_realloc(pos->child, &pos->coord, pos);
+ if (ret)
+ goto exit;
+
+ /* FIXME_NFQUCMPD: We only need to allocate the twig (if child
+ is leftmost) and the leaf/child, so recursion is not needed.
+ Levels above the twig will be allocated for
+ write-optimization before the transaction commits. */
+
+ /* Do the recursive step, allocating zero or more of our
+ * ancestors. */
+ ret = alloc_one_ancestor(&pos->coord, pos);
+ } else {
+ if (!znode_is_root(pos->lock.node)) {
+ /*
+ * all formatted nodes except tree root
+ */
+ ret = reiser4_get_parent(&plock, pos->lock.node,
+ ZNODE_WRITE_LOCK);
+ if (ret)
+ goto exit;
+
+ ret = incr_load_count_znode(&pload, plock.node);
+ if (ret)
+ goto exit;
+
+ ret = find_child_ptr(plock.node, pos->lock.node, &pcoord);
+ if (ret)
+ goto exit;
+
+ ret = check_parent_for_realloc(ZJNODE(pos->lock.node),
+ &pcoord,
+ pos);
+ if (ret)
+ goto exit;
+
+ ret = alloc_one_ancestor(&pcoord, pos);
+ if (ret)
+ goto exit;
+ }
+ ret = allocate_znode(pos->lock.node, &pcoord, pos);
+ }
+exit:
+ done_load_count(&pload);
+ done_lh(&plock);
+ return ret;
+}
+
+/* This is the recursive step described in alloc_pos_and_ancestors, above.
+ Ignoring the call to find_preceder, which is the next function described, this
+ checks if the child is a leftmost child and returns if it is not. If the
+ child is a leftmost child it checks for relocation, possibly dirtying the
+ parent. Then it performs the recursive step. */
+static int alloc_one_ancestor(const coord_t *coord, flush_pos_t *pos)
+{
+ int ret = 0;
+ lock_handle alock;
+ load_count aload;
+ coord_t acoord;
+ /*
+	 * As we ascend along the left edge of the region to flush, take this
+	 * opportunity at the twig level to find our parent-first preceder.
+	 * This preceder will be used to allocate the formatted node we will
+	 * start descending from. Other nodes on the way back will be allocated
+	 * using the updated preceder.
+ */
+ if (pos->mfbi.preceder.blk == 0) {
+ ret = find_preceder(coord, &pos->mfbi);
+ if (ret != 0)
+ return ret;
+ }
+ /*
+ * If the ancestor is clean or already allocated, or
+ * if the child is not a leftmost child, stop going up,
+ * even leaving coord->node not flushprepped
+ */
+ if (znode_check_flushprepped(coord->node) ||
+ !coord_is_leftmost_unit(coord))
+ return 0;
+
+ init_lh(&alock);
+ init_load_count(&aload);
+ coord_init_invalid(&acoord, NULL);
+
+ /* Only ascend to the next level if it is a leftmost child, but
+ write-lock the parent in case we will relocate the child. */
+ if (!znode_is_root(coord->node)) {
+
+ ret =
+ jnode_lock_parent_coord(ZJNODE(coord->node), &acoord,
+ &alock, &aload, ZNODE_WRITE_LOCK,
+ 0);
+ if (ret != 0) {
+ /* FIXME(C): check EINVAL, E_DEADLOCK */
+ goto exit;
+ }
+
+ ret = check_parent_for_realloc(ZJNODE(coord->node),
+ &acoord, pos);
+ if (ret != 0)
+ goto exit;
+
+ /* Recursive call. */
+ if (!znode_check_flushprepped(acoord.node)) {
+ ret = alloc_one_ancestor(&acoord, pos);
+ if (ret)
+ goto exit;
+ }
+ }
+
+ /* Note: we call allocate with the parent write-locked (except at the
+ root) in case we relocate the child, in which case it will modify the
+ parent during this call. */
+ ret = allocate_znode(coord->node, &acoord, pos);
+
+exit:
+ done_load_count(&aload);
+ done_lh(&alock);
+ return ret;
+}
+
+/* During the reverse parent-first alloc_pos_and_ancestors process described
+ above there is a call to this function at the twig level. During
+ alloc_pos_and_ancestors we may ask: should this node be relocated (in reverse
+ parent-first context)? We repeat this process as long as the child is the
+ leftmost child, eventually reaching an ancestor of the flush point that is
+   not a leftmost child. The preceder of that ancestor, which is not a leftmost
+ child, is actually on the leaf level. The preceder of that block is the
+ left-neighbor of the flush point. The preceder of that block is the rightmost
+ child of the twig on the left. So, when alloc_pos_and_ancestors passes upward
+ through the twig level, it stops momentarily to remember the block of the
+ rightmost child of the twig on the left and sets it to the flush_position's
+ preceder_hint.
+
+ There is one other place where we may set the flush_position's preceder hint,
+ which is during scan-left.
+*/
+static int find_preceder(const coord_t *coord_in, flush_brick_info *fbi)
+{
+ int ret;
+ coord_t coord;
+ lock_handle left_lock;
+ load_count left_load;
+ reiser4_block_nr blk;
+
+ coord_dup(&coord, coord_in);
+
+ init_lh(&left_lock);
+ init_load_count(&left_load);
+ /*
+ * FIXME(B): Same FIXME as in "Find the preceder" in
+ * reverse_allocate. coord_is_leftmost_unit is not the
+ * right test if the unformatted child is in the middle
+ * of the first extent unit
+ */
+ if (!coord_is_leftmost_unit(&coord))
+ coord_prev_unit(&coord);
+ else {
+ ret = reiser4_get_left_neighbor(&left_lock, coord.node,
+ ZNODE_READ_LOCK,
+ GN_SAME_ATOM);
+ if (ret) {
+ /*
+ * If we fail for any reason it doesn't matter because
+ * the preceder is only a hint. We are low-priority at
+ * this point, so this must be the case
+ */
+ if (ret == -E_REPEAT ||
+ ret == -E_NO_NEIGHBOR ||
+ ret == -ENOENT ||
+ ret == -EINVAL ||
+ ret == -E_DEADLOCK)
+ ret = 0;
+ goto exit;
+ }
+ ret = incr_load_count_znode(&left_load, left_lock.node);
+ if (ret)
+ goto exit;
+
+ coord_init_last_unit(&coord, left_lock.node);
+ }
+ assert("edward-1849",
+ item_is_extent(&coord) || item_is_internal(&coord));
+
+ ret = item_utmost_child_real_block(&coord, RIGHT_SIDE, &blk);
+ if (ret)
+ goto exit;
+ if (find_data_subvol(&coord) != get_meta_subvol())
+ /*
+		 * We are looking for a preceder for a formatted node
+		 * on the twig level (that is, in the meta-data brick), whereas
+		 * the previous extent at @coord points to a different brick,
+		 * so no preceder is found.
+ */
+ goto exit;
+ fbi->preceder.blk = blk;
+ check_preceder(blk, get_meta_subvol());
+ exit:
+ done_load_count(&left_load);
+ done_lh(&left_lock);
+ return ret;
+}
+
+int shift_extent_left_complete(coord_t *to, reiser4_key *to_key, znode *left);
+
+/* MAIN SQUEEZE AND ALLOCATE LOOP (THREE BIG FUNCTIONS) */
+
+/* This procedure implements the outer loop of the flush algorithm. To put this
+ in context, here is the general list of steps taken by the flush routine as a
+ whole:
+
+ 1. Scan-left
+ 2. Scan-right (maybe)
+ 3. Allocate initial flush position and its ancestors
+ 4. <handle extents>
+   5. <squeeze and allocate next position and its ancestors to-the-right,
+ then update position to-the-right>
+ 6. <repeat from #4 until flush is stopped>
+
+ This procedure implements the loop in steps 4 through 6 in the above listing.
+
+ Step 4: if the current flush position is an extent item (position on the twig
+ level), it allocates the extent (allocate_extent_item_in_place) then shifts
+ to the next coordinate. If the next coordinate's leftmost child needs
+ flushprep, we will continue. If the next coordinate is an internal item, we
+   descend back to the leaf level, otherwise we repeat step #4 (labeled
+ ALLOC_EXTENTS below). If the "next coordinate" brings us past the end of the
+ twig level, then we call reverse_relocate_end_of_twig to possibly dirty the
+ next (right) twig, prior to step #5 which moves to the right.
+
+ Step 5: calls squalloc_changed_ancestors, which initiates a recursive call up
+ the tree to allocate any ancestors of the next-right flush position that are
+ not also ancestors of the current position. Those ancestors (in top-down
+ order) are the next in parent-first order. We squeeze adjacent nodes on the
+ way up until the right node and current node share the same parent, then
+ allocate on the way back down. Finally, this step sets the flush position to
+ the next-right node. Then repeat steps 4 and 5.
+*/
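+
+/* A rough map of how the steps above correspond to the flush position state
+   machine driven by squalloc() and the flush_pos_handlers[] table further
+   below (see the individual handlers for the exact conditions):
+
+     POS_ON_LEAF, POS_ON_INTERNAL -> handle_pos_on_formatted(); if no right
+                                     neighbor exists on the leaf level,
+                                     switch to POS_TO_TWIG
+     POS_TO_TWIG                  -> handle_pos_to_twig(); next item is an
+                                     extent -> POS_ON_EPOINT, past the last
+                                     item -> POS_END_OF_TWIG, otherwise stop
+     POS_ON_EPOINT                -> handle_pos_on_twig(); after the last
+                                     unit -> POS_END_OF_TWIG, internal
+                                     item -> POS_TO_LEAF, otherwise stop
+     POS_END_OF_TWIG              -> handle_pos_end_of_twig(); continue at
+                                     POS_ON_EPOINT or POS_TO_LEAF, or stop
+     POS_TO_LEAF                  -> handle_pos_to_leaf(); continue at
+                                     POS_ON_LEAF, or stop
+*/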
+
+/* SQUEEZE CODE */
+
+/* Copy as many of the leading extents from @right to @left, allocating
+ unallocated extents as they are copied. Returns SQUEEZE_TARGET_FULL or
+ SQUEEZE_SOURCE_EMPTY when no more can be shifted. If the next item is an
+ internal item it calls shift_one_internal_unit and may then return
+ SUBTREE_MOVED. */
+
+static int squeeze_right_twig(znode * left, znode * right, flush_pos_t *pos)
+{
+ int ret = SUBTREE_MOVED;
+ coord_t coord; /* used to iterate over items */
+ reiser4_key stop_key;
+ reiser4_tree *tree;
+ reiser4_subvol *subv;
+ txmod_plugin *txmod_plug;
+
+ assert("jmacd-2008", !node_is_empty(right));
+ assert("edward-1728", ZJNODE(right)->subvol == ZJNODE(left)->subvol);
+
+ subv = get_meta_subvol();
+ txmod_plug = txmod_plugin_by_id(subv->txmod);
+ coord_init_first_unit(&coord, right);
+ /*
+ * FIXME: can be optimized to cut once
+ */
+ while (!node_is_empty(coord.node) && item_is_extent(&coord)) {
+ /*
+ * ON_DEBUG(void *vp); FIXME-EDWARD: that shift check
+ * leads to false positives
+ */
+ assert("vs-1468", coord_is_leftmost_unit(&coord));
+ //ON_DEBUG(vp = shift_check_prepare(left, coord.node));
+ /*
+ * Allocate one extent (a unit of reiser4 extent item)
+ * in "squeeze context" and append it to the @left.
+ * stop_key is used to find what was copied and what
+ * to cut
+ */
+ stop_key = *reiser4_min_key();
+ ret = txmod_plug->squeeze_alloc_unformatted(left,
+ &coord, pos,
+ &stop_key);
+ if (ret != SQUEEZE_CONTINUE) {
+ //ON_DEBUG(kfree(vp));
+ break;
+ }
+ assert("vs-1465", !keyeq(&stop_key, reiser4_min_key()));
+ /*
+ * cut the original units from @right (to complete shifting)
+ */
+ set_key_offset(&stop_key, get_key_offset(&stop_key) - 1);
+ check_me("vs-1466",
+ shift_extent_left_complete(&coord, &stop_key, left) == 0);
+
+ //ON_DEBUG(shift_check(vp, left, coord.node));
+ }
+ /*
+ * @left and @right nodes participated in the
+ * implicit shift, determined by the pair of
+ * functions:
+ * . squeeze_alloc_unformatted() - copy unit from @right to @left
+ * . shift_extent_left_complete() - cut the original unit from @right
+ * so update their delimiting keys
+ */
+ tree = znode_get_tree(left);
+ write_lock_dk(tree);
+ update_znode_dkeys(left, right);
+ write_unlock_dk(tree);
+
+ if (node_is_empty(coord.node))
+ ret = SQUEEZE_SOURCE_EMPTY;
+
+ if (ret == SQUEEZE_TARGET_FULL)
+ goto out;
+
+ if (node_is_empty(right)) {
+ /* The whole right node was copied into @left. */
+ assert("vs-464", ret == SQUEEZE_SOURCE_EMPTY);
+ goto out;
+ }
+
+ coord_init_first_unit(&coord, right);
+
+ if (!item_is_internal(&coord)) {
+ /* we do not want to squeeze anything else to left neighbor
+ because "slum" is over */
+ ret = SQUEEZE_TARGET_FULL;
+ goto out;
+ }
+ assert("jmacd-433", item_is_internal(&coord));
+
+ /* Shift an internal unit. The child must be allocated before shifting
+ any more extents, so we stop here. */
+ ret = shift_one_internal_unit(left, right);
+
+out:
+ assert("jmacd-8612", ret < 0 || ret == SQUEEZE_TARGET_FULL
+ || ret == SUBTREE_MOVED || ret == SQUEEZE_SOURCE_EMPTY);
+
+ if (ret == SQUEEZE_TARGET_FULL) {
+ /* We submit prepped nodes here and expect that this @left twig
+ * will not be modified again during this jnode_flush() call. */
+ int ret1;
+
+ /* NOTE: seems like io is done under long term locks. */
+ ret1 = write_prepped_nodes(pos);
+ if (ret1 < 0)
+ return ret1;
+ }
+
+ return ret;
+}
+
+#if REISER4_DEBUG
+static void item_convert_invariant(flush_pos_t *pos)
+{
+ assert("edward-1225", coord_is_existing_item(&pos->coord));
+ if (convert_data_attached(pos)) {
+ item_plugin *iplug = item_convert_plug(pos);
+
+ assert("edward-1000",
+ iplug == item_plugin_by_coord(&pos->coord));
+ assert("edward-1001", iplug->f.convert != NULL);
+ } else
+ assert("edward-1226", pos->child == NULL);
+}
+#else
+
+#define item_convert_invariant(pos) noop
+
+#endif
+
+/*
+ * Scan all of the node's items and apply to each one
+ * its ->convert() method. This method may:
+ * . resize the item;
+ * . kill the item;
+ * . insert a group of items/nodes on the right,
+ * which possess the following properties:
+ * . all new nodes are dirty and not convertible;
+ * . for all new items ->convert() method is a noop.
+ *
+ * NOTE: this function makes the tree unbalanced!
+ * It is intended to be used by flush squalloc() in
+ * combination with the squeeze procedure.
+ *
+ * GLOSSARY
+ *
+ * Chained nodes and items.
+ * Two neighboring nodes @left and @right are chained,
+ * iff the last item of @left and the first item of @right
+ * belong to the same item cluster. In this case those
+ * items are called chained.
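+ *
+ * Example (for illustration): if the last item of @left and the first
+ * item of @right are both parts of one disk cluster,
+ *
+ *          @left                    @right
+ *   [ ... | cluster item ]   [ cluster item | ... ]
+ *
+ * then @left and @right are chained, and so are those two items.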
+ */
+static int convert_node(flush_pos_t *pos, znode * node)
+{
+ int ret = 0;
+ item_plugin *iplug;
+ assert("edward-304", pos != NULL);
+ assert("edward-305", pos->child == NULL);
+ assert("edward-475", znode_convertible(node));
+ assert("edward-669", znode_is_wlocked(node));
+ assert("edward-1210", !node_is_empty(node));
+
+ if (znode_get_level(node) != LEAF_LEVEL)
+ /* unsupported */
+ goto exit;
+
+ coord_init_first_unit(&pos->coord, node);
+
+ while (1) {
+ ret = 0;
+ coord_set_to_left(&pos->coord);
+ item_convert_invariant(pos);
+
+ iplug = item_plugin_by_coord(&pos->coord);
+ assert("edward-844", iplug != NULL);
+
+ if (iplug->f.convert) {
+ ret = iplug->f.convert(pos);
+ if (ret)
+ goto exit;
+ }
+ assert("edward-307", pos->child == NULL);
+
+ if (coord_next_item(&pos->coord)) {
+ /*
+ * node is over
+ */
+ if (convert_data_attached(pos))
+ /*
+ * the last item was convertible and
+				 * there still is an unprocessed flow
+ */
+ if (next_node_is_chained(pos)) {
+ /*
+ * next node contains items of
+ * the same disk cluster,
+ * so finish with this node
+ */
+ update_chaining_state(pos, 0/* move
+ to next
+ node */);
+ break;
+ }
+ else {
+ /*
+ * perform one more iteration
+ * for the same item and the
+ * rest of flow
+ */
+ update_chaining_state(pos, 1/* this
+ node */);
+ }
+ else
+ /*
+ * the last item wasn't convertible, or
+				 * convert data was detached in the last
+ * iteration,
+ * go to next node
+ */
+ break;
+ } else {
+ /*
+			 * Node is not over; the item position was advanced.
+ */
+ if (convert_data_attached(pos)) {
+ /*
+ * disk cluster should be increased, so roll
+ * one item position back and perform the
+ * iteration with the previous item and the
+ * rest of attached data
+ */
+ if (iplug != item_plugin_by_coord(&pos->coord))
+ set_item_convert_count(pos, 0);
+
+ ret = coord_prev_item(&pos->coord);
+ assert("edward-1003", !ret);
+
+ update_chaining_state(pos, 1/* this node */);
+ }
+ else
+ /*
+				 * previous item wasn't convertible, or
+				 * convert data was detached in the last
+ * iteration, go to next item
+ */
+ ;
+ }
+ }
+ JF_CLR(ZJNODE(node), JNODE_CONVERTIBLE);
+ znode_make_dirty(node);
+exit:
+ assert("edward-1004", !ret);
+ return ret;
+}
+
+/* Squeeze and allocate the right neighbor. This is called after @left and
+ its current children have been squeezed and allocated already. This
+   procedure's job is to squeeze items from @right to @left.
+
+ If at the leaf level, use the shift_everything_left memcpy-optimized
+   version of shifting (squeeze_right_non_twig).
+
+ If at the twig level, extents are allocated as they are shifted from @right
+   to @left (squeeze_right_twig).
+
+   At any other level, shift one internal item and return to the caller
+   so that the shifted subtree can be processed in
+ parent-first order.
+
+   When a unit of an internal item is moved, squeezing stops and SUBTREE_MOVED is
+ returned. When all content of @right is squeezed, SQUEEZE_SOURCE_EMPTY is
+ returned. If nothing can be moved into @left anymore, SQUEEZE_TARGET_FULL
+ is returned.
+*/
+
+static int squeeze_right_neighbor(flush_pos_t *pos, znode * left,
+ znode * right)
+{
+ int ret;
+
+	/* FIXME: it is possible to see an empty hasn't-heard-banshee node in
+	 * the tree owing to an error (for example, ENOSPC) in write */
+ /* assert("jmacd-9321", !node_is_empty(left)); */
+ assert("jmacd-9322", !node_is_empty(right));
+ assert("jmacd-9323", znode_get_level(left) == znode_get_level(right));
+
+ switch (znode_get_level(left)) {
+ case TWIG_LEVEL:
+ /* Shift with extent allocating until either an internal item
+ is encountered or everything is shifted or no free space
+ left in @left */
+ ret = squeeze_right_twig(left, right, pos);
+ break;
+
+ default:
+ /* All other levels can use shift_everything until we implement
+ per-item flush plugins. */
+ ret = squeeze_right_non_twig(left, right);
+ break;
+ }
+
+ assert("jmacd-2011", (ret < 0 ||
+ ret == SQUEEZE_SOURCE_EMPTY
+ || ret == SQUEEZE_TARGET_FULL
+ || ret == SUBTREE_MOVED));
+ return ret;
+}
+
+static int squeeze_right_twig_and_advance_coord(flush_pos_t *pos,
+ znode * right)
+{
+ int ret;
+
+ ret = squeeze_right_twig(pos->lock.node, right, pos);
+ if (ret < 0)
+ return ret;
+ if (ret > 0) {
+ coord_init_after_last_item(&pos->coord, pos->lock.node);
+ return ret;
+ }
+
+ coord_init_last_unit(&pos->coord, pos->lock.node);
+ return 0;
+}
+
+/* do a fast check for "same parents" condition before calling
+ * squalloc_upper_levels() */
+static inline int check_parents_and_squalloc_upper_levels(flush_pos_t *pos,
+ znode * left,
+ znode * right)
+{
+ if (znode_same_parents(left, right))
+ return 0;
+
+ return squalloc_upper_levels(pos, left, right);
+}
+
+/* Check whether the parent of the given @right node needs to be processed
+   ((re)allocated) prior to processing of the child. If @left and @right do
+   not share the same parent, then the parent of @right comes after @left but
+   before @right in parent-first order, so we have to (re)allocate it before
+   @right gets (re)allocated. */
+static int squalloc_upper_levels(flush_pos_t *pos, znode * left, znode * right)
+{
+ int ret;
+
+ lock_handle left_parent_lock;
+ lock_handle right_parent_lock;
+
+ load_count left_parent_load;
+ load_count right_parent_load;
+
+ init_lh(&left_parent_lock);
+ init_lh(&right_parent_lock);
+
+ init_load_count(&left_parent_load);
+ init_load_count(&right_parent_load);
+
+ ret = reiser4_get_parent(&left_parent_lock, left, ZNODE_WRITE_LOCK);
+ if (ret)
+ goto out;
+
+ ret = reiser4_get_parent(&right_parent_lock, right, ZNODE_WRITE_LOCK);
+ if (ret)
+ goto out;
+
+ /* Check for same parents */
+ if (left_parent_lock.node == right_parent_lock.node)
+ goto out;
+
+ if (znode_check_flushprepped(right_parent_lock.node)) {
+ /*
+		 * Keep parent-first order. In that order, the right parent
+		 * node stands before the @right node. If it is already
+		 * allocated, we set the preceder (the starting point of the
+		 * next block search) to its block number; the @right node
+		 * should be allocated after it.
+		 *
+		 * However, the preceder is set only if the right parent is on
+		 * the twig level. The explanation is the following: new branch
+		 * nodes are allocated over already allocated children while
+		 * the tree grows, so it is difficult to keep the tree ordered,
+		 * and we assume that only leaves and twigs are correctly
+		 * allocated. So, only twigs are used as a preceder for
+		 * allocating the rest of the slum
+ if (znode_get_level(right_parent_lock.node) == TWIG_LEVEL)
+ fbi_update_preceder(&pos->mfbi,
+ *znode_get_block(right_parent_lock.node));
+ goto out;
+ }
+ ret = incr_load_count_znode(&left_parent_load, left_parent_lock.node);
+ if (ret)
+ goto out;
+
+ ret = incr_load_count_znode(&right_parent_load, right_parent_lock.node);
+ if (ret)
+ goto out;
+
+ ret = squeeze_right_neighbor(pos, left_parent_lock.node,
+ right_parent_lock.node);
+ /*
+	 * We stop on error. We also stop if some items/units were shifted
+	 * (ret == 0) and thus @right changed its parent: it means we do not
+	 * have to process the right_parent node prior to processing of
+	 * @right. Positive return values say that no items were shifted
+	 * because of the "empty source" or "target full" conditions.
+ */
+ if (ret <= 0)
+ goto out;
+
+ /* parent(@left) and parent(@right) may have different parents also. We
+ * do a recursive call for checking that. */
+
+ ret = check_parents_and_squalloc_upper_levels(pos, left_parent_lock.node,
+ right_parent_lock.node);
+ if (ret)
+ goto out;
+
+ /* allocate znode when going down */
+ ret = lock_parent_and_allocate_znode(right_parent_lock.node, pos);
+
+out:
+ done_load_count(&left_parent_load);
+ done_load_count(&right_parent_load);
+
+ done_lh(&left_parent_lock);
+ done_lh(&right_parent_lock);
+
+ return ret;
+}
+
+/* Check the leftmost child's "flushprepped" status; also return true if the
+ * child node was not found in cache. */
+static int leftmost_child_of_unit_check_flushprepped(const coord_t *coord)
+{
+ int ret;
+ int prepped;
+
+ jnode *child;
+
+ ret = get_leftmost_child_of_unit(coord, &child);
+
+ if (ret)
+ return ret;
+
+ if (child) {
+ prepped = jnode_check_flushprepped(child);
+ jput(child);
+ } else {
+		/* We treat a non-existent child as a node to which slum
+		   processing should not continue. A node that is not cached
+		   is clean, so it is flushprepped. */
+ prepped = 1;
+ }
+
+ return prepped;
+}
+
+/* (re)allocate a znode, automatically getting and locking its parent node */
+static int lock_parent_and_allocate_znode(znode * node, flush_pos_t *pos)
+{
+ int ret;
+ lock_handle parent_lock;
+ load_count parent_load;
+ coord_t pcoord;
+
+ assert("zam-851", znode_is_write_locked(node));
+
+ init_lh(&parent_lock);
+ init_load_count(&parent_load);
+
+ ret = reiser4_get_parent(&parent_lock, node, ZNODE_WRITE_LOCK);
+ if (ret)
+ goto out;
+
+ ret = incr_load_count_znode(&parent_load, parent_lock.node);
+ if (ret)
+ goto out;
+
+ ret = find_child_ptr(parent_lock.node, node, &pcoord);
+ if (ret)
+ goto out;
+
+ ret = allocate_znode(node, &pcoord, pos);
+
+out:
+ done_load_count(&parent_load);
+ done_lh(&parent_lock);
+ return ret;
+}
+
+/*
+ * Process nodes on the leaf level until an unformatted node or
+ * the rightmost node in the slum is reached.
+ *
+ * This function is a complicated beast, because it calls
+ * convert_node() for every node, which, in turn, scans the
+ * node's items and does something for each of them.
+ */
+static int handle_pos_on_formatted(flush_pos_t *pos)
+{
+ int ret;
+ lock_handle right_lock;
+ load_count right_load;
+
+ init_lh(&right_lock);
+ init_load_count(&right_load);
+
+ if (znode_convertible(pos->lock.node)) {
+ ret = convert_node(pos, pos->lock.node);
+ if (ret)
+ return ret;
+ }
+ while (1) {
+ assert("edward-1635",
+ ergo(node_is_empty(pos->lock.node),
+ ZF_ISSET(pos->lock.node, JNODE_HEARD_BANSHEE)));
+ /*
+ * First of all, grab a right neighbor
+ */
+ if (convert_data(pos) && convert_data(pos)->right_locked) {
+ /*
+			 * the right neighbor was locked by convert_node();
+			 * transfer the lock from the "cache".
+ */
+ move_lh(&right_lock, &convert_data(pos)->right_lock);
+ done_lh(&convert_data(pos)->right_lock);
+ convert_data(pos)->right_locked = 0;
+ }
+ else {
+ ret = neighbor_in_slum(pos->lock.node, &right_lock,
+ RIGHT_SIDE, ZNODE_WRITE_LOCK,
+ 1, 0);
+ if (ret) {
+ /*
+				 * There is no right neighbor for some reason,
+ * so finish with this level.
+ */
+ assert("edward-1636",
+ !should_convert_right_neighbor(pos));
+ break;
+ }
+ }
+ /*
+ * Check "flushprepped" status of the right neighbor.
+ *
+ * We don't prep(allocate) nodes for flushing twice. This can be
+ * suboptimal, or it can be optimal. For now we choose to live
+ * with the risk that it will be suboptimal because it would be
+ * quite complex to code it to be smarter.
+ */
+ if (znode_check_flushprepped(right_lock.node)
+ && !znode_convertible(right_lock.node)) {
+ assert("edward-1005",
+ !should_convert_right_neighbor(pos));
+ pos_stop(pos);
+ break;
+ }
+ ret = incr_load_count_znode(&right_load, right_lock.node);
+ if (ret)
+ break;
+ if (znode_convertible(right_lock.node)) {
+ assert("edward-1643",
+ ergo(convert_data(pos),
+ convert_data(pos)->right_locked == 0));
+
+ ret = convert_node(pos, right_lock.node);
+ if (ret)
+ break;
+ }
+ else
+ assert("edward-1637",
+ !should_convert_right_neighbor(pos));
+
+ if (node_is_empty(pos->lock.node)) {
+ /*
+ * Current node became empty after conversion
+ * and, hence, was removed from the tree;
+			 * advance the current position to the right neighbor.
+ */
+ assert("edward-1638",
+ ZF_ISSET(pos->lock.node, JNODE_HEARD_BANSHEE));
+ move_flush_pos(pos, &right_lock, &right_load, NULL);
+ continue;
+ }
+ if (node_is_empty(right_lock.node)) {
+ assert("edward-1639",
+ ZF_ISSET(right_lock.node, JNODE_HEARD_BANSHEE));
+ /*
+ * The right neighbor became empty after
+			 * conversion, and hence it was deleted
+ * from the tree - skip this.
+ * Since current node is not empty,
+ * we'll obtain a correct pointer to
+ * the next right neighbor
+ */
+ done_load_count(&right_load);
+ done_lh(&right_lock);
+ continue;
+ }
+ /*
+		 * At this point both the current node and its right
+		 * neighbor are converted and not empty.
+ * Squeeze them _before_ going upward.
+ */
+ ret = squeeze_right_neighbor(pos, pos->lock.node,
+ right_lock.node);
+ if (ret < 0)
+ break;
+ if (node_is_empty(right_lock.node)) {
+ assert("edward-1640",
+ ZF_ISSET(right_lock.node, JNODE_HEARD_BANSHEE));
+ /*
+ * right neighbor was squeezed completely,
+ * and hence has been deleted from the tree.
+ * Skip this.
+ */
+ done_load_count(&right_load);
+ done_lh(&right_lock);
+ continue;
+ }
+ if (znode_check_flushprepped(right_lock.node)) {
+ if (should_convert_right_neighbor(pos)) {
+ /*
+ * in spite of flushprepped status of the node,
+ * its right slum neighbor should be converted
+ */
+ assert("edward-953", convert_data(pos));
+ assert("edward-954", item_convert_data(pos));
+
+ move_flush_pos(pos, &right_lock, &right_load, NULL);
+ continue;
+ } else {
+ pos_stop(pos);
+ break;
+ }
+ }
+ /*
+ * parent(right_lock.node) has to be processed before
+ * (right_lock.node) due to "parent-first" allocation
+ * order
+ */
+ ret = check_parents_and_squalloc_upper_levels(pos,
+ pos->lock.node,
+ right_lock.node);
+ if (ret)
+ break;
+ /*
+ * (re)allocate _after_ going upward
+ */
+ ret = lock_parent_and_allocate_znode(right_lock.node, pos);
+ if (ret)
+ break;
+ if (should_terminate_squalloc(pos)) {
+ set_item_convert_count(pos, 0);
+ break;
+ }
+ /*
+ * advance the flush position to the right neighbor
+ */
+ move_flush_pos(pos, &right_lock, &right_load, NULL);
+
+ ret = rapid_flush(pos);
+ if (ret)
+ break;
+ }
+ check_convert_info(pos);
+ done_load_count(&right_load);
+ done_lh(&right_lock);
+ /*
+ * This function indicates via pos whether to stop or go to twig or
+ * continue on current level
+ */
+ return ret;
+}
+
+/* Process nodes on the leaf level until an unformatted node or the rightmost
+ * node in the slum is reached. */
+static int handle_pos_on_leaf(flush_pos_t *pos)
+{
+ int ret;
+
+ assert("zam-845", pos->state == POS_ON_LEAF);
+
+ ret = handle_pos_on_formatted(pos);
+
+ if (ret == -E_NO_NEIGHBOR) {
+ /* cannot get right neighbor, go process extents. */
+ pos->state = POS_TO_TWIG;
+ return 0;
+ }
+
+ return ret;
+}
+
+/* Process slum on level > 1 */
+static int handle_pos_on_internal(flush_pos_t *pos)
+{
+ assert("zam-850", pos->state == POS_ON_INTERNAL);
+ return handle_pos_on_formatted(pos);
+}
+
+/* check whether squalloc should stop before processing given extent */
+static int squalloc_extent_should_stop(flush_pos_t *pos)
+{
+ assert("zam-869", item_is_extent(&pos->coord));
+
+	/* pos->child is the jnode that handle_pos_on_extent() should start
+	 * with instead of the first child of the first extent unit. */
+ if (pos->child) {
+ int prepped;
+
+ assert("vs-1383", jnode_is_unformatted(pos->child));
+ prepped = jnode_check_flushprepped(pos->child);
+ pos->pos_in_unit =
+ jnode_get_index(pos->child) -
+ extent_unit_index(&pos->coord);
+ assert("vs-1470",
+ pos->pos_in_unit < extent_unit_width(&pos->coord));
+ assert("nikita-3434",
+ ergo(extent_is_unallocated(&pos->coord),
+ pos->pos_in_unit == 0));
+ jput(pos->child);
+ pos->child = NULL;
+
+ return prepped;
+ }
+
+ pos->pos_in_unit = 0;
+ if (extent_is_unallocated(&pos->coord))
+ return 0;
+
+ return leftmost_child_of_unit_check_flushprepped(&pos->coord);
+}
+
+/* Handle the case when the regular reiser4 tree (znodes connected to their
+ * neighbors by sibling pointers) is interrupted on the leaf level by one or
+ * more unformatted nodes. By holding a lock on the twig level and using the
+ * extent code routines to process unformatted nodes, we swim around the
+ * irregular part of the reiser4 tree. */
+static int handle_pos_on_twig(flush_pos_t *pos)
+{
+ int ret;
+ reiser4_subvol *subv;
+ txmod_plugin *txmod_plug;
+
+ assert("zam-844", pos->state == POS_ON_EPOINT);
+ assert("zam-843", item_is_extent(&pos->coord));
+
+ subv = find_data_subvol(&pos->coord);
+ txmod_plug = txmod_plugin_by_id(subv->txmod);
+
+	/* Decide whether we should continue slum processing with the current
+	   extent unit: if the leftmost child of the current extent unit is
+	   flushprepped (i.e. clean or already processed by flush), we stop
+	   squalloc(). There is a fast check for unallocated extents, which we
+	   assume contain only not-yet-flushprepped nodes. */
+	/* FIXME: Here we implement a simple check; we only look at the
+	   leftmost child. */
+ ret = squalloc_extent_should_stop(pos);
+ if (ret != 0) {
+ pos_stop(pos);
+ return ret;
+ }
+ /*
+ * loop on the whole connected set of extents
+ */
+ while (pos_valid(pos) && coord_is_existing_unit(&pos->coord) &&
+ item_is_extent(&pos->coord)) {
+ ret = txmod_plug->forward_alloc_unformatted(pos);
+ if (ret)
+ break;
+ coord_next_unit(&pos->coord);
+ }
+ if (coord_is_after_rightmost(&pos->coord)) {
+ pos->state = POS_END_OF_TWIG;
+ return 0;
+ }
+ if (item_is_internal(&pos->coord)) {
+ pos->state = POS_TO_LEAF;
+ return 0;
+ }
+
+ assert("zam-860", item_is_extent(&pos->coord));
+
+ /* "slum" is over */
+ pos->state = POS_INVALID;
+ return 0;
+}
+
+/**
+ * When we are about to return the flush position from the twig to the leaf
+ * level, we can either process the right twig node or move the position to
+ * the leaf. This function processes the right twig if possible and jumps
+ * to the leaf level if not.
+ */
+static int handle_pos_end_of_twig(flush_pos_t *pos)
+{
+ int ret;
+ lock_handle right_lock;
+ load_count right_load;
+ coord_t at_right;
+ jnode *child = NULL;
+
+ assert("zam-848", pos->state == POS_END_OF_TWIG);
+ assert("zam-849", coord_is_after_rightmost(&pos->coord));
+
+ init_lh(&right_lock);
+ init_load_count(&right_load);
+ /*
+	 * We get a lock on the right twig node even if it is not dirty,
+	 * because the slum continues or discontinues on the leaf level, not
+	 * on the next twig. This lock on the right twig is needed for getting
+	 * its leftmost child.
+ */
+ ret = reiser4_get_right_neighbor(&right_lock, pos->lock.node,
+ ZNODE_WRITE_LOCK, GN_SAME_ATOM);
+ if (ret)
+ goto out;
+
+ ret = incr_load_count_znode(&right_load, right_lock.node);
+ if (ret)
+ goto out;
+
+ coord_init_first_unit(&at_right, right_lock.node);
+
+ if (JF_ISSET(ZJNODE(right_lock.node), JNODE_DIRTY)) {
+ /*
+		 * If the right twig node is dirty we always attempt to
+		 * squeeze its content to the left...
+ */
+ became_dirty:
+ ret = squeeze_right_twig_and_advance_coord(pos, right_lock.node);
+ if (ret <= 0) {
+ /*
+ * pos->coord is on internal item, go to leaf level, or
+ * we have an error which will be caught in squalloc()
+ */
+ pos->state = POS_TO_LEAF;
+ goto out;
+ }
+ /*
+ * If right twig was squeezed completely we have to re-lock
+ * right twig. Now it is done through the top-level squalloc
+ * routine
+ */
+ if (node_is_empty(right_lock.node))
+ goto out;
+ /*
+ * ... and prep it if it is not yet prepped
+ */
+ if (!znode_check_flushprepped(right_lock.node)) {
+ /*
+ * As usual, process parent before ...
+ */
+ ret = check_parents_and_squalloc_upper_levels(pos,
+								      pos->lock.node,
+ right_lock.node);
+ if (ret)
+ goto out;
+ /*
+ * ... processing the child
+ */
+ ret = lock_parent_and_allocate_znode(right_lock.node,
+ pos);
+ if (ret)
+ goto out;
+ }
+ } else {
+ /*
+ * right twig node is not dirty
+ */
+ coord_init_first_unit(&at_right, right_lock.node);
+ /*
+ * check first child of next twig, should we continue there ?
+ */
+ ret = get_leftmost_child_of_unit(&at_right, &child);
+ if (ret || child == NULL || jnode_check_flushprepped(child)) {
+ pos_stop(pos);
+ goto out;
+ }
+ /*
+ * check clean twig for possible relocation
+ */
+ if (!znode_check_flushprepped(right_lock.node)) {
+ ret = check_parent_for_realloc(child, &at_right, pos);
+ if (ret)
+ goto out;
+ if (JF_ISSET(ZJNODE(right_lock.node), JNODE_DIRTY))
+ goto became_dirty;
+ }
+ }
+ assert("zam-875", znode_check_flushprepped(right_lock.node));
+ /*
+	 * Update the preceder with the block number of the just processed
+	 * right twig node. The code above could miss the preceder update
+	 * because allocate_znode() might not have been called for this node
+ */
+ fbi_update_preceder(&pos->mfbi, *znode_get_block(right_lock.node));
+
+ coord_init_first_unit(&at_right, right_lock.node);
+ assert("zam-868", coord_is_existing_unit(&at_right));
+
+ pos->state = item_is_extent(&at_right) ? POS_ON_EPOINT : POS_TO_LEAF;
+ move_flush_pos(pos, &right_lock, &right_load, &at_right);
+ out:
+ done_load_count(&right_load);
+ done_lh(&right_lock);
+
+ if (child)
+ jput(child);
+ return ret;
+}
+
+/* Move pos->lock to the leaf node pointed to by pos->coord and check whether
+ * we should continue there. */
+static int handle_pos_to_leaf(flush_pos_t *pos)
+{
+ int ret;
+ lock_handle child_lock;
+ load_count child_load;
+ jnode *child;
+
+ assert("zam-846", pos->state == POS_TO_LEAF);
+ assert("zam-847", item_is_internal(&pos->coord));
+
+ init_lh(&child_lock);
+ init_load_count(&child_load);
+
+ ret = get_leftmost_child_of_unit(&pos->coord, &child);
+ if (ret)
+ return ret;
+ if (child == NULL) {
+ pos_stop(pos);
+ return 0;
+ }
+
+ if (jnode_check_flushprepped(child)) {
+ pos->state = POS_INVALID;
+ goto out;
+ }
+
+ ret =
+ longterm_lock_znode(&child_lock, JZNODE(child), ZNODE_WRITE_LOCK,
+ ZNODE_LOCK_LOPRI);
+ if (ret)
+ goto out;
+
+ ret = incr_load_count_znode(&child_load, JZNODE(child));
+ if (ret)
+ goto out;
+
+ ret = allocate_znode(JZNODE(child), &pos->coord, pos);
+ if (ret)
+ goto out;
+
+ /* move flush position to leaf level */
+ pos->state = POS_ON_LEAF;
+ move_flush_pos(pos, &child_lock, &child_load, NULL);
+
+ if (node_is_empty(JZNODE(child))) {
+ ret = delete_empty_node(JZNODE(child));
+ pos->state = POS_INVALID;
+ }
+out:
+ done_load_count(&child_load);
+ done_lh(&child_lock);
+ jput(child);
+
+ return ret;
+}
+
+/* move pos from leaf to twig, and move lock from leaf to twig. */
+/* Move pos->lock to upper (twig) level */
+static int handle_pos_to_twig(flush_pos_t *pos)
+{
+ int ret;
+
+ lock_handle parent_lock;
+ load_count parent_load;
+ coord_t pcoord;
+
+ assert("zam-852", pos->state == POS_TO_TWIG);
+
+ init_lh(&parent_lock);
+ init_load_count(&parent_load);
+
+ ret = reiser4_get_parent(&parent_lock,
+ pos->lock.node, ZNODE_WRITE_LOCK);
+ if (ret)
+ goto out;
+
+ ret = incr_load_count_znode(&parent_load, parent_lock.node);
+ if (ret)
+ goto out;
+
+ ret = find_child_ptr(parent_lock.node, pos->lock.node, &pcoord);
+ if (ret)
+ goto out;
+
+ assert("zam-870", item_is_internal(&pcoord));
+ coord_next_item(&pcoord);
+
+ if (coord_is_after_rightmost(&pcoord))
+ pos->state = POS_END_OF_TWIG;
+ else if (item_is_extent(&pcoord)) {
+ pos->state = POS_ON_EPOINT;
+ } else {
+ /*
+ * Here we understand that getting -E_NO_NEIGHBOR in
+		 * handle_pos_on_leaf() was because we simply reached
+		 * the edge of the slum
+ */
+ pos_stop(pos);
+ goto out;
+ }
+ move_flush_pos(pos, &parent_lock, &parent_load, &pcoord);
+out:
+ done_load_count(&parent_load);
+ done_lh(&parent_lock);
+ return ret;
+}
+
+typedef int (*pos_state_handle_t) (flush_pos_t *);
+static pos_state_handle_t flush_pos_handlers[] = {
+ /* process formatted nodes on leaf level, keep lock on a leaf node */
+ [POS_ON_LEAF] = handle_pos_on_leaf,
+ /* process unformatted nodes, keep lock on twig node, pos->coord points
+ * to extent currently being processed */
+ [POS_ON_EPOINT] = handle_pos_on_twig,
+ /* move a lock from leaf node to its parent for further processing of
+ unformatted nodes */
+ [POS_TO_TWIG] = handle_pos_to_twig,
+ /* move a lock from twig to leaf level when a processing of unformatted
+ * nodes finishes, pos->coord points to the leaf node we jump to */
+ [POS_TO_LEAF] = handle_pos_to_leaf,
+ /* this is called after processing last extent in the twig node.
+ * This handler attempts to shift items from the right neighbor (on the
+ * twig level) and process them while shifting. Specifically, for extent
+ * items extent allocation in the "squeeze context" is performed */
+ [POS_END_OF_TWIG] = handle_pos_end_of_twig,
+ /* process formatted nodes on internal level, keep lock on an internal
+ node */
+ [POS_ON_INTERNAL] = handle_pos_on_internal
+};
+
+/* Advance flush position horizontally, prepare for flushing ((re)allocate,
+ * squeeze, encrypt) nodes and their ancestors in "parent-first" order */
+static int squalloc(flush_pos_t *pos)
+{
+ int ret = 0;
+
+ /* maybe needs to be made a case statement with handle_pos_on_leaf as
+ * first case, for greater CPU efficiency? Measure and see.... -Hans */
+ while (pos_valid(pos)) {
+ ret = flush_pos_handlers[pos->state] (pos);
+ if (ret < 0)
+ break;
+
+ ret = rapid_flush(pos);
+ if (ret)
+ break;
+ }
+
+	/* Any positive value or -E_NO_NEIGHBOR is a legal return code for the
+	   handle_pos* routines; -E_NO_NEIGHBOR means that the slum edge was
+	   reached */
+ if (ret > 0 || ret == -E_NO_NEIGHBOR)
+ ret = 0;
+
+ return ret;
+}
+
+static void update_ldkey(znode * node)
+{
+ reiser4_key ldkey;
+
+ assert_rw_write_locked(&(znode_get_tree(node)->dk_lock));
+ if (node_is_empty(node))
+ return;
+
+ znode_set_ld_key(node, leftmost_key_in_node(node, &ldkey));
+}
+
+/* This is to be called after calling the node's shift method to shift data
+   from @right to @left. It sets the left delimiting keys of @left and @right
+   to the keys of the first items of @left and @right, respectively, and sets
+   the right delimiting key of @left to the first key of @right */
+static void update_znode_dkeys(znode * left, znode * right)
+{
+ assert_rw_write_locked(&(znode_get_tree(right)->dk_lock));
+ assert("vs-1629", (znode_is_write_locked(left) &&
+ znode_is_write_locked(right)));
+
+	/* we need to update the left delimiting key of @left if it was empty
+	   before the shift */
+ update_ldkey(left);
+ update_ldkey(right);
+ if (node_is_empty(right))
+ znode_set_rd_key(left, znode_get_rd_key(right));
+ else
+ znode_set_rd_key(left, znode_get_ld_key(right));
+}
+
+/*
+ * Try to shift everything from @right to @left. If everything was shifted,
+ * @right is removed from the tree. The result is the number of bytes shifted
+ */
+static int shift_everything_left(znode *right, znode *left, carry_level *todo)
+{
+ coord_t from;
+ node_plugin *nplug;
+ carry_plugin_info info;
+
+ coord_init_after_last_item(&from, right);
+
+ nplug = node_plugin_by_node(right);
+ info.doing = NULL;
+ info.todo = todo;
+ return nplug->shift(&from, left, SHIFT_LEFT,
+ 1, /* delete @right if it becomes empty */
+ 1, /* move coord @from to node @left if
+ everything will be shifted */
+ &info);
+}
+
+/* Shift as much as possible from @right to @left using the memcpy-optimized
+ shift_everything_left. @left and @right are formatted neighboring nodes on
+ leaf level. */
+static int squeeze_right_non_twig(znode * left, znode * right)
+{
+ int ret;
+ carry_pool *pool;
+ carry_level *todo;
+ reiser4_subvol *subv = znode_get_subvol(left);
+
+ assert("edward-1729", subv != NULL);
+ assert("nikita-2246", znode_get_level(left) == znode_get_level(right));
+ assert("edward-1730", znode_get_subvol(left) == znode_get_subvol(right));
+
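+	/* Squeezing is confined to the slum: if either node is clean, do not
+	 * shift anything and report the target as full. */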
+ if (!JF_ISSET(ZJNODE(left), JNODE_DIRTY) ||
+ !JF_ISSET(ZJNODE(right), JNODE_DIRTY))
+ return SQUEEZE_TARGET_FULL;
+
+ pool = init_carry_pool(sizeof(*pool) + 3 * sizeof(*todo));
+ if (IS_ERR(pool))
+ return PTR_ERR(pool);
+ todo = (carry_level *) (pool + 1);
+ init_carry_level(todo, pool);
+
+ ret = shift_everything_left(right, left, todo);
+ if (ret > 0) {
+ /* something was shifted */
+ reiser4_tree *tree;
+ __u64 grabbed;
+
+ znode_make_dirty(left);
+ znode_make_dirty(right);
+ /*
+		 * update delimiting keys of the nodes which participated in
+		 * the shift. FIXME: it would be better to have this in the
+		 * node's shift operation. But it cannot be done there. Nobody
+ * remembers why, though
+ */
+ tree = znode_get_tree(left);
+ write_lock_dk(tree);
+ update_znode_dkeys(left, right);
+ write_unlock_dk(tree);
+ /*
+ * Carry is called to update delimiting key and, maybe,
+ * to remove empty node
+ */
+ grabbed = ctx_subvol_grabbed(get_current_context(), subv->id);
+ ret = reiser4_grab_space_force(tree->height, BA_RESERVED, subv);
+ assert("nikita-3003", ret == 0); /* reserved space is
+ exhausted. Ask Hans */
+ ret = reiser4_carry(todo, NULL/* previous level */);
+ grabbed2free_mark(grabbed, subv);
+ } else {
+ /*
+		 * Shifting is impossible; return the appropriate result code
+ */
+ ret = node_is_empty(right) ? SQUEEZE_SOURCE_EMPTY :
+ SQUEEZE_TARGET_FULL;
+ }
+ done_carry_pool(pool);
+ return ret;
+}
+
+#if REISER4_DEBUG
+static int sibling_link_is_ok(const znode *left, const znode *right)
+{
+ int result;
+
+ read_lock_tree();
+ result = (left->right == right && left == right->left);
+ read_unlock_tree();
+ return result;
+}
+#endif
+
+/* Shift the first unit of the first item if it is an internal one. Return
+   SQUEEZE_TARGET_FULL if it fails to shift the item, otherwise return
+ SUBTREE_MOVED. */
+static int shift_one_internal_unit(znode * left, znode * right)
+{
+ int ret;
+ carry_pool *pool;
+ carry_level *todo;
+ coord_t *coord;
+ carry_plugin_info *info;
+ int size, moved;
+
+ assert("nikita-2247", znode_get_level(left) == znode_get_level(right));
+ assert("nikita-2435", znode_is_write_locked(left));
+ assert("nikita-2436", znode_is_write_locked(right));
+ assert("nikita-2434", sibling_link_is_ok(left, right));
+
+ pool = init_carry_pool(sizeof(*pool) + 3 * sizeof(*todo) +
+ sizeof(*coord) + sizeof(*info)
+#if REISER4_DEBUG
+ + sizeof(*coord) + 2 * sizeof(reiser4_key)
+#endif
+ );
+ if (IS_ERR(pool))
+ return PTR_ERR(pool);
+ todo = (carry_level *) (pool + 1);
+ init_carry_level(todo, pool);
+
+ coord = (coord_t *) (todo + 3);
+ coord_init_first_unit(coord, right);
+ info = (carry_plugin_info *) (coord + 1);
+
+#if REISER4_DEBUG
+ if (!node_is_empty(left)) {
+ coord_t *last;
+ reiser4_key *right_key;
+ reiser4_key *left_key;
+
+ last = (coord_t *) (info + 1);
+ right_key = (reiser4_key *) (last + 1);
+ left_key = right_key + 1;
+ coord_init_last_unit(last, left);
+
+ assert("nikita-2463",
+ keyle(item_key_by_coord(last, left_key),
+ item_key_by_coord(coord, right_key)));
+ }
+#endif
+
+ assert("jmacd-2007", item_is_internal(coord));
+
+ size = item_length_by_coord(coord);
+ info->todo = todo;
+ info->doing = NULL;
+
+ ret = node_plugin_by_node(left)->shift(coord, left, SHIFT_LEFT,
+ 1
+ /* delete @right if it becomes
+ empty */
+ ,
+ 0
+ /* do not move coord @coord to
+ node @left */
+ ,
+ info);
+
+ /* If shift returns positive, then we shifted the item. */
+ assert("vs-423", ret <= 0 || size == ret);
+ moved = (ret > 0);
+
+ if (moved) {
+ /* something was moved */
+ reiser4_tree *tree;
+ int grabbed;
+ reiser4_subvol *subv = znode_get_subvol(left);
+
+ znode_make_dirty(left);
+ znode_make_dirty(right);
+ tree = znode_get_tree(left);
+ write_lock_dk(tree);
+ update_znode_dkeys(left, right);
+ write_unlock_dk(tree);
+ /*
+ * reserve space for delimiting keys after shifting
+ */
+ grabbed = ctx_subvol_grabbed(get_current_context(), subv->id);
+ ret = reiser4_grab_space_force(tree->height, BA_RESERVED, subv);
+ assert("nikita-3003", ret == 0); /* reserved space is
+ exhausted. Ask Hans. */
+ ret = reiser4_carry(todo, NULL/* previous level */);
+ grabbed2free_mark(grabbed, subv);
+ }
+ done_carry_pool(pool);
+
+ if (ret != 0) {
+ /* Shift or carry operation failed. */
+ assert("jmacd-7325", ret < 0);
+ return ret;
+ }
+ return moved ? SUBTREE_MOVED : SQUEEZE_TARGET_FULL;
+}
+
+static int allocate_znode(znode *node,
+ const coord_t *parent_coord, flush_pos_t *pos)
+{
+ txmod_plugin *plug;
+
+ plug = txmod_plugin_by_id(get_meta_subvol()->txmod);
+ /*
+ * perform znode allocation with znode pinned in memory to avoid races
+ * with asynchronous emergency flush (which plays with
+ * JNODE_FLUSH_RESERVED bit).
+ */
+ return WITH_DATA(node, plug->forward_alloc_formatted(node,
+ parent_coord,
+ pos));
+}
+
+/* JNODE INTERFACE */
+
+/* Lock a node (if formatted) and then get its parent locked, set the child's
+ coordinate in the parent. If the child is the root node, the above_root
+ znode is returned but the coord is not set. This function may cause atom
+ fusion, but it is only used for read locks (at this point) and therefore
+ fusion only occurs when the parent is already dirty. */
+/* Hans adds this note: remember to ask how expensive this operation is vs.
+ storing parent pointer in jnodes. */
+static int
+jnode_lock_parent_coord(jnode * node,
+ coord_t *coord,
+ lock_handle * parent_lh,
+ load_count * parent_zh,
+ znode_lock_mode parent_mode, int try)
+{
+ int ret;
+
+ assert("edward-53", jnode_is_unformatted(node) || jnode_is_znode(node));
+ assert("edward-54", jnode_is_unformatted(node)
+ || znode_is_any_locked(JZNODE(node)));
+
+ if (!jnode_is_znode(node)) {
+ reiser4_key key;
+ tree_level stop_level = TWIG_LEVEL;
+ lookup_bias bias = FIND_EXACT;
+
+ assert("edward-168", !(jnode_get_type(node) == JNODE_BITMAP));
+ assert("edward-2163", !jnode_is_volinfo_head(node));
+
+		/* The case when the node is not a znode, but can have a parent
+		   coord (unformatted node, node which represents a cluster
+		   page, etc.). Generate a key for the appropriate entry, search
+ in the tree using coord_by_key, which handles locking for
+ us. */
+
+ /*
+ * nothing is locked at this moment, so, nothing prevents
+ * concurrent truncate from removing jnode from inode. To
+ * prevent this spin-lock jnode. jnode can be truncated just
+ * after call to the jnode_build_key(), but this is ok,
+ * because coord_by_key() will just fail to find appropriate
+ * extent.
+ */
+ spin_lock_jnode(node);
+ if (!JF_ISSET(node, JNODE_HEARD_BANSHEE)) {
+ jnode_build_key(node, &key);
+ ret = 0;
+ } else
+ ret = RETERR(-ENOENT);
+ spin_unlock_jnode(node);
+
+ if (ret != 0)
+ return ret;
+
+ if (jnode_is_cluster_page(node))
+ stop_level = LEAF_LEVEL;
+
+ assert("jmacd-1812", coord != NULL);
+
+ ret = coord_by_key(meta_subvol_tree(), &key, coord, parent_lh,
+ parent_mode, bias, stop_level, stop_level,
+ CBK_UNIQUE, NULL/*ra_info */);
+ switch (ret) {
+ case CBK_COORD_NOTFOUND:
+ assert("edward-1038",
+ ergo(jnode_is_cluster_page(node),
+ JF_ISSET(node, JNODE_HEARD_BANSHEE)));
+ if (!JF_ISSET(node, JNODE_HEARD_BANSHEE)) {
+ assert("edward-2164", 0);
+ warning("nikita-3177", "Parent not found");
+ }
+ return ret;
+ case CBK_COORD_FOUND:
+ if (coord->between != AT_UNIT) {
+ /* FIXME: comment needed */
+ done_lh(parent_lh);
+ if (!JF_ISSET(node, JNODE_HEARD_BANSHEE)) {
+ assert("edward-2366", 0);
+ warning("nikita-3178",
+ "Found but not happy: %i",
+ coord->between);
+ }
+ return RETERR(-ENOENT);
+ }
+ ret = incr_load_count_znode(parent_zh, parent_lh->node);
+ if (ret != 0)
+ return ret;
+ /* if (jnode_is_cluster_page(node)) {
+ races with write() are possible
+ check_child_cluster (parent_lh->node);
+ }
+ */
+ break;
+ default:
+ return ret;
+ }
+
+ } else {
+ int flags;
+ znode *z;
+
+ z = JZNODE(node);
+ /* Formatted node case: */
+ assert("jmacd-2061", !znode_is_root(z));
+
+ flags = GN_ALLOW_NOT_CONNECTED;
+ if (try)
+ flags |= GN_TRY_LOCK;
+
+ ret =
+ reiser4_get_parent_flags(parent_lh, z, parent_mode, flags);
+ if (ret != 0)
+ /* -E_REPEAT is ok here, it is handled by the caller. */
+ return ret;
+
+ /* Make the child's position "hint" up-to-date. (Unless above
+ root, which caller must check.) */
+ if (coord != NULL) {
+
+ ret = incr_load_count_znode(parent_zh, parent_lh->node);
+ if (ret != 0) {
+ warning("jmacd-976812386",
+ "incr_load_count_znode failed: %d",
+ ret);
+ return ret;
+ }
+
+ ret = find_child_ptr(parent_lh->node, z, coord);
+ if (ret != 0) {
+ warning("jmacd-976812",
+ "find_child_ptr failed: %d", ret);
+ return ret;
+ }
+ }
+ }
+
+ return 0;
+}
+
+/* Get the (locked) next neighbor of a znode which is dirty and a member of the
+ same atom. If there is no next neighbor or the neighbor is not in memory or
+ if there is a neighbor but it is not dirty or not in the same atom,
+   -E_NO_NEIGHBOR is returned. In some cases the slum may include nodes which
+   are not dirty; if so, @check_dirty should be 0 */
+static int neighbor_in_slum(znode * node, /* starting point */
+ lock_handle * lock, /* lock on starting point */
+ sideof side, /* left or right direction we
+ seek the next node in */
+ znode_lock_mode mode, /* kind of lock we want */
+ int check_dirty, /* true if the neighbor should
+ be dirty */
+ int use_upper_levels /* get neighbor by going though
+ upper levels */)
+{
+ int ret;
+ int flags;
+
+ assert("jmacd-6334", znode_is_connected(node));
+
+ flags = GN_SAME_ATOM | (side == LEFT_SIDE ? GN_GO_LEFT : 0);
+ if (use_upper_levels)
+ flags |= GN_CAN_USE_UPPER_LEVELS;
+
+ ret = reiser4_get_neighbor(lock, node, mode, flags);
+ if (ret) {
+ /* May return -ENOENT or -E_NO_NEIGHBOR. */
+ /* FIXME(C): check EINVAL, E_DEADLOCK */
+ if (ret == -ENOENT)
+ ret = RETERR(-E_NO_NEIGHBOR);
+ return ret;
+ }
+ if (!check_dirty)
+ return 0;
+ /* Check dirty bit of locked znode, no races here */
+ if (JF_ISSET(ZJNODE(lock->node), JNODE_DIRTY))
+ return 0;
+
+ done_lh(lock);
+ return RETERR(-E_NO_NEIGHBOR);
+}
+
+/* Return true if two znodes have the same parent. This is called with both
+ nodes write-locked (for squeezing) so no tree lock is needed. */
+static int znode_same_parents(znode * a, znode * b)
+{
+ int result;
+
+ assert("jmacd-7011", znode_is_write_locked(a));
+ assert("jmacd-7012", znode_is_write_locked(b));
+
+ /* We lock the whole tree for this check.... I really don't like whole
+ * tree locks... -Hans */
+ read_lock_tree();
+ result = (znode_parent(a) == znode_parent(b));
+ read_unlock_tree();
+ return result;
+}
+
+/* FLUSH SCAN */
+
+/* Initialize the flush_scan data structure. */
+static void scan_init(flush_scan * scan, flush_pos_t *pos)
+{
+ memset(scan, 0, sizeof(*scan));
+ init_lh(&scan->node_lock);
+ init_lh(&scan->parent_lock);
+ init_load_count(&scan->parent_load);
+ init_load_count(&scan->node_load);
+ coord_init_invalid(&scan->parent_coord, NULL);
+ scan->bricks_info = &pos->bricks_info;
+ scan->mfbi = &pos->mfbi;
+}
+
+/* Release any resources held by the flush scan, e.g. release locks,
+ free memory, etc. */
+static void scan_done(flush_scan * scan)
+{
+ done_load_count(&scan->node_load);
+ if (scan->node != NULL) {
+ jput(scan->node);
+ scan->node = NULL;
+ }
+ done_load_count(&scan->parent_load);
+ done_lh(&scan->parent_lock);
+ done_lh(&scan->node_lock);
+}
+
+/**
+ * Returns true if flush scanning has to be finished
+ */
+int reiser4_scan_finished(flush_scan *scan)
+{
+ return scan->stop || (scan->direction == RIGHT_SIDE &&
+ scan->count >= scan->max_count);
+}
+
+/**
+ * Return true if the scan should continue to the @tonode.
+ * True if the node meets the same_slum_check condition.
+ * If not, deref the "left" node and stop the scan
+ */
+int reiser4_scan_goto(flush_scan *scan, jnode *tonode)
+{
+ int go = same_slum_check(scan->node, tonode, 1, 0);
+
+ if (!go) {
+ scan->stop = 1;
+ jput(tonode);
+ }
+ return go;
+}
+
+/**
+ * Move scan position to @node:
+ * set scan->node to @node, refcount it, deref node at previous position,
+ * optionally copy the parent coordinate, increment count by the @add_count,
+ * which indicates number of processed nodes
+ */
+int move_scan_pos(flush_scan *scan, jnode *node,
+ unsigned add_count, const coord_t *parent)
+{
+ struct flush_brick_info *fbi = NULL;
+ /*
+ * Release the old references, take the new reference
+ */
+ done_load_count(&scan->node_load);
+
+ if (scan->node != NULL)
+ jput(scan->node);
+
+ fbi = grab_fbi(scan->bricks_info, scan->mfbi, node->subvol->id);
+ if (fbi == NULL)
+ return RETERR(-ENOMEM);
+
+ scan->node = node;
+
+ fbi->count += add_count;
+ scan->count += add_count;
+
+ /* This next stmt is somewhat inefficient. The reiser4_scan_extent()
+ code could delay this update step until it finishes and update the
+ parent_coord only once. It did that before, but there was a bug and
+ this was the easiest way to make it correct
+ */
+ if (parent != NULL)
+ coord_dup(&scan->parent_coord, parent);
+ /*
+ * Failure may happen at the incr_load_count call, but the caller can
+ * assume the reference is safely taken
+ */
+ return incr_load_count_jnode(&scan->node_load, node);
+}
+
+/* Return true if scanning in the leftward direction. */
+int reiser4_scanning_left(flush_scan * scan)
+{
+ return scan->direction == LEFT_SIDE;
+}
+
+/* Performs leftward scanning starting from either kind of node. Counts the
+ starting node. The right-scan object is passed in for the left-scan in order
+ to copy the parent of an unformatted starting position. This way we avoid
+ searching for the unformatted node's parent when scanning in each direction.
+   If we search for the parent once, it is set in both scan objects. The limit
+ parameter tells flush-scan when to stop.
+
+ Rapid scanning is used only during scan_left, where we are interested in
+ finding the 'leftpoint' where we begin flushing. We are interested in
+ stopping at the left child of a twig that does not have a dirty left
+ neighbour. THIS IS A SPECIAL CASE. The problem is finding a way to flush only
+ those nodes without unallocated children, and it is difficult to solve in the
+ bottom-up flushing algorithm we are currently using. The problem can be
+ solved by scanning left at every level as we go upward, but this would
+ basically bring us back to using a top-down allocation strategy, which we
+ already tried (see BK history from May 2002), and has a different set of
+ problems. The top-down strategy makes avoiding unallocated children easier,
+   but makes it difficult to properly flush dirty children with clean parents
+ that would otherwise stop the top-down flush, only later to dirty the parent
+ once the children are flushed. So we solve the problem in the bottom-up
+ algorithm with a special case for twigs and leaves only.
+
+ The first step in solving the problem is this rapid leftward scan. After we
+ determine that there are at least enough nodes counted to qualify for
+ FLUSH_RELOCATE_THRESHOLD we are no longer interested in the exact count, we
+ are only interested in finding the best place to start the flush.
+
+ We could choose one of two possibilities:
+
+ 1. Stop at the leftmost child (of a twig) that does not have a dirty left
+ neighbor. This requires checking one leaf per rapid-scan twig
+
+ 2. Stop at the leftmost child (of a twig) where there are no dirty children
+ of the twig to the left. This requires checking possibly all of the in-memory
+ children of each twig during the rapid scan.
+
+ For now we implement the first policy.
+*/
+static int scan_left(flush_scan *scan, flush_scan *right, jnode *node)
+{
+ int ret = 0;
+
+ scan->direction = LEFT_SIDE;
+
+ ret = move_scan_pos(scan, jref(node), 1, NULL);
+ if (ret != 0)
+ return ret;
+
+ ret = do_scan(scan, right);
+ if (ret != 0)
+ return ret;
+
+ /* Before rapid scanning, we need a lock on scan->node so that we can
+ get its parent, only if formatted. */
+ if (jnode_is_znode(scan->node)) {
+ ret = longterm_lock_znode(&scan->node_lock, JZNODE(scan->node),
+ ZNODE_WRITE_LOCK, ZNODE_LOCK_LOPRI);
+ }
+
+ /* Rapid_scan would go here (with limit set to FLUSH_RELOCATE_THRESHOLD)
+ */
+ return ret;
+}
+
+/* Performs rightward scanning... Does not count the starting node. The limit
+ parameter is described in scan_left. If the starting node is unformatted then
+ the parent_coord was already set during scan_left. The rapid_after parameter
+ is not used during right-scanning.
+
+ scan_right is only called if the scan_left operation does not count at least
+   FLUSH_RELOCATE_THRESHOLD nodes for flushing. In that case, the limit
+   parameter is set to the difference between scan-left's count and
+ FLUSH_RELOCATE_THRESHOLD, meaning scan-right counts as high as
+ FLUSH_RELOCATE_THRESHOLD and then stops
+*/
+static int scan_right(flush_scan *scan, jnode *node)
+{
+ int ret;
+
+ scan->direction = RIGHT_SIDE;
+
+ ret = move_scan_pos(scan, jref(node), 0, NULL);
+ if (ret != 0)
+ return ret;
+
+ return do_scan(scan, NULL);
+}
+
+/**
+ * Perform scan in a given direction
+ */
+static int do_scan(flush_scan *scan, flush_scan *other)
+{
+ int ret;
+
+ assert("nikita-2376", scan->node != NULL);
+ assert("edward-54",
+ jnode_is_unformatted(scan->node) || jnode_is_znode(scan->node));
+ /*
+ * Special case for starting at an unformatted node. Optimization: we
+ * only want to search for the parent (which requires a tree traversal)
+ * once. Obviously, we shouldn't have to call it once for the left scan
+ * and once for the right scan. For this reason, if we search for the
+ * parent during scan-left we then duplicate the coord/lock/load into
+ * the scan-right object
+ */
+ if (jnode_is_unformatted(scan->node)) {
+ ret = lock_parent_and_scan_upper_level(scan, other);
+ if (ret != 0)
+ return ret;
+ }
+ /*
+ * scan formatted nodes starting at current position
+ */
+ while (!reiser4_scan_finished(scan)) {
+
+ ret = scan_formatted(scan);
+ if (ret != 0)
+ return ret;
+ }
+ return 0;
+}
+
+/*
+ * Set up parent coord (if needed), jump one level up
+ * and scan formatted nodes on the upper level
+ */
+static int lock_parent_and_scan_upper_level(flush_scan *scan, flush_scan *other)
+{
+ int ret = 0;
+ int try = 0;
+
+ if (!coord_is_invalid(&scan->parent_coord))
+ /*
+ * parent has been set already by
+ * previous scan session (scan_left)
+ */
+ goto scan;
+ /*
+ * set parent coord
+ */
+ if (!jnode_is_unformatted(scan->node)) {
+ /* formatted position */
+
+ lock_handle lock;
+ assert("edward-301", jnode_is_znode(scan->node));
+ init_lh(&lock);
+
+ /*
+		 * when flush starts from an unformatted node, the first thing
+		 * it does is a tree traversal to find the formatted parent of
+		 * the starting node. This parent is then kept locked across
+		 * the scans to the left and to the right. This means that
+		 * during the scan to the left we cannot take a leftward lock,
+		 * because this is deadlock-prone. So, if we are scanning to
+		 * the left and there is already a lock held by this thread,
+		 * jnode_lock_parent_coord() should use try-lock.
+ */
+ try = reiser4_scanning_left(scan)
+ && !lock_stack_isclean(get_current_lock_stack());
+		/* Need the node locked to get the parent lock. We have to
+		   take a write lock since there is at least one call path
+ where this znode is already write-locked by us. */
+ ret =
+ longterm_lock_znode(&lock, JZNODE(scan->node),
+ ZNODE_WRITE_LOCK,
+ reiser4_scanning_left(scan) ?
+ ZNODE_LOCK_LOPRI :
+ ZNODE_LOCK_HIPRI);
+ if (ret != 0)
+ /* EINVAL or E_DEADLOCK here mean... try again! At this
+ point we've scanned too far and can't back out, just
+ start over. */
+ return ret;
+
+ ret = jnode_lock_parent_coord(scan->node,
+ &scan->parent_coord,
+ &scan->parent_lock,
+ &scan->parent_load,
+ ZNODE_WRITE_LOCK, try);
+
+ /* FIXME(C): check EINVAL, E_DEADLOCK */
+ done_lh(&lock);
+ if (ret == -E_REPEAT) {
+ scan->stop = 1;
+ return 0;
+ }
+ if (ret)
+ return ret;
+
+ } else {
+ /* unformatted position */
+
+ ret =
+ jnode_lock_parent_coord(scan->node, &scan->parent_coord,
+ &scan->parent_lock,
+ &scan->parent_load,
+ ZNODE_WRITE_LOCK, try);
+
+ if (IS_CBKERR(ret))
+ return ret;
+
+ if (ret == CBK_COORD_NOTFOUND)
+ /* FIXME(C): check EINVAL, E_DEADLOCK */
+ return ret;
+
+ /* parent was found */
+ assert("jmacd-8661", other != NULL);
+ /* Duplicate the reference into the other flush_scan. */
+ coord_dup(&other->parent_coord, &scan->parent_coord);
+ copy_lh(&other->parent_lock, &scan->parent_lock);
+ copy_load_count(&other->parent_load, &scan->parent_load);
+ }
+scan:
+ /*
+ * proceed with scanning formatted nodes on the upper level
+ */
+ return scan_by_coord(scan);
+}
+
+/* Performs left- or rightward scanning starting from a formatted node. Follow
+ left pointers under tree lock as long as:
+
+ - node->left/right is non-NULL
+ - node->left/right is connected, dirty
+ - node->left/right belongs to the same atom
+ - scan has not reached maximum count
+*/
+static int scan_formatted(flush_scan * scan)
+{
+ int ret;
+ znode *neighbor = NULL;
+
+ assert("jmacd-1401", !reiser4_scan_finished(scan));
+
+ do {
+ znode *node = JZNODE(scan->node);
+ /*
+ * node should be connected, but if not stop the scan
+ */
+ if (!znode_is_connected(node)) {
+ scan->stop = 1;
+ break;
+ }
+		/* Lock the tree, check for and reference the next sibling. */
+ read_lock_tree();
+
+ /* It may be that a node is inserted or removed between a node
+ and its left sibling while the tree lock is released, but the
+ flush-scan count does not need to be precise. Thus, we
+ release the tree lock as soon as we get the neighboring node.
+ */
+ neighbor =
+ reiser4_scanning_left(scan) ? node->left : node->right;
+ if (neighbor != NULL)
+ zref(neighbor);
+
+ read_unlock_tree();
+ /*
+ * If neighbor is NULL at the leaf level, need to check for an
+ * unformatted sibling using the parent--break in any case
+ */
+ if (neighbor == NULL)
+ break;
+ /*
+ * Check the condition for going left, break if it is not met.
+ * This also releases (jputs) the neighbor if false
+ */
+ if (!reiser4_scan_goto(scan, ZJNODE(neighbor)))
+ break;
+ /*
+ * Advance the flush_scan state to the left, repeat
+ */
+ ret = move_scan_pos(scan, ZJNODE(neighbor), 1, NULL);
+ if (ret != 0)
+ return ret;
+
+ } while (!reiser4_scan_finished(scan));
+ /*
+	 * If neighbor is NULL we either reached the end of a formatted region
+	 * or the sibling is not in memory; now check for an extent to the
+	 * left (only when at LEAF_LEVEL)
+ */
+ if (neighbor != NULL ||
+ jnode_get_level(scan->node) != LEAF_LEVEL ||
+ reiser4_scan_finished(scan)) {
+
+ scan->stop = 1;
+ return 0;
+ }
+ /*
+ * otherwise, calls scan_by_coord for the right(left)most item of the
+ * left(right) neighbor on the parent level, then possibly continue
+ */
+ coord_init_invalid(&scan->parent_coord, NULL);
+ return lock_parent_and_scan_upper_level(scan, NULL);
+}
+
+/**
+ * This scans adjacent items of the same type and calls the scan flush plugin
+ * for each one. Performs left(right)ward scanning starting from a (possibly)
+ * unformatted node. If we start from an unformatted node, then we continue
+ * only if the next neighbor is also unformatted. When called from
+ * scan_formatted, we skip the first iteration (to make sure that the
+ * right(left)most item of the left(right) neighbor on the parent level is of
+ * the same type and set the appropriate coord)
+ */
+static int scan_by_coord(flush_scan * scan)
+{
+ int ret = 0;
+ int scan_this_coord;
+ lock_handle next_lock;
+ load_count next_load;
+ coord_t next_coord;
+ jnode *child;
+ item_plugin *iplug;
+
+ init_lh(&next_lock);
+ init_load_count(&next_load);
+ scan_this_coord = (jnode_is_unformatted(scan->node) ? 1 : 0);
+
+ /* set initial item id */
+ iplug = item_plugin_by_coord(&scan->parent_coord);
+
+ for (; !reiser4_scan_finished(scan); scan_this_coord = 1) {
+ if (scan_this_coord) {
+ /*
+			 * Here we expect that the unit is scannable.
+			 * It might not be, due to a race with extent->tail
+			 * conversion
+ */
+ if (iplug->f.scan == NULL) {
+ scan->stop = 1;
+ ret = -E_REPEAT;
+ /* skip the check at the end. */
+ goto race;
+ }
+ ret = iplug->f.scan(scan);
+ if (ret != 0)
+ goto exit;
+
+ if (reiser4_scan_finished(scan)) {
+ checkchild(scan);
+ break;
+ }
+ } else {
+ /*
+ * the same race against truncate as above is possible
+ * here, it seems.
+ *
+ * NOTE-JMACD: In this case, apply the same end-of-node
+ * logic but don't scan the first coordinate
+ */
+ assert("jmacd-1231",
+ item_is_internal(&scan->parent_coord));
+ }
+ if (iplug->f.utmost_child == NULL ||
+ znode_get_level(scan->parent_coord.node) != TWIG_LEVEL) {
+ /*
+			 * stop at this coord and continue on the parent level
+ * (see the function do_scan)
+ */
+ ret = move_scan_pos(scan,
+ ZJNODE(zref(scan->parent_coord.node)),
+ 1, NULL);
+ if (ret != 0)
+ goto exit;
+ break;
+ }
+ /*
+ * Either way, the invariant is that scan->parent_coord is set
+ * to the parent of scan->node. Now get the next unit
+ */
+ coord_dup(&next_coord, &scan->parent_coord);
+ coord_sideof_unit(&next_coord, scan->direction);
+ /*
+ * If off-the-end of the twig, try the next twig
+ */
+ if (coord_is_after_sideof_unit(&next_coord, scan->direction)) {
+ /*
+ * We take the write lock because we may start
+ * flushing from this coordinate
+ */
+ ret = neighbor_in_slum(next_coord.node,
+ &next_lock,
+ scan->direction,
+ ZNODE_WRITE_LOCK,
+ 1 /* check dirty */,
+					       0 /* don't go through upper
+ levels */);
+ if (ret == -E_NO_NEIGHBOR) {
+ scan->stop = 1;
+ ret = 0;
+ break;
+ }
+ if (ret != 0)
+ goto exit;
+ ret = incr_load_count_znode(&next_load, next_lock.node);
+ if (ret != 0)
+ goto exit;
+ coord_init_sideof_unit(&next_coord, next_lock.node,
+ sideof_reverse(scan->direction));
+ }
+ iplug = item_plugin_by_coord(&next_coord);
+ /*
+ * Get the next child
+ */
+ ret = iplug->f.utmost_child(&next_coord,
+ sideof_reverse(scan->direction),
+ &child);
+ if (ret != 0)
+ goto exit;
+ /*
+		 * If the next child is not in memory, or item_utmost_child
+		 * failed (most probably due to a race with unlink), stop here
+ */
+ if (child == NULL || IS_ERR(child)) {
+ scan->stop = 1;
+ checkchild(scan);
+ break;
+ }
+ assert("nikita-2374",
+ jnode_is_unformatted(child) || jnode_is_znode(child));
+ /*
+ * See if it is dirty, part of the same atom
+ */
+ if (!reiser4_scan_goto(scan, child)) {
+ checkchild(scan);
+ break;
+ }
+ /*
+ * If so, make this child current
+ */
+ ret = move_scan_pos(scan, child, 1, &next_coord);
+ if (ret != 0)
+ goto exit;
+ /*
+ * Now continue.
+		 * If the child is formatted, break out: the parent lock is
+		 * released below and scanning proceeds with formatted nodes
+ */
+ if (jnode_is_znode(child))
+ break;
+ /*
+ * Otherwise, repeat the above loop with next_coord
+ */
+ if (next_load.node != NULL) {
+ done_lh(&scan->parent_lock);
+ move_lh(&scan->parent_lock, &next_lock);
+ move_load_count(&scan->parent_load, &next_load);
+ }
+ }
+ assert("jmacd-6233",
+ reiser4_scan_finished(scan) || jnode_is_znode(scan->node));
+ exit:
+ checkchild(scan);
+ race:
+ if (jnode_is_znode(scan->node)) {
+ done_lh(&scan->parent_lock);
+ done_load_count(&scan->parent_load);
+ }
+ done_load_count(&next_load);
+ done_lh(&next_lock);
+ return ret;
+}
+
+/* FLUSH POS HELPERS */
+
+/* Initialize the fields of a flush_position. */
+static void pos_init(flush_pos_t *pos)
+{
+ memset(pos, 0, sizeof *pos);
+
+ pos->state = POS_INVALID;
+ coord_init_invalid(&pos->coord, NULL);
+ init_lh(&pos->lock);
+ init_load_count(&pos->load);
+ /*
+	 * init the rb-tree of per-brick flush infos
+ */
+ pos->bricks_info = RB_ROOT;
+ /*
+ * populate the rb-tree with pre-allocated info
+ * for meta-data brick
+ */
+ init_fbi(&pos->mfbi, METADATA_SUBVOL_ID);
+ insert_fbi(&pos->bricks_info, &pos->mfbi);
+}
+
+/* The flush loop inside squalloc periodically checks pos_valid to determine
+ when "enough flushing" has been performed. This will return true until one
+ of the following conditions is met:
+
+ 1. the number of flush-queued nodes has reached the kernel-supplied
+ "int *nr_to_flush" parameter, meaning we have flushed as many blocks as the
+ kernel requested. When flushing to commit, this parameter is NULL.
+
+ 2. pos_stop() is called because squalloc discovers that the "next" node in
+   the flush order is either non-existent, not dirty, or not in the same atom.
+*/
+
+static int pos_valid(flush_pos_t *pos)
+{
+ return pos->state != POS_INVALID;
+}
+
+/* Release any resources of a flush_position. Called when jnode_flush
+ finishes. */
+static void pos_done(flush_pos_t *pos)
+{
+ pos_stop(pos);
+ if (convert_data(pos))
+ free_convert_data(pos);
+ done_all_fbi(&pos->bricks_info, &pos->mfbi);
+}
+
+/* Reset the point and parent. Called during flush subroutines to terminate the
+ squalloc loop. */
+static int pos_stop(flush_pos_t *pos)
+{
+ pos->state = POS_INVALID;
+ done_lh(&pos->lock);
+ done_load_count(&pos->load);
+ coord_init_invalid(&pos->coord, NULL);
+
+ if (pos->child) {
+ jput(pos->child);
+ pos->child = NULL;
+ }
+
+ return 0;
+}
+
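+/* return the flush queue attached to this flush position */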
+flush_queue_t *reiser4_pos_fq(flush_pos_t *pos)
+{
+ return pos->fq;
+}
+
+/********************** flush brick info ops ************************/
+
+int flush_init_static(void)
+{
+ assert("edward-2397", _fbi_slab == NULL);
+
+ _fbi_slab = kmem_cache_create("flush_brick_info",
+ sizeof(flush_brick_info), 0,
+ SLAB_HWCACHE_ALIGN |
+ SLAB_RECLAIM_ACCOUNT, NULL);
+ return _fbi_slab == NULL ? RETERR(-ENOMEM) : 0;
+}
+
+/**
+ * This is called on reiser4 module unloading or system shutdown.
+ */
+void done_flush_static(void)
+{
+ destroy_reiser4_cache(&_fbi_slab);
+}
+
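+/* allocate flush brick info from the dedicated slab cache */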
+struct flush_brick_info *alloc_fbi(void)
+{
+ return kmem_cache_alloc(_fbi_slab, reiser4_ctx_gfp_mask_get());
+}
+
+static void init_fbi(flush_brick_info *fbi, u32 subv_id)
+{
+ memset(fbi, 0, sizeof(*fbi));
+ RB_CLEAR_NODE(&fbi->node);
+ fbi->brick_id = subv_id;
+}
+
+static void free_fbi(struct flush_brick_info *fbi)
+{
+ kmem_cache_free(_fbi_slab, fbi);
+}
+
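+/**
+ * Look up the flush brick info keyed by @brick_id in rb-tree @root.
+ * Return NULL if no info for that brick has been inserted yet.
+ */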
+static struct flush_brick_info *find_fbi(const struct rb_root *root,
+ u32 brick_id)
+{
+ struct rb_node *node = root->rb_node;
+
+ while (node) {
+ struct flush_brick_info *fbi =
+ rb_entry(node, struct flush_brick_info, node);
+
+ if (fbi->brick_id > brick_id)
+ node = node->rb_left;
+ else if (fbi->brick_id < brick_id)
+ node = node->rb_right;
+ else
+ return fbi;
+ }
+ return NULL;
+}
+
+/**
+ * Insert item @this into rb-tree @root.
+ * The key of @this must not already be present (BUG_ON otherwise).
+ */
+static void insert_fbi(struct rb_root *root, struct flush_brick_info *this)
+{
+ struct rb_node *parent = NULL;
+ struct rb_node **pos = &(root->rb_node);
+
+ while (*pos) {
+ struct flush_brick_info *fbi;
+
+ fbi = rb_entry(*pos, struct flush_brick_info, node);
+ parent = *pos;
+
+ if (this->brick_id < fbi->brick_id)
+ pos = &((*pos)->rb_left);
+ else if (this->brick_id > fbi->brick_id)
+ pos = &((*pos)->rb_right);
+ else
+ BUG_ON(1);
+ }
+ rb_link_node(&this->node, parent, pos);
+ rb_insert_color(&this->node, root);
+}
+
+/**
+ * On success, return a pointer to the flush brick info
+ * (existing or newly allocated).
+ * @mfbi: pre-allocated info for meta-data brick
+ */
+static flush_brick_info *grab_fbi(struct rb_root *infos,
+ struct flush_brick_info *mfbi, u32 brick_id)
+{
+ struct flush_brick_info *fbi;
+
+ if (brick_id == METADATA_SUBVOL_ID)
+ /*
+ * It is known to be preallocated
+ */
+ return mfbi;
+ fbi = find_fbi(infos, brick_id);
+ if (fbi)
+ return fbi;
+ /*
+ * Insert a new item to the rb-tree
+ */
+ fbi = alloc_fbi();
+ if (fbi) {
+ init_fbi(fbi, brick_id);
+ insert_fbi(infos, fbi);
+ }
+ return fbi;
+}
+
+static void done_all_fbi(struct rb_root *infos, struct flush_brick_info *mfbi)
+{
+ /*
+ * remove pre-allocated info
+ */
+ rb_erase(&mfbi->node, infos);
+ RB_CLEAR_NODE(&mfbi->node);
+
+ while (!RB_EMPTY_ROOT(infos)) {
+ struct rb_node *node;
+ struct flush_brick_info *fbi;
+
+ node = rb_first(infos);
+ fbi = rb_entry(node, struct flush_brick_info, node);
+
+ rb_erase(&fbi->node, infos);
+ RB_CLEAR_NODE(&fbi->node);
+ free_fbi(fbi);
+ }
+}
+
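+/**
+ * Return the block allocation hint (preceder) kept in the flush brick
+ * info of brick @subv_id. If that info cannot be allocated, fall back
+ * to the caller-supplied emergency hint @ehint, which is zeroed out.
+ */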
+reiser4_blocknr_hint *flush_pos_get_hint(flush_pos_t *pos, u32 subv_id,
+ reiser4_blocknr_hint *ehint)
+{
+ flush_brick_info *fbi = NULL;
+
+ fbi = grab_fbi(&pos->bricks_info, &pos->mfbi, subv_id);
+ if (likely(fbi != NULL))
+ return &fbi->preceder;
+ else {
+ /* use emergency hint */
+ memset(ehint, 0, sizeof(*ehint));
+ return ehint;
+ }
+}
+
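+/**
+ * Remember block number @blk as the new preceder of brick @subv_id
+ * in the per-brick flush info of @pos. Does nothing if the info
+ * cannot be allocated.
+ */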
+void flush_pos_update_preceder(flush_pos_t *pos, u32 subv_id,
+ reiser4_block_nr blk)
+{
+ flush_brick_info *fbi;
+
+ fbi = grab_fbi(&pos->bricks_info, &pos->mfbi, subv_id);
+ if (!fbi)
+ return;
+ fbi_update_preceder(fbi, blk);
+}
+
+/* Make Linus happy.
+ Local variables:
+ c-indentation-style: "K&R"
+ mode-name: "LC"
+ c-basic-offset: 8
+ tab-width: 8
+ fill-column: 90
+ LocalWords: preceder
+ End:
+*/
diff -urN --no-dereference linux-5.10.2.orig/fs/reiser4/flush.h linux-5.10.2/fs/reiser4/flush.h
--- linux-5.10.2.orig/fs/reiser4/flush.h 1970-01-01 01:00:00.000000000 +0100
+++ linux-5.10.2/fs/reiser4/flush.h 2020-12-23 16:07:46.117813114 +0100
@@ -0,0 +1,326 @@
+/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
+
+/* DECLARATIONS: */
+
+#if !defined(__REISER4_FLUSH_H__)
+#define __REISER4_FLUSH_H__
+
+#include "plugin/cluster.h"
+#include "plugin/volume/volume.h"
+
+struct flush_brick_info {
+ struct rb_node node;
+ u32 brick_id; /* key */
+
+ /* scan info */
+ int count; /* Number of scanned nodes which belong to this brick.
+ This is used to make relocation decisions */
+ /* squalloc info */
+ reiser4_blocknr_hint preceder; /* The flush 'hint' state */
+};
+
+/* The flush_scan data structure maintains the state of an in-progress
+ flush-scan on a single level of the tree. A flush-scan is used for counting
+ the number of adjacent nodes to flush, which is used to determine whether we
+ should relocate, and it is also used to find a starting point for flush. A
+ flush-scan object can scan in both right and left directions via the
+ scan_left() and scan_right() interfaces. The right- and left-variations are
+ similar but perform different functions. When scanning left we (optionally
+ perform rapid scanning and then) longterm-lock the endpoint node. When
+ scanning right we are simply counting the number of adjacent, dirty nodes. */
+struct flush_scan {
+ struct rb_root *bricks_info;
+ flush_brick_info *mfbi; /* pre-loaded info for meta-data brick */
+ /* The following two fields are used to terminate scan */
+ int count; /* total number of nodes scanned on this level */
+ int max_count; /* maximal total number of nodes to scan on any
+			* single level. When going leftward, both
+ * counts are restricted by FLUSH_SCAN_MAXNODES */
+ /*
+ * One of the sideof enumeration: {LEFT_SIDE, RIGHT_SIDE}
+ */
+ sideof direction;
+ /*
+ * Initially @stop is set to false then set true once some condition
+ * stops the search (e.g., we found a clean node before reaching
+ * max_count or we found a node belonging to another atom)
+ */
+ int stop;
+
+ /* The current scan position. If @node is non-NULL then its reference
+ count has been incremented to reflect this reference. */
+ jnode *node;
+
+ /* A handle for zload/zrelse of current scan position node. */
+ load_count node_load;
+
+ /* During left-scan, if the final position (a.k.a. endpoint node) is
+ formatted the node is locked using this lock handle. The endpoint
+ needs to be locked for transfer to the flush_position object after
+ scanning finishes. */
+ lock_handle node_lock;
+
+ /* When the position is unformatted, its parent, coordinate, and parent
+ zload/zrelse handle. */
+ lock_handle parent_lock;
+ coord_t parent_coord;
+ load_count parent_load;
+};
+
+struct convert_item_info {
+ dc_item_stat d_cur; /* per-cluster status of the current item */
+ dc_item_stat d_next; /* per-cluster status of the first item on
+ the right neighbor */
+ int cluster_shift; /* disk cluster shift */
+ flow_t flow; /* disk cluster data */
+};
+
+struct convert_info {
+ int count; /* for squalloc terminating */
+ item_plugin *iplug; /* current item plugin */
+ struct convert_item_info *itm; /* current item info */
+ struct cluster_handle clust; /* transform cluster */
+ lock_handle right_lock; /* lock handle of the right neighbor */
+ int right_locked;
+};
+
+typedef enum flush_position_state {
+ POS_INVALID, /* Invalid or stopped pos, do not continue slum
+ * processing */
+ POS_ON_LEAF, /* pos points to already prepped, locked
+ * formatted node at leaf level */
+ POS_ON_EPOINT, /* pos keeps a lock on twig level, "coord" field
+ * is used to traverse unformatted nodes */
+ POS_TO_LEAF, /* pos is being moved to leaf level */
+ POS_TO_TWIG, /* pos is being moved to twig level */
+	POS_END_OF_TWIG,	/* special case of POS_ON_EPOINT, when coord is
+ * after rightmost unit of the current twig */
+ POS_ON_INTERNAL /* same as POS_ON_LEAF, but points to internal
+ * node */
+} flushpos_state_t;
+
+/* An encapsulation of the current flush point and all the parameters that are
+ passed through the entire squeeze-and-allocate stage of the flush routine.
+ A single flush_position object is constructed after left- and right-scanning
+ finishes. */
+struct flush_position {
+ struct rb_root bricks_info;
+ struct flush_brick_info mfbi; /* pre-allocated info for meta-data brick */
+
+ flushpos_state_t state;
+ coord_t coord; /* coord to traverse unformatted nodes */
+ lock_handle lock; /* current lock we hold */
+ load_count load; /* load status for current locked formatted node
+ */
+ jnode *child; /* for passing a reference to unformatted child
+ * across pos state changes */
+
+ int alloc_cnt; /* The number of nodes allocated during squeeze
+				   and allocate. */
+ int prep_or_free_cnt; /* The number of nodes prepared for write
+ (allocate) or squeezed and freed. */
+ flush_queue_t *fq;
+ long *nr_written; /* number of nodes submitted to disk */
+ int flags; /* a copy of jnode_flush flags argument */
+
+ znode *prev_twig; /* previous parent pointer value, used to catch
+ * processing of new twig node */
+ struct convert_info *sq; /* convert info */
+
+ unsigned long pos_in_unit; /* for extents only. Position
+ within an extent unit of first
+ jnode of slum */
+ long nr_to_write; /* number of unformatted nodes to handle on
+ flush */
+};
+
+static inline int item_convert_count(flush_pos_t *pos)
+{
+ return pos->sq->count;
+}
+static inline void inc_item_convert_count(flush_pos_t *pos)
+{
+ pos->sq->count++;
+}
+static inline void set_item_convert_count(flush_pos_t *pos, int count)
+{
+ pos->sq->count = count;
+}
+static inline item_plugin *item_convert_plug(flush_pos_t *pos)
+{
+ return pos->sq->iplug;
+}
+
+static inline struct convert_info *convert_data(flush_pos_t *pos)
+{
+ return pos->sq;
+}
+
+static inline struct convert_item_info *item_convert_data(flush_pos_t *pos)
+{
+ assert("edward-955", convert_data(pos));
+ return pos->sq->itm;
+}
+
+static inline struct tfm_cluster *tfm_cluster_sq(flush_pos_t *pos)
+{
+ return &pos->sq->clust.tc;
+}
+
+static inline struct tfm_stream *tfm_stream_sq(flush_pos_t *pos,
+ tfm_stream_id id)
+{
+ assert("edward-854", pos->sq != NULL);
+ return get_tfm_stream(tfm_cluster_sq(pos), id);
+}
+
+static inline int convert_data_attached(flush_pos_t *pos)
+{
+ return convert_data(pos) != NULL && item_convert_data(pos) != NULL;
+}
+
+#define should_convert_right_neighbor(pos) convert_data_attached(pos)
+
+/* Returns true if next node contains next item of the disk cluster
+ so item convert data should be moved to the right slum neighbor.
+*/
+static inline int next_node_is_chained(flush_pos_t *pos)
+{
+ return convert_data_attached(pos) &&
+ item_convert_data(pos)->d_next == DC_CHAINED_ITEM;
+}
+
+/*
+ * Update "twin state" (d_cur, d_next) to assign a proper
+ * conversion mode in the next iteration of convert_node()
+ */
+static inline void update_chaining_state(flush_pos_t *pos,
+ int this_node /* where to proceed */)
+{
+
+ assert("edward-1010", convert_data_attached(pos));
+
+ if (this_node) {
+ /*
+ * we want to perform one more iteration with the same item
+ */
+ assert("edward-1013",
+ item_convert_data(pos)->d_cur == DC_FIRST_ITEM ||
+ item_convert_data(pos)->d_cur == DC_CHAINED_ITEM);
+ assert("edward-1227",
+ item_convert_data(pos)->d_next == DC_AFTER_CLUSTER ||
+ item_convert_data(pos)->d_next == DC_INVALID_STATE);
+
+ item_convert_data(pos)->d_cur = DC_AFTER_CLUSTER;
+ item_convert_data(pos)->d_next = DC_INVALID_STATE;
+ }
+ else {
+ /*
+ * we want to proceed on right neighbor, which is chained
+ */
+ assert("edward-1011",
+ item_convert_data(pos)->d_cur == DC_FIRST_ITEM ||
+ item_convert_data(pos)->d_cur == DC_CHAINED_ITEM);
+ assert("edward-1012",
+ item_convert_data(pos)->d_next == DC_CHAINED_ITEM);
+
+ item_convert_data(pos)->d_cur = DC_CHAINED_ITEM;
+ item_convert_data(pos)->d_next = DC_INVALID_STATE;
+ }
+}
+
+#define SQUALLOC_THRESHOLD 256
+
+static inline int should_terminate_squalloc(flush_pos_t *pos)
+{
+ return convert_data(pos) &&
+ !item_convert_data(pos) &&
+ item_convert_count(pos) >= SQUALLOC_THRESHOLD;
+}
+
+/**
+ * Make a decision about block relocation in a brick
+ */
+static inline int __leaf_should_relocate(flush_brick_info *fbi)
+{
+ /*
+ * relocate leaf nodes if at least FLUSH_RELOCATE_THRESHOLD
+ * nodes were found by left and right scan
+ */
+ return fbi->count >=
+ current_origin(fbi->brick_id)->flush.relocate_threshold;
+}
+
+#if REISER4_DEBUG
+#define check_convert_info(pos) \
+do { \
+ if (unlikely(should_convert_right_neighbor(pos))) { \
+ warning("edward-1006", "unprocessed chained data"); \
+ printk("d_cur = %d, d_next = %d, flow.len = %llu\n", \
+ item_convert_data(pos)->d_cur, \
+ item_convert_data(pos)->d_next, \
+ item_convert_data(pos)->flow.length); \
+ } \
+} while (0)
+#else
+#define check_convert_info(pos)
+#endif /* REISER4_DEBUG */
+
+void free_convert_data(flush_pos_t *pos);
+/* used in extent.c */
+int move_scan_pos(flush_scan *scan, jnode *node, unsigned add_size,
+ const coord_t *parent);
+int reiser4_scan_finished(flush_scan * scan);
+int reiser4_scanning_left(flush_scan * scan);
+int reiser4_scan_goto(flush_scan * scan, jnode * tonode);
+txn_atom *atom_locked_by_fq(flush_queue_t *fq);
+int reiser4_alloc_extent(flush_pos_t *flush_pos);
+squeeze_result squalloc_extent(znode *left, const coord_t *, flush_pos_t *,
+ reiser4_key *stop_key);
+extern int reiser4_init_fqs(void);
+extern void reiser4_done_fqs(void);
+extern reiser4_blocknr_hint *flush_pos_get_hint(flush_pos_t *pos, u32 subv_id,
+ reiser4_blocknr_hint *ehint);
+extern int leaf_should_relocate(flush_pos_t *pos, u32 subv_id);
+extern void flush_pos_update_preceder(flush_pos_t *pos, u32 subv_id,
+ reiser4_block_nr blk);
+extern int flush_init_static(void);
+extern void done_flush_static(void);
+
+#if REISER4_DEBUG
+extern void reiser4_check_fq(const txn_atom *atom);
+extern atomic_t flush_cnt;
+
+#define check_preceder(blk, subv) \
+assert("nikita-2588", blk < reiser4_subvol_block_count(subv));
+extern void check_pos(flush_pos_t *pos);
+#else
+#define check_preceder(blk, subv) noop
+#define check_pos(pos) noop
+#endif
+
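+/*
+ * Store @blk as the new preceder hint of @fbi; the hint is reset to 0
+ * when the end of the device is reached.
+ */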
+static inline void fbi_update_preceder(flush_brick_info *fbi,
+ reiser4_block_nr blk)
+{
+ if (unlikely(blk == current_origin(fbi->brick_id)->block_count))
+ /*
+ * we reached end of device, reset preceder
+ */
+ blk = 0;
+ fbi->preceder.blk = blk;
+ check_preceder(blk, current_origin(fbi->brick_id));
+}
+
+/* __REISER4_FLUSH_H__ */
+#endif
+
+/* Make Linus happy.
+ Local variables:
+ c-indentation-style: "K&R"
+ mode-name: "LC"
+ c-basic-offset: 8
+ tab-width: 8
+ fill-column: 90
+ LocalWords: preceder
+ End:
+*/
diff -urN --no-dereference linux-5.10.2.orig/fs/reiser4/flush_queue.c linux-5.10.2/fs/reiser4/flush_queue.c
--- linux-5.10.2.orig/fs/reiser4/flush_queue.c 1970-01-01 01:00:00.000000000 +0100
+++ linux-5.10.2/fs/reiser4/flush_queue.c 2020-12-23 16:07:46.117813114 +0100
@@ -0,0 +1,735 @@
+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
+ reiser4/README */
+
+#include "debug.h"
+#include "super.h"
+#include "txnmgr.h"
+#include "jnode.h"
+#include "znode.h"
+#include "page_cache.h"
+#include "wander.h"
+#include "vfs_ops.h"
+#include "writeout.h"
+#include "flush.h"
+
+#include <linux/bio.h>
+#include <linux/mm.h>
+#include <linux/pagemap.h>
+#include <linux/blkdev.h>
+#include <linux/list_sort.h>
+#include <linux/writeback.h>
+
+/* A flush queue object is an accumulator for keeping jnodes prepared
+ by the jnode_flush() function for writing to disk. Those "queued" jnodes are
+ kept on the flush queue until memory pressure or atom commit asks
+   flush queues to write some or all of their jnodes. */
+
+/*
+ LOCKING:
+
+   fq->guard spin lock protects the fq->atom pointer and nothing else. The
+   fq->prepped list is protected by the atom spin lock and uses the following
+   locking:
+
+ two ways to protect fq->prepped list for read-only list traversal:
+
+ 1. atom spin-lock atom.
+ 2. fq is IN_USE, atom->nr_running_queues increased.
+
+ and one for list modification:
+
+ 1. atom is spin-locked and one condition is true: fq is IN_USE or
+ atom->nr_running_queues == 0.
+
+ The deadlock-safe order for flush queues and atoms is: first lock atom, then
+ lock flush queue, then lock jnode.
+*/
+
+#define fq_in_use(fq) ((fq)->state & FQ_IN_USE)
+#define fq_ready(fq) (!fq_in_use(fq))
+
+#define mark_fq_in_use(fq) do { (fq)->state |= FQ_IN_USE; } while (0)
+#define mark_fq_ready(fq) do { (fq)->state &= ~FQ_IN_USE; } while (0)
+
+/* get lock on atom from locked flush queue object */
+static txn_atom *atom_locked_by_fq_nolock(flush_queue_t *fq)
+{
+ /* This code is similar to jnode_get_atom(), look at it for the
+ * explanation. */
+ txn_atom *atom;
+
+ assert_spin_locked(&(fq->guard));
+
+ while (1) {
+ atom = fq->atom;
+ if (atom == NULL)
+ break;
+
+ if (spin_trylock_atom(atom))
+ break;
+
+ atomic_inc(&atom->refcount);
+ spin_unlock(&(fq->guard));
+ spin_lock_atom(atom);
+ spin_lock(&(fq->guard));
+
+ if (fq->atom == atom) {
+ atomic_dec(&atom->refcount);
+ break;
+ }
+
+ spin_unlock(&(fq->guard));
+ atom_dec_and_unlock(atom);
+ spin_lock(&(fq->guard));
+ }
+
+ return atom;
+}
+
+txn_atom *atom_locked_by_fq(flush_queue_t *fq)
+{
+ txn_atom *atom;
+
+ spin_lock(&(fq->guard));
+ atom = atom_locked_by_fq_nolock(fq);
+ spin_unlock(&(fq->guard));
+ return atom;
+}
+
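+/* initialize a newly allocated flush queue object */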
+static void init_fq(flush_queue_t *fq)
+{
+ memset(fq, 0, sizeof *fq);
+
+ atomic_set(&fq->nr_submitted, 0);
+
+ INIT_LIST_HEAD(ATOM_FQ_LIST(fq));
+
+ init_waitqueue_head(&fq->wait);
+ spin_lock_init(&fq->guard);
+}
+
+/* slab for flush queues */
+static struct kmem_cache *fq_slab;
+
+/**
+ * reiser4_init_fqs - create flush queue cache
+ *
+ * Initializes slab cache of flush queues. It is part of reiser4 module
+ * initialization.
+ */
+int reiser4_init_fqs(void)
+{
+ fq_slab = kmem_cache_create("fq",
+ sizeof(flush_queue_t),
+ 0, SLAB_HWCACHE_ALIGN, NULL);
+ if (fq_slab == NULL)
+ return RETERR(-ENOMEM);
+ return 0;
+}
+
+/**
+ * reiser4_done_fqs - delete flush queue cache
+ *
+ * This is called on reiser4 module unloading or system shutdown.
+ */
+void reiser4_done_fqs(void)
+{
+ destroy_reiser4_cache(&fq_slab);
+}
+
+/* create new flush queue object */
+static flush_queue_t *create_fq(gfp_t gfp)
+{
+ flush_queue_t *fq;
+
+ fq = kmem_cache_alloc(fq_slab, gfp);
+ if (fq)
+ init_fq(fq);
+
+ return fq;
+}
+
+/* adjust atom's and flush queue's counters of queued nodes */
+static void count_enqueued_node(flush_queue_t *fq)
+{
+ ON_DEBUG(fq->atom->num_queued++);
+}
+
+static void count_dequeued_node(flush_queue_t *fq)
+{
+ assert("zam-993", fq->atom->num_queued > 0);
+ ON_DEBUG(fq->atom->num_queued--);
+}
+
+/* attach flush queue object to the atom */
+static void attach_fq(txn_atom *atom, flush_queue_t *fq)
+{
+ assert_spin_locked(&(atom->alock));
+ list_add(&fq->alink, &atom->flush_queues);
+ fq->atom = atom;
+ ON_DEBUG(atom->nr_flush_queues++);
+}
+
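+/* detach flush queue object from its atom */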
+static void detach_fq(flush_queue_t *fq)
+{
+ assert_spin_locked(&(fq->atom->alock));
+
+ spin_lock(&(fq->guard));
+ list_del_init(&fq->alink);
+ assert("vs-1456", fq->atom->nr_flush_queues > 0);
+ ON_DEBUG(fq->atom->nr_flush_queues--);
+ fq->atom = NULL;
+ spin_unlock(&(fq->guard));
+}
+
+/* destroy flush queue object */
+static void done_fq(flush_queue_t *fq)
+{
+ assert("zam-763", list_empty_careful(ATOM_FQ_LIST(fq)));
+ assert("zam-766", atomic_read(&fq->nr_submitted) == 0);
+
+ kmem_cache_free(fq_slab, fq);
+}
+
+static void mark_jnode_queued(flush_queue_t *fq, jnode * node)
+{
+ JF_SET(node, JNODE_FLUSH_QUEUED);
+ count_enqueued_node(fq);
+}
+
+/**
+ * Move jnode to the flush queue.
+ * Both atom and jnode should be spin-locked
+ */
+void queue_jnode(flush_queue_t *fq, jnode *node)
+{
+ assert_spin_locked(&(node->guard));
+ assert("zam-713", node->atom != NULL);
+ assert_spin_locked(&(node->atom->alock));
+ assert("zam-716", fq->atom != NULL);
+ assert("zam-717", fq->atom == node->atom);
+ assert("zam-907", fq_in_use(fq));
+
+ assert("zam-714", JF_ISSET(node, JNODE_DIRTY));
+ assert("zam-826", JF_ISSET(node, JNODE_RELOC));
+ assert("vs-1481", !JF_ISSET(node, JNODE_FLUSH_QUEUED));
+ assert("vs-1481", NODE_LIST(node) != FQ_LIST);
+
+ assert("edward-2321", !reiser4_blocknr_is_fake(jnode_get_block(node)));
+
+ mark_jnode_queued(fq, node);
+ list_move_tail(&node->capture_link, ATOM_FQ_LIST(fq));
+
+ ON_DEBUG(count_jnode(node->atom, node, NODE_LIST(node),
+ FQ_LIST, 1));
+}
+
+/**
+ * Repeatable process for waiting for I/O completion on a flush queue object
+ */
+static int wait_io(flush_queue_t *fq, int *nr_io_errors)
+{
+ assert("zam-738", fq->atom != NULL);
+ assert_spin_locked(&(fq->atom->alock));
+ assert("zam-736", fq_in_use(fq));
+ assert("zam-911", list_empty_careful(ATOM_FQ_LIST(fq)));
+
+ if (atomic_read(&fq->nr_submitted) != 0) {
+ struct super_block *super;
+
+ spin_unlock_atom(fq->atom);
+
+ assert("nikita-3013", reiser4_schedulable());
+
+ super = reiser4_get_current_sb();
+
+ //blk_run_queues();
+ //blk_flush_plug(current);
+
+ if (!sb_rdonly(super))
+ wait_event(fq->wait,
+ atomic_read(&fq->nr_submitted) == 0);
+ /*
+ * Ask the caller to re-acquire the locks and call this
+ * function again. Note: this technique is commonly used
+ * in the txnmgr code
+ */
+ return -E_REPEAT;
+ }
+ *nr_io_errors += atomic_read(&fq->nr_errors);
+ return 0;
+}
+
+/**
+ * Wait for I/O completion on @fq, then detach and destroy it
+ */
+static int finish_fq(flush_queue_t *fq, int *nr_io_errors)
+{
+ int ret;
+ txn_atom *atom = fq->atom;
+
+ assert("zam-801", atom != NULL);
+ assert_spin_locked(&(atom->alock));
+ assert("zam-762", fq_in_use(fq));
+
+ ret = wait_io(fq, nr_io_errors);
+ if (ret)
+ return ret;
+
+ detach_fq(fq);
+ done_fq(fq);
+
+ reiser4_atom_send_event(atom);
+
+ return 0;
+}
+
+/**
+ * Wait for all IOs for given atom to be completed.
+ * Actually do one iteration on that and return -E_REPEAT,
+ * if there more iterations needed
+ */
+static int finish_all_fq(txn_atom * atom, int *nr_io_errors)
+{
+ flush_queue_t *fq;
+
+ assert_spin_locked(&(atom->alock));
+
+ if (list_empty_careful(&atom->flush_queues))
+ return 0;
+
+ list_for_each_entry(fq, &atom->flush_queues, alink) {
+ if (fq_ready(fq)) {
+ int ret;
+
+ mark_fq_in_use(fq);
+ assert("vs-1247", fq->owner == NULL);
+ ON_DEBUG(fq->owner = current);
+ ret = finish_fq(fq, nr_io_errors);
+
+ if (*nr_io_errors)
+ reiser4_handle_error();
+
+ if (ret) {
+ reiser4_fq_put(fq);
+ return ret;
+ }
+ spin_unlock_atom(atom);
+
+ return -E_REPEAT;
+ }
+ }
+ /*
+ * All flush queues are in use; atom remains locked
+ */
+ return -EBUSY;
+}
+
+/**
+ * Wait for all I/O for the current atom
+ */
+int current_atom_finish_all_fq(void)
+{
+ txn_atom *atom;
+ int nr_io_errors = 0;
+ int ret = 0;
+
+ do {
+ while (1) {
+ atom = get_current_atom_locked();
+ ret = finish_all_fq(atom, &nr_io_errors);
+ if (ret != -EBUSY)
+ break;
+ reiser4_atom_wait_event(atom);
+ }
+ } while (ret == -E_REPEAT);
+ /*
+ * we do not need locked atom after this function finishes,
+ * SUCCESS or -EBUSY are two return codes when atom remains
+ * locked after finish_all_fq
+ */
+ if (!ret)
+ spin_unlock_atom(atom);
+
+ assert_spin_not_locked(&(atom->alock));
+
+ if (ret)
+ return ret;
+
+ if (nr_io_errors)
+ return RETERR(-EIO);
+
+ return 0;
+}
+
+/**
+ * Change the node->atom field for all jnodes on the given list
+ */
+static void scan_fq_and_update_atom_ref(struct list_head *list,
+ txn_atom *atom)
+{
+ jnode *cur;
+
+ list_for_each_entry(cur, list, capture_link) {
+ spin_lock_jnode(cur);
+ cur->atom = atom;
+ spin_unlock_jnode(cur);
+ }
+}
+
+/**
+ * Support for atom fusion operation
+ */
+void reiser4_fuse_fq(txn_atom *to, txn_atom *from)
+{
+ flush_queue_t *fq;
+
+ assert_spin_locked(&(to->alock));
+ assert_spin_locked(&(from->alock));
+
+ list_for_each_entry(fq, &from->flush_queues, alink) {
+ scan_fq_and_update_atom_ref(ATOM_FQ_LIST(fq), to);
+ spin_lock(&(fq->guard));
+ fq->atom = to;
+ spin_unlock(&(fq->guard));
+ }
+
+ list_splice_init(&from->flush_queues, to->flush_queues.prev);
+
+#if REISER4_DEBUG
+ to->num_queued += from->num_queued;
+ to->nr_flush_queues += from->nr_flush_queues;
+ from->nr_flush_queues = 0;
+#endif
+}
+
+#if REISER4_DEBUG
+int atom_fq_parts_are_clean(txn_atom * atom)
+{
+ assert("zam-915", atom != NULL);
+ return list_empty_careful(&atom->flush_queues);
+}
+#endif
+
+/**
+ * Bio i/o completion routine for reiser4 write operations
+ */
+static void end_io_handler(struct bio *bio)
+{
+ int nr = 0;
+ int nr_errors = 0;
+ flush_queue_t *fq;
+ struct bio_vec *bvec;
+ struct bvec_iter_all iter_all;
+
+ assert("zam-958", bio_op(bio) == WRITE);
+ /*
+	 * We expect that bio->bi_private is set to NULL, or
+	 * to an fq object which is used for synchronization
+	 * and error counting
+ */
+ fq = bio->bi_private;
+ /*
+ * Check all elements of io_vec for correct write completion
+ */
+ bio_for_each_segment_all(bvec, bio, iter_all) {
+ struct page *pg = bvec->bv_page;
+
+ if (bio->bi_status) {
+ SetPageError(pg);
+ nr_errors++;
+ }
+
+ {
+ /* jnode WRITEBACK ("write is in progress bit") is
+ * atomically cleared here. */
+ jnode *node;
+
+ assert("zam-736", pg != NULL);
+ assert("zam-736", PagePrivate(pg));
+ node = jprivate(pg);
+
+ JF_CLR(node, JNODE_WRITEBACK);
+ }
+		nr++;
+ end_page_writeback(pg);
+ put_page(pg);
+ }
+ if (fq) {
+ /*
+ * count i/o error in fq object
+ */
+ atomic_add(nr_errors, &fq->nr_errors);
+ /*
+ * If all write requests registered in this "fq" are done
+		 * we wake up the waiter
+ */
+ if (atomic_sub_and_test(nr, &fq->nr_submitted))
+ wake_up(&fq->wait);
+ }
+ bio_put(bio);
+}
+
+/**
+ * Associate @bio with flush queue @fq and account the I/O requests
+ * which @bio will submit in @fq's nr_submitted counter
+ */
+void add_fq_to_bio(flush_queue_t *fq, struct bio *bio)
+{
+ bio->bi_private = fq;
+ bio->bi_end_io = end_io_handler;
+
+ if (fq)
+ atomic_add(bio->bi_iter.bi_size >> PAGE_SHIFT,
+ &fq->nr_submitted);
+}
+
+/**
+ * Move all queued nodes off the @fq->prepped list back to the atom's lists
+ */
+static void release_prepped_list(flush_queue_t *fq)
+{
+ txn_atom *atom;
+
+ assert("zam-904", fq_in_use(fq));
+ atom = atom_locked_by_fq(fq);
+
+ while (!list_empty(ATOM_FQ_LIST(fq))) {
+ jnode *cur;
+
+ cur = list_entry(ATOM_FQ_LIST(fq)->next, jnode, capture_link);
+ list_del_init(&cur->capture_link);
+
+ count_dequeued_node(fq);
+ spin_lock_jnode(cur);
+ assert("nikita-3154", !JF_ISSET(cur, JNODE_OVRWR));
+ assert("nikita-3154", JF_ISSET(cur, JNODE_RELOC));
+ assert("nikita-3154", JF_ISSET(cur, JNODE_FLUSH_QUEUED));
+ JF_CLR(cur, JNODE_FLUSH_QUEUED);
+
+ if (JF_ISSET(cur, JNODE_DIRTY)) {
+ list_add_tail(&cur->capture_link,
+ ATOM_DIRTY_LIST(atom,
+ jnode_get_level(cur)));
+ ON_DEBUG(count_jnode(atom, cur, FQ_LIST,
+ DIRTY_LIST, 1));
+ } else {
+ list_add_tail(&cur->capture_link,
+ ATOM_CLEAN_LIST(atom));
+ ON_DEBUG(count_jnode(atom, cur, FQ_LIST,
+ CLEAN_LIST, 1));
+ }
+
+ spin_unlock_jnode(cur);
+ }
+
+ if (--atom->nr_running_queues == 0)
+ reiser4_atom_send_event(atom);
+
+ spin_unlock_atom(atom);
+}
+
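+/*
+ * Comparator for list_sort(): order queued jnodes by brick (subvolume)
+ * id first, then by block number.
+ */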
+static int fq_compare_jnode(void* priv UNUSED_ARG,
+ struct list_head *a, struct list_head *b)
+{
+ jnode *ja, *jb;
+
+ assert("edward-1873", a != NULL);
+ assert("edward-1874", b != NULL);
+
+ ja = jnode_by_link(a);
+ jb = jnode_by_link(b);
+
+ if (jnode_get_subvol(ja)->id < jnode_get_subvol(jb)->id)
+ return -1;
+ if (jnode_get_subvol(ja)->id > jnode_get_subvol(jb)->id)
+ return 1;
+ if (jnode_get_block(ja) < jnode_get_block(jb))
+ return -1;
+ return 1;
+}
+
+/**
+ * Submit write requests for nodes on the already filled flush queue @fq.
+ *
+ * @fq: flush queue object which contains jnodes we can (and will) write.
+ * @return: number of submitted blocks (>=0) on success, otherwise an
+ * error code (<0)
+ */
+int reiser4_write_fq(flush_queue_t *fq, long *nr_submitted, int flags)
+{
+ int ret;
+ txn_atom *atom;
+
+ while (1) {
+ atom = atom_locked_by_fq(fq);
+ assert("zam-924", atom);
+ /*
+ * do not write fq in parallel
+ */
+ if (atom->nr_running_queues == 0 ||
+ !(flags & WRITEOUT_SINGLE_STREAM))
+ break;
+ reiser4_atom_wait_event(atom);
+ }
+ atom->nr_running_queues++;
+ spin_unlock_atom(atom);
+
+ list_sort(NULL, ATOM_FQ_LIST(fq), fq_compare_jnode);
+ ret = write_jnode_list(ATOM_FQ_LIST(fq), fq, nr_submitted, flags);
+ release_prepped_list(fq);
+ return ret;
+}
+
+/**
+ * Get a flush queue object for exclusive use by one thread. This may require
+ * several iterations, which is indicated by the -E_REPEAT return code.
+ *
+ * This function does not contain code for obtaining an atom lock because an
+ * atom lock is obtained in different ways in different parts of reiser4;
+ * usually it is the current atom, but we also need to be able to get an fq
+ * for the atom of a given jnode.
+ */
+static int fq_by_atom_gfp(txn_atom *atom, flush_queue_t **new_fq, gfp_t gfp)
+{
+ flush_queue_t *fq;
+
+ assert_spin_locked(&(atom->alock));
+
+ fq = list_entry(atom->flush_queues.next, flush_queue_t, alink);
+ while (&atom->flush_queues != &fq->alink) {
+ spin_lock(&(fq->guard));
+
+ if (fq_ready(fq)) {
+ mark_fq_in_use(fq);
+ assert("vs-1246", fq->owner == NULL);
+ ON_DEBUG(fq->owner = current);
+ spin_unlock(&(fq->guard));
+
+ if (*new_fq)
+ done_fq(*new_fq);
+ *new_fq = fq;
+ return 0;
+ }
+ spin_unlock(&(fq->guard));
+
+ fq = list_entry(fq->alink.next, flush_queue_t, alink);
+ }
+ /*
+ * Use previously allocated fq object
+ */
+ if (*new_fq) {
+ mark_fq_in_use(*new_fq);
+ assert("vs-1248", (*new_fq)->owner == 0);
+ ON_DEBUG((*new_fq)->owner = current);
+ attach_fq(atom, *new_fq);
+ return 0;
+ }
+ spin_unlock_atom(atom);
+
+ *new_fq = create_fq(gfp);
+
+ if (*new_fq == NULL)
+ return RETERR(-ENOMEM);
+ /*
+ * caller should re-acquire atom lock and call this function again
+ */
+ return RETERR(-E_REPEAT);
+}
+
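+/* get a flush queue for @atom using the gfp mask of the current context */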
+int reiser4_fq_by_atom(txn_atom * atom, flush_queue_t **new_fq)
+{
+ return fq_by_atom_gfp(atom, new_fq, reiser4_ctx_gfp_mask_get());
+}
+
+/**
+ * A wrapper around reiser4_fq_by_atom for getting a flush queue
+ * object for the current atom; on success fq->atom remains locked
+ */
+flush_queue_t *get_fq_for_current_atom(void)
+{
+ flush_queue_t *fq = NULL;
+ txn_atom *atom;
+ int ret;
+
+ do {
+ atom = get_current_atom_locked();
+ ret = reiser4_fq_by_atom(atom, &fq);
+ } while (ret == -E_REPEAT);
+
+ if (ret)
+ return ERR_PTR(ret);
+ return fq;
+}
+
+/**
+ * Releasing flush queue object after exclusive use
+ */
+void reiser4_fq_put_nolock(flush_queue_t *fq)
+{
+ assert("zam-747", fq->atom != NULL);
+ assert("zam-902", list_empty_careful(ATOM_FQ_LIST(fq)));
+ mark_fq_ready(fq);
+ assert("vs-1245", fq->owner == current);
+ ON_DEBUG(fq->owner = NULL);
+}
+
+void reiser4_fq_put(flush_queue_t *fq)
+{
+ txn_atom *atom;
+
+ spin_lock(&(fq->guard));
+ atom = atom_locked_by_fq_nolock(fq);
+
+ assert("zam-746", atom != NULL);
+
+ reiser4_fq_put_nolock(fq);
+ reiser4_atom_send_event(atom);
+
+ spin_unlock(&(fq->guard));
+ spin_unlock_atom(atom);
+}
+
+/**
+ * A part of atom object initialization related to the
+ * embedded flush queue list head
+ */
+void init_atom_fq_parts(txn_atom *atom)
+{
+ INIT_LIST_HEAD(&atom->flush_queues);
+}
+
+#if REISER4_DEBUG
+void reiser4_check_fq(const txn_atom *atom)
+{
+ flush_queue_t *fq;
+ int count;
+ struct list_head *pos;
+ /*
+ * check number of nodes on all atom's flush queues
+ */
+ count = 0;
+ list_for_each_entry(fq, &atom->flush_queues, alink) {
+ spin_lock(&(fq->guard));
+ /*
+		 * calculate the number of jnodes on fq's list of prepped jnodes
+ */
+ list_for_each(pos, ATOM_FQ_LIST(fq))
+ count++;
+ spin_unlock(&(fq->guard));
+ }
+ if (count != atom->fq)
+ warning("", "fq counter %d, real %d\n", atom->fq, count);
+}
+#endif /* REISER4_DEBUG */
+
+/*
+ * Local variables:
+ * c-indentation-style: "K&R"
+ * mode-name: "LC"
+ * c-basic-offset: 8
+ * tab-width: 8
+ * fill-column: 79
+ * scroll-step: 1
+ * End:
+ */
diff -urN --no-dereference linux-5.10.2.orig/fs/reiser4/forward.h linux-5.10.2/fs/reiser4/forward.h
--- linux-5.10.2.orig/fs/reiser4/forward.h 1970-01-01 01:00:00.000000000 +0100
+++ linux-5.10.2/fs/reiser4/forward.h 2020-12-23 16:07:46.117813114 +0100
@@ -0,0 +1,276 @@
+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
+ reiser4/README */
+
+/* Forward declarations. Thank you Kernighan. */
+
+#if !defined(__REISER4_FORWARD_H__)
+#define __REISER4_FORWARD_H__
+
+#include <asm/errno.h>
+#include <linux/types.h>
+
+typedef struct zlock zlock;
+typedef struct lock_stack lock_stack;
+typedef struct lock_handle lock_handle;
+typedef struct znode znode;
+typedef struct flow flow_t;
+typedef struct coord coord_t;
+typedef struct tree_access_pointer tap_t;
+typedef struct reiser4_object_create_data reiser4_object_create_data;
+typedef union reiser4_plugin reiser4_plugin;
+typedef __u16 reiser4_plugin_id;
+typedef __u64 reiser4_plugin_groups;
+typedef struct item_plugin item_plugin;
+typedef struct jnode_plugin jnode_plugin;
+typedef struct reiser4_item_data reiser4_item_data;
+typedef union reiser4_key reiser4_key;
+typedef struct reiser4_tree reiser4_tree;
+typedef struct carry_cut_data carry_cut_data;
+typedef struct carry_kill_data carry_kill_data;
+typedef struct carry_tree_op carry_tree_op;
+typedef struct carry_tree_node carry_tree_node;
+typedef struct carry_plugin_info carry_plugin_info;
+typedef struct reiser4_journal reiser4_journal;
+typedef struct txn_atom txn_atom;
+typedef struct txn_handle txn_handle;
+typedef struct txn_mgr txn_mgr;
+typedef struct reiser4_dir_entry_desc reiser4_dir_entry_desc;
+typedef struct reiser4_context reiser4_context;
+typedef struct atom_brick_info atom_brick_info;
+typedef struct ctx_brick_info ctx_brick_info;
+typedef struct flush_brick_info flush_brick_info;
+typedef struct carry_level carry_level;
+typedef struct blocknr_set_entry blocknr_set_entry;
+typedef struct blocknr_list_entry blocknr_list_entry;
+typedef struct reiser4_volinfo reiser4_volinfo;
+typedef struct reiser4_volume reiser4_volume;
+typedef struct reiser4_subvol reiser4_subvol;
+/* super_block->s_fs_info points to this */
+typedef struct reiser4_super_info_data reiser4_super_info_data;
+/* next two objects are fields of reiser4_super_info_data */
+typedef struct reiser4_oid_allocator reiser4_oid_allocator;
+typedef struct reiser4_space_allocator reiser4_space_allocator;
+
+typedef struct flush_scan flush_scan;
+typedef struct flush_position flush_pos_t;
+
+typedef unsigned short pos_in_node_t;
+
+typedef struct lv_conf lv_conf;
+typedef reiser4_subvol *mirror_t;
+typedef mirror_t *slot_t;
+typedef void *bucket_t;
+
+#define MAX_POS_IN_NODE 65535
+#define MAX_NUM_SUBVOLS 8
+
+typedef struct jnode jnode;
+typedef struct reiser4_blocknr_hint reiser4_blocknr_hint;
+
+typedef struct uf_coord uf_coord_t;
+typedef struct hint hint_t;
+
+typedef struct ktxnmgrd_context ktxnmgrd_context;
+typedef union reiser4_dcx reiser4_dcx;
+typedef struct fnode fnode;
+
+struct inode;
+struct page;
+struct file;
+struct dentry;
+struct super_block;
+
+/* return values of coord_by_key(). cbk == coord_by_key */
+typedef enum {
+ CBK_COORD_FOUND = 0,
+ CBK_COORD_NOTFOUND = -ENOENT,
+} lookup_result;
+
+/* results of lookup with directory file */
+typedef enum {
+ FILE_NAME_FOUND = 0,
+ FILE_NAME_NOTFOUND = -ENOENT,
+ FILE_IO_ERROR = -EIO, /* FIXME: it seems silly to have special OOM,
+ IO_ERROR return codes for each search. */
+ FILE_OOM = -ENOMEM /* FIXME: it seems silly to have special OOM,
+ IO_ERROR return codes for each search. */
+} file_lookup_result;
+
+/* behaviors of lookup. If coord we are looking for is actually in a tree,
+ both coincide. */
+typedef enum {
+ /* search exactly for the coord with key given */
+ FIND_EXACT,
+ /* search for coord with the maximal key not greater than one
+ given */
+ FIND_MAX_NOT_MORE_THAN /*LEFT_SLANT_BIAS */
+} lookup_bias;
+
+typedef enum {
+ /* number of leaf level of the tree
+ The fake root has (tree_level=0). */
+ LEAF_LEVEL = 1,
+
+ /* number of level one above leaf level of the tree.
+
+ It is supposed that internal tree used by reiser4 to store file
+ system data and meta data will have height 2 initially (when
+ created by mkfs).
+ */
+ TWIG_LEVEL = 2,
+} tree_level;
+
+/* The "real" maximum ztree height is the 0-origin size of any per-level
+ array, since the zero'th level is not used. */
+#define REAL_MAX_ZTREE_HEIGHT (REISER4_MAX_ZTREE_HEIGHT-LEAF_LEVEL)
+
+/* enumeration of possible mutual position of item and coord. This enum is
+ return type of ->is_in_item() item plugin method which see. */
+typedef enum {
+ /* coord is on the left of an item */
+ IP_ON_THE_LEFT,
+ /* coord is inside item */
+ IP_INSIDE,
+ /* coord is inside item, but to the right of the rightmost unit of
+ this item */
+ IP_RIGHT_EDGE,
+ /* coord is on the right of an item */
+ IP_ON_THE_RIGHT
+} interposition;
+
+/* type of lock to acquire on znode before returning it to caller */
+typedef enum {
+ ZNODE_NO_LOCK = 0,
+ ZNODE_READ_LOCK = 1,
+ ZNODE_WRITE_LOCK = 2,
+} znode_lock_mode;
+
+/* type of lock request */
+typedef enum {
+ ZNODE_LOCK_LOPRI = 0,
+ ZNODE_LOCK_HIPRI = (1 << 0),
+
+ /* By setting the ZNODE_LOCK_NONBLOCK flag in a lock request the call to
+ longterm_lock_znode will not sleep waiting for the lock to become
+ available. If the lock is unavailable, reiser4_znode_lock will
+ immediately return the value -E_REPEAT. */
+ ZNODE_LOCK_NONBLOCK = (1 << 1),
+ /* An option for longterm_lock_znode which prevents atom fusion */
+ ZNODE_LOCK_DONT_FUSE = (1 << 2)
+} znode_lock_request;
+
+typedef enum { READ_OP = 0, WRITE_OP = 1 } rw_op;
+
+/* used to specify direction of shift. These must be -1 and 1 */
+typedef enum {
+ SHIFT_LEFT = 1,
+ SHIFT_RIGHT = -1
+} shift_direction;
+
+typedef enum {
+ LEFT_SIDE,
+ RIGHT_SIDE
+} sideof;
+
+#define reiser4_round_up(value, order) \
+ ((typeof(value))(((long) (value) + (order) - 1U) & \
+ ~((order) - 1)))
+
+/* values returned by squalloc_right_neighbor and its auxiliary functions */
+typedef enum {
+ /* unit of internal item is moved */
+ SUBTREE_MOVED = 0,
+ /* nothing else can be squeezed into left neighbor */
+ SQUEEZE_TARGET_FULL = 1,
+ /* all content of node is squeezed into its left neighbor */
+ SQUEEZE_SOURCE_EMPTY = 2,
+	/* one more item is copied (this is only returned by
+	   allocate_and_copy_extent to squalloc_twig) */
+ SQUEEZE_CONTINUE = 3
+} squeeze_result;
+
+/* Do not change item ids. If you do, there will be a disk format change. */
+typedef enum {
+ STATIC_STAT_DATA_ID = 0x0,
+ SIMPLE_DIR_ENTRY_ID = 0x1,
+ COMPOUND_DIR_ID = 0x2,
+ NODE_POINTER_ID = 0x3,
+ EXTENT40_POINTER_ID = 0x5,
+ FORMATTING_ID = 0x6,
+ CTAIL_ID = 0x7,
+ BLACK_BOX_ID = 0x8,
+ EXTENT41_POINTER_ID = 0x9,
+ BRICK_SYMBOL_ID = 0xa,
+ LAST_ITEM_ID = 0xb,
+} item_id;
+
+/* Flags passed to jnode_flush() to allow it to distinguish default settings
+ based on whether commit() was called or VM memory pressure was applied. */
+typedef enum {
+ /* submit flush queue to disk at jnode_flush completion */
+ JNODE_FLUSH_WRITE_BLOCKS = 1,
+
+ /* flush is called for commit */
+ JNODE_FLUSH_COMMIT = 2,
+ /* not implemented */
+ JNODE_FLUSH_MEMORY_FORMATTED = 4,
+
+ /* not implemented */
+ JNODE_FLUSH_MEMORY_UNFORMATTED = 8,
+} jnode_flush_flags;
+
+/* Flags to insert/paste carry operations. Currently they are only used in
+   the flushing code, but in the future they can be used to optimize for
+   repetitive accesses. */
+typedef enum {
+ /* carry is not allowed to shift data to the left when trying to find
+ free space */
+ COPI_DONT_SHIFT_LEFT = (1 << 0),
+ /* carry is not allowed to shift data to the right when trying to find
+ free space */
+ COPI_DONT_SHIFT_RIGHT = (1 << 1),
+ /* carry is not allowed to allocate new node(s) when trying to find
+ free space */
+ COPI_DONT_ALLOCATE = (1 << 2),
+	/* try to load the left neighbor if it is not in the cache */
+	COPI_LOAD_LEFT = (1 << 3),
+	/* try to load the right neighbor if it is not in the cache */
+ COPI_LOAD_RIGHT = (1 << 4),
+ /* shift insertion point to the left neighbor */
+ COPI_GO_LEFT = (1 << 5),
+ /* shift insertion point to the right neighbor */
+ COPI_GO_RIGHT = (1 << 6),
+ /* try to step back into original node if insertion into new node
+ fails after shifting data there. */
+ COPI_STEP_BACK = (1 << 7),
+ /* use all possible space in the node */
+ COPI_SWEEP = (1 << 8)
+} cop_insert_flag;
+
+typedef enum {
+ SAFE_UNLINK, /* safe-link for unlink */
+ SAFE_TRUNCATE /* safe-link for truncate */
+} reiser4_safe_link_t;
+
+/* this shows which of the atom's lists a jnode is on */
+typedef enum {
+ NOT_CAPTURED,
+ DIRTY_LIST,
+ CLEAN_LIST,
+ FQ_LIST,
+ WB_LIST,
+ OVRWR_LIST
+} atom_list;
+
+/* __REISER4_FORWARD_H__ */
+#endif
+
+/* Make Linus happy.
+ Local variables:
+ c-indentation-style: "K&R"
+ mode-name: "LC"
+ c-basic-offset: 8
+ tab-width: 8
+ fill-column: 120
+ End:
+*/
diff -urN --no-dereference linux-5.10.2.orig/fs/reiser4/fsdata.c linux-5.10.2/fs/reiser4/fsdata.c
--- linux-5.10.2.orig/fs/reiser4/fsdata.c 1970-01-01 01:00:00.000000000 +0100
+++ linux-5.10.2/fs/reiser4/fsdata.c 2020-12-23 16:07:46.117813114 +0100
@@ -0,0 +1,801 @@
+/* Copyright 2001, 2002, 2003, 2004, 2005 by Hans Reiser, licensing governed by
+ * reiser4/README */
+
+#include "fsdata.h"
+#include "inode.h"
+
+#include <linux/shrinker.h>
+
+/* cache of dir_cursors */
+static struct kmem_cache *d_cursor_cache;
+
+/* list of unused cursors */
+static LIST_HEAD(cursor_cache);
+
+/* number of cursors in the list of unused cursors */
+static unsigned long d_cursor_unused = 0;
+
+/* spinlock protecting manipulations with dir_cursor's hash table and lists */
+DEFINE_SPINLOCK(d_c_lock);
+
+static reiser4_file_fsdata *create_fsdata(struct file *file);
+static int file_is_stateless(struct file *file);
+static void free_fsdata(reiser4_file_fsdata *fsdata);
+static void kill_cursor(dir_cursor *);
+
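+/*
+ * Shrinker callback: walk the list of unused dir_cursors and free up to
+ * sc->nr_to_scan of them. Returns the number of objects freed.
+ */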
+static unsigned long d_cursor_shrink_scan(struct shrinker *shrink,
+ struct shrink_control *sc)
+{
+ dir_cursor *scan;
+ unsigned long freed = 0;
+
+ spin_lock(&d_c_lock);
+ while (!list_empty(&cursor_cache) && sc->nr_to_scan) {
+ scan = list_entry(cursor_cache.next, dir_cursor, alist);
+ assert("nikita-3567", scan->ref == 0);
+ kill_cursor(scan);
+ freed++;
+ sc->nr_to_scan--;
+ }
+ spin_unlock(&d_c_lock);
+ return freed;
+}
+
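+/*
+ * Shrinker callback: report the number of unused dir_cursors that could be
+ * reclaimed.
+ */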
+static unsigned long d_cursor_shrink_count (struct shrinker *shrink,
+ struct shrink_control *sc)
+{
+ return d_cursor_unused;
+}
+
+/*
+ * actually, d_cursors are "priceless", because there is no way to
+ * recover the information stored in them. On the other hand, we don't
+ * want them to consume all kernel memory. As a compromise, just
+ * assign a higher "seeks" value to the d_cursor cache, so that it will
+ * be shrunk only if the system is really tight on memory.
+ */
+static struct shrinker d_cursor_shrinker = {
+ .count_objects = d_cursor_shrink_count,
+ .scan_objects = d_cursor_shrink_scan,
+ .seeks = DEFAULT_SEEKS << 3
+};
+
+/**
+ * reiser4_init_d_cursor - create d_cursor cache
+ *
+ * Initializes slab cache of d_cursors. It is part of reiser4 module
+ * initialization.
+ */
+int reiser4_init_d_cursor(void)
+{
+	int ret;
+
+	d_cursor_cache = kmem_cache_create("d_cursor", sizeof(dir_cursor), 0,
+					   SLAB_HWCACHE_ALIGN, NULL);
+	if (d_cursor_cache == NULL)
+		return RETERR(-ENOMEM);
+
+	/* register_shrinker() can fail; don't ignore its return value */
+	ret = register_shrinker(&d_cursor_shrinker);
+	if (ret != 0) {
+		destroy_reiser4_cache(&d_cursor_cache);
+		return RETERR(ret);
+	}
+	return 0;
+}
+
+/**
+ * reiser4_done_d_cursor - delete d_cursor cache and d_cursor shrinker
+ *
+ * This is called on reiser4 module unloading or system shutdown.
+ */
+void reiser4_done_d_cursor(void)
+{
+ unregister_shrinker(&d_cursor_shrinker);
+
+ destroy_reiser4_cache(&d_cursor_cache);
+}
+
+#define D_CURSOR_TABLE_SIZE (256)
+
+static inline unsigned long
+d_cursor_hash(d_cursor_hash_table * table, const struct d_cursor_key *key)
+{
+ assert("nikita-3555", IS_POW(D_CURSOR_TABLE_SIZE));
+ return (key->oid + key->cid) & (D_CURSOR_TABLE_SIZE - 1);
+}
+
+static inline int d_cursor_eq(const struct d_cursor_key *k1,
+ const struct d_cursor_key *k2)
+{
+ return k1->cid == k2->cid && k1->oid == k2->oid;
+}
+
+/*
+ * define functions to manipulate reiser4 super block's hash table of
+ * dir_cursors
+ */
+#define KMALLOC(size) kmalloc((size), reiser4_ctx_gfp_mask_get())
+#define KFREE(ptr, size) kfree(ptr)
+TYPE_SAFE_HASH_DEFINE(d_cursor,
+ dir_cursor,
+ struct d_cursor_key,
+ key, hash, d_cursor_hash, d_cursor_eq);
+#undef KFREE
+#undef KMALLOC
+
+/**
+ * reiser4_init_super_d_info - initialize per-super-block d_cursor resources
+ * @super: super block to initialize
+ *
+ * Initializes per-super-block d_cursor's hash table and radix tree. It is part
+ * of mount.
+ */
+int reiser4_init_super_d_info(struct super_block *super)
+{
+ struct d_cursor_info *p;
+
+ p = &get_super_private(super)->d_info;
+
+ INIT_RADIX_TREE(&p->tree, reiser4_ctx_gfp_mask_get());
+ return d_cursor_hash_init(&p->table, D_CURSOR_TABLE_SIZE);
+}
+
+/**
+ * reiser4_done_super_d_info - release per-super-block d_cursor resources
+ * @super: super block being umounted
+ *
+ * It is called on umount. Kills all directory cursors attached to the super
+ * block.
+ */
+void reiser4_done_super_d_info(struct super_block *super)
+{
+ struct d_cursor_info *d_info;
+ dir_cursor *cursor, *next;
+
+ d_info = &get_super_private(super)->d_info;
+ for_all_in_htable(&d_info->table, d_cursor, cursor, next)
+ kill_cursor(cursor);
+
+ BUG_ON(!radix_tree_empty(&d_info->tree));
+ d_cursor_hash_done(&d_info->table);
+}
+
+/**
+ * kill_cursor - free dir_cursor and reiser4_file_fsdata attached to it
+ * @cursor: cursor to free
+ *
+ * Removes the reiser4_file_fsdata attached to @cursor from the readdir list
+ * of reiser4_inode and frees that reiser4_file_fsdata. Removes @cursor from
+ * the indices (radix tree, hash table) and the list of unused cursors, and
+ * frees it.
+ */
+static void kill_cursor(dir_cursor *cursor)
+{
+ unsigned long index;
+
+ assert("nikita-3566", cursor->ref == 0);
+ assert("nikita-3572", cursor->fsdata != NULL);
+
+ index = (unsigned long)cursor->key.oid;
+ list_del_init(&cursor->fsdata->dir.linkage);
+ free_fsdata(cursor->fsdata);
+ cursor->fsdata = NULL;
+
+ if (list_empty_careful(&cursor->list))
+ /* this is last cursor for a file. Kill radix-tree entry */
+ radix_tree_delete(&cursor->info->tree, index);
+ else {
+ void **slot;
+
+ /*
+ * there are other cursors for the same oid.
+ */
+
+		/*
+		 * if the radix tree points to the cursor being removed,
+		 * re-target the radix tree slot to the next cursor in the
+		 * (non-empty, as was checked above) circular list of all
+		 * cursors for this oid.
+		 */
+ slot = radix_tree_lookup_slot(&cursor->info->tree, index);
+ assert("nikita-3571", *slot != NULL);
+ if (*slot == cursor)
+ *slot = list_entry(cursor->list.next, dir_cursor, list);
+ /* remove cursor from circular list */
+ list_del_init(&cursor->list);
+ }
+ /* remove cursor from the list of unused cursors */
+ list_del_init(&cursor->alist);
+ /* remove cursor from the hash table */
+ d_cursor_hash_remove(&cursor->info->table, cursor);
+ /* and free it */
+ kmem_cache_free(d_cursor_cache, cursor);
+ --d_cursor_unused;
+}
+
+/* possible actions that can be performed on all cursors for the given file */
+enum cursor_action {
+ /*
+ * load all detached state: this is called when stat-data is loaded
+ * from the disk to recover information about all pending readdirs
+ */
+ CURSOR_LOAD,
+ /*
+	 * detach all state from inode, leaving it in the cache. This is called
+	 * when the inode is removed from memory by memory pressure
+ */
+ CURSOR_DISPOSE,
+ /*
+ * detach cursors from the inode, and free them. This is called when
+ * inode is destroyed
+ */
+ CURSOR_KILL
+};
+
+/*
+ * return d_cursor data for the file system @inode is in.
+ */
+static inline struct d_cursor_info *d_info(struct inode *inode)
+{
+ return &get_super_private(inode->i_sb)->d_info;
+}
+
+/*
+ * lookup d_cursor in the per-super-block radix tree.
+ */
+static inline dir_cursor *lookup(struct d_cursor_info *info,
+ unsigned long index)
+{
+ return (dir_cursor *) radix_tree_lookup(&info->tree, index);
+}
+
+/*
+ * attach @cursor to the radix tree. There may be multiple cursors for the
+ * same oid; they are chained into a circular list.
+ */
+static void bind_cursor(dir_cursor * cursor, unsigned long index)
+{
+ dir_cursor *head;
+
+ head = lookup(cursor->info, index);
+ if (head == NULL) {
+ /* this is the first cursor for this index */
+ INIT_LIST_HEAD(&cursor->list);
+ radix_tree_insert(&cursor->info->tree, index, cursor);
+ } else {
+ /* some cursor already exists. Chain ours */
+ list_add(&cursor->list, &head->list);
+ }
+}
+
+/*
+ * detach fsdata (if detachable) from the file descriptor, and put the cursor
+ * on the "unused" list. Called when the file descriptor is no longer in
+ * active use.
+ */
+static void clean_fsdata(struct file *file)
+{
+ dir_cursor *cursor;
+ reiser4_file_fsdata *fsdata;
+
+ assert("nikita-3570", file_is_stateless(file));
+
+ fsdata = (reiser4_file_fsdata *) file->private_data;
+ if (fsdata != NULL) {
+ cursor = fsdata->cursor;
+ if (cursor != NULL) {
+ spin_lock(&d_c_lock);
+ --cursor->ref;
+ if (cursor->ref == 0) {
+ list_add_tail(&cursor->alist, &cursor_cache);
+ ++d_cursor_unused;
+ }
+ spin_unlock(&d_c_lock);
+ file->private_data = NULL;
+ }
+ }
+}
+
+/*
+ * global counter used to generate "client ids". These ids are encoded into
+ * high bits of fpos.
+ */
+static __u32 cid_counter = 0;
+#define CID_SHIFT (20)
+#define CID_MASK (0xfffffull)
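+
+/*
+ * Layout of the readdir cookie built by insert_cursor() and decoded by
+ * reiser4_get_dir_fpos():
+ *
+ *   bits 20..63: client id (cid), i.e. cid << CID_SHIFT
+ *   bits  0..19: position within the directory (covered by CID_MASK)
+ */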
+
+static void free_file_fsdata_nolock(struct file *);
+
+/**
+ * insert_cursor - allocate file_fsdata, insert cursor to tree and hash table
+ * @cursor: cursor to insert
+ * @file: file to attach the detachable readdir state to
+ * @fpos: resulting f_pos cookie (client id encoded in its high bits)
+ * @inode: inode of the directory
+ *
+ * Allocates reiser4_file_fsdata, attaches it to @cursor, inserts the cursor
+ * into the reiser4 super block's hash table and radix tree, and adds the
+ * detachable readdir state to @file.
+ */
+static int insert_cursor(dir_cursor *cursor, struct file *file, loff_t *fpos,
+ struct inode *inode)
+{
+ int result;
+ reiser4_file_fsdata *fsdata;
+
+ memset(cursor, 0, sizeof *cursor);
+
+ /* this is either first call to readdir, or rewind. Anyway, create new
+ * cursor. */
+ fsdata = create_fsdata(NULL);
+ if (fsdata != NULL) {
+ result = radix_tree_preload(reiser4_ctx_gfp_mask_get());
+ if (result == 0) {
+ struct d_cursor_info *info;
+ oid_t oid;
+
+ info = d_info(inode);
+ oid = get_inode_oid(inode);
+			/* cid occupies the high bits of f->f_pos. Limit it to
+			 * 11 bits (mask 0x7ff) so that the cookie doesn't
+			 * become negative: a negative cookie confuses
+			 * nfsd_readdir() */
+ cursor->key.cid = (++cid_counter) & 0x7ff;
+ cursor->key.oid = oid;
+ cursor->fsdata = fsdata;
+ cursor->info = info;
+ cursor->ref = 1;
+
+ spin_lock_inode(inode);
+ /* install cursor as @f's private_data, discarding old
+ * one if necessary */
+#if REISER4_DEBUG
+ if (file->private_data)
+ warning("", "file has fsdata already");
+#endif
+ clean_fsdata(file);
+ free_file_fsdata_nolock(file);
+ file->private_data = fsdata;
+ fsdata->cursor = cursor;
+ spin_unlock_inode(inode);
+ spin_lock(&d_c_lock);
+ /* insert cursor into hash table */
+ d_cursor_hash_insert(&info->table, cursor);
+ /* and chain it into radix-tree */
+ bind_cursor(cursor, (unsigned long)oid);
+ spin_unlock(&d_c_lock);
+ radix_tree_preload_end();
+ *fpos = ((__u64) cursor->key.cid) << CID_SHIFT;
+ }
+ } else
+ result = RETERR(-ENOMEM);
+ return result;
+}
+
+/**
+ * process_cursors - do action on each cursor attached to inode
+ * @inode:
+ * @act: action to do
+ *
+ * Finds all cursors of @inode in reiser4's super block radix tree of cursors
+ * and performs action specified by @act on each of cursors.
+ */
+static void process_cursors(struct inode *inode, enum cursor_action act)
+{
+ oid_t oid;
+ dir_cursor *start;
+ struct list_head *head;
+ reiser4_context *ctx;
+ struct d_cursor_info *info;
+
+ /* this can be called by
+ *
+ * kswapd->...->prune_icache->..reiser4_destroy_inode
+ *
+ * without reiser4_context
+ */
+ ctx = reiser4_init_context(inode->i_sb);
+ if (IS_ERR(ctx)) {
+ warning("vs-23", "failed to init context");
+ return;
+ }
+
+ assert("nikita-3558", inode != NULL);
+
+ info = d_info(inode);
+ oid = get_inode_oid(inode);
+ spin_lock_inode(inode);
+ head = get_readdir_list(inode);
+ spin_lock(&d_c_lock);
+	/* find any cursor for this oid: a reference to it is hanging off the
+	 * radix tree */
+ start = lookup(info, (unsigned long)oid);
+ if (start != NULL) {
+ dir_cursor *scan;
+ reiser4_file_fsdata *fsdata;
+
+ /* process circular list of cursors for this oid */
+ scan = start;
+ do {
+ dir_cursor *next;
+
+ next = list_entry(scan->list.next, dir_cursor, list);
+ fsdata = scan->fsdata;
+ assert("nikita-3557", fsdata != NULL);
+ if (scan->key.oid == oid) {
+ switch (act) {
+ case CURSOR_DISPOSE:
+ list_del_init(&fsdata->dir.linkage);
+ break;
+ case CURSOR_LOAD:
+ list_add(&fsdata->dir.linkage, head);
+ break;
+ case CURSOR_KILL:
+ kill_cursor(scan);
+ break;
+ }
+ }
+ if (scan == next)
+ /* last cursor was just killed */
+ break;
+ scan = next;
+ } while (scan != start);
+ }
+ spin_unlock(&d_c_lock);
+ /* check that we killed 'em all */
+ assert("nikita-3568",
+ ergo(act == CURSOR_KILL,
+ list_empty_careful(get_readdir_list(inode))));
+ assert("nikita-3569",
+ ergo(act == CURSOR_KILL, lookup(info, oid) == NULL));
+ spin_unlock_inode(inode);
+ reiser4_exit_context(ctx);
+}
+
+/**
+ * reiser4_dispose_cursors - removes cursors from inode's list
+ * @inode: inode to dispose cursors of
+ *
+ * For each of the cursors corresponding to @inode, removes the
+ * reiser4_file_fsdata attached to the cursor from the inode's readdir list.
+ * This is called when the inode is removed from memory by memory pressure.
+ */
+void reiser4_dispose_cursors(struct inode *inode)
+{
+ process_cursors(inode, CURSOR_DISPOSE);
+}
+
+/**
+ * reiser4_load_cursors - attach cursors to inode
+ * @inode: inode to load cursors to
+ *
+ * For each of the cursors corresponding to @inode, adds the
+ * reiser4_file_fsdata attached to the cursor to the inode's readdir list.
+ * This is done when the inode is loaded into memory.
+ */
+void reiser4_load_cursors(struct inode *inode)
+{
+ process_cursors(inode, CURSOR_LOAD);
+}
+
+/**
+ * reiser4_kill_cursors - kill all inode cursors
+ * @inode: inode to kill cursors of
+ *
+ * Frees all cursors for this inode. This is called when inode is destroyed.
+ */
+void reiser4_kill_cursors(struct inode *inode)
+{
+ process_cursors(inode, CURSOR_KILL);
+}
+
+/**
+ * file_is_stateless - check whether a file descriptor is stateless
+ * @file: file descriptor to check
+ *
+ * Returns true if file descriptor @file was created by the NFS server "on
+ * demand" to serve one file system operation. This means that there may be
+ * "detached state" for the underlying inode.
+ */
+static int file_is_stateless(struct file *file)
+{
+ return reiser4_get_dentry_fsdata(file->f_path.dentry)->stateless;
+}
+
+/**
+ * reiser4_get_dir_fpos - get directory position from a readdir cookie
+ * @dir: directory being read
+ * @fpos: effective value of dir->f_pos
+ *
+ * Calculates ->fpos from a user-supplied cookie. Normally it is dir->f_pos,
+ * but in the case of a stateless directory operation (readdir-over-nfs), the
+ * client id was encoded in the high bits of the cookie and should be masked
+ * off.
+ */
+loff_t reiser4_get_dir_fpos(struct file *dir, loff_t fpos)
+{
+ if (file_is_stateless(dir))
+ return fpos & CID_MASK;
+ else
+ return fpos;
+}
+
+/**
+ * reiser4_attach_fsdata - try to attach fsdata
+ * @file:
+ * @fpos: effective value of @file->f_pos
+ * @inode:
+ *
+ * Finds or creates cursor for readdir-over-nfs.
+ */
+int reiser4_attach_fsdata(struct file *file, loff_t *fpos, struct inode *inode)
+{
+ loff_t pos;
+ int result;
+ dir_cursor *cursor;
+
+ /*
+ * we are serialized by inode->i_mutex
+ */
+ if (!file_is_stateless(file))
+ return 0;
+
+ pos = *fpos;
+ result = 0;
+ if (pos == 0) {
+ /*
+ * first call to readdir (or rewind to the beginning of
+ * directory)
+ */
+ cursor = kmem_cache_alloc(d_cursor_cache,
+ reiser4_ctx_gfp_mask_get());
+ if (cursor != NULL)
+ result = insert_cursor(cursor, file, fpos, inode);
+ else
+ result = RETERR(-ENOMEM);
+ } else {
+ /* try to find existing cursor */
+ struct d_cursor_key key;
+
+ key.cid = pos >> CID_SHIFT;
+ key.oid = get_inode_oid(inode);
+ spin_lock(&d_c_lock);
+ cursor = d_cursor_hash_find(&d_info(inode)->table, &key);
+ if (cursor != NULL) {
+ /* cursor was found */
+ if (cursor->ref == 0) {
+ /* move it from unused list */
+ list_del_init(&cursor->alist);
+ --d_cursor_unused;
+ }
+ ++cursor->ref;
+ }
+ spin_unlock(&d_c_lock);
+ if (cursor != NULL) {
+ spin_lock_inode(inode);
+ assert("nikita-3556", cursor->fsdata->back == NULL);
+ clean_fsdata(file);
+ free_file_fsdata_nolock(file);
+ file->private_data = cursor->fsdata;
+ spin_unlock_inode(inode);
+ }
+ }
+ return result;
+}
+
+/**
+ * reiser4_detach_fsdata - detach fsdata from a stateless file descriptor
+ * @file: file to detach fsdata from
+ *
+ * Detaches fsdata, if necessary.
+ */
+void reiser4_detach_fsdata(struct file *file)
+{
+ struct inode *inode;
+
+ if (!file_is_stateless(file))
+ return;
+
+ inode = file_inode(file);
+ spin_lock_inode(inode);
+ clean_fsdata(file);
+ spin_unlock_inode(inode);
+}
+
+/* slab for reiser4_dentry_fsdata */
+static struct kmem_cache *dentry_fsdata_cache;
+
+/**
+ * reiser4_init_dentry_fsdata - create cache of dentry_fsdata
+ *
+ * Initializes slab cache of structures attached to dentry->d_fsdata. It is
+ * part of reiser4 module initialization.
+ */
+int reiser4_init_dentry_fsdata(void)
+{
+ dentry_fsdata_cache = kmem_cache_create("dentry_fsdata",
+ sizeof(struct reiser4_dentry_fsdata),
+ 0,
+ SLAB_HWCACHE_ALIGN |
+ SLAB_RECLAIM_ACCOUNT,
+ NULL);
+ if (dentry_fsdata_cache == NULL)
+ return RETERR(-ENOMEM);
+ return 0;
+}
+
+/**
+ * reiser4_done_dentry_fsdata - delete cache of dentry_fsdata
+ *
+ * This is called on reiser4 module unloading or system shutdown.
+ */
+void reiser4_done_dentry_fsdata(void)
+{
+ destroy_reiser4_cache(&dentry_fsdata_cache);
+}
+
+/**
+ * reiser4_get_dentry_fsdata - get fs-specific dentry data
+ * @dentry: queried dentry
+ *
+ * Allocates if necessary and returns per-dentry data that we attach to each
+ * dentry.
+ */
+struct reiser4_dentry_fsdata *reiser4_get_dentry_fsdata(struct dentry *dentry)
+{
+ assert("nikita-1365", dentry != NULL);
+
+ if (dentry->d_fsdata == NULL) {
+ dentry->d_fsdata = kmem_cache_alloc(dentry_fsdata_cache,
+ reiser4_ctx_gfp_mask_get());
+ if (dentry->d_fsdata == NULL)
+ return ERR_PTR(RETERR(-ENOMEM));
+ memset(dentry->d_fsdata, 0,
+ sizeof(struct reiser4_dentry_fsdata));
+ }
+ return dentry->d_fsdata;
+}
+
+/**
+ * reiser4_free_dentry_fsdata - detach and free dentry_fsdata
+ * @dentry: dentry to free fsdata of
+ *
+ * Detaches and frees fs-specific dentry data
+ */
+void reiser4_free_dentry_fsdata(struct dentry *dentry)
+{
+ if (dentry->d_fsdata != NULL) {
+ kmem_cache_free(dentry_fsdata_cache, dentry->d_fsdata);
+ dentry->d_fsdata = NULL;
+ }
+}
+
+/* slab for reiser4_file_fsdata */
+static struct kmem_cache *file_fsdata_cache;
+
+/**
+ * reiser4_init_file_fsdata - create cache of reiser4_file_fsdata
+ *
+ * Initializes slab cache of structures attached to file->private_data. It is
+ * part of reiser4 module initialization.
+ */
+int reiser4_init_file_fsdata(void)
+{
+ file_fsdata_cache = kmem_cache_create("file_fsdata",
+ sizeof(reiser4_file_fsdata),
+ 0,
+ SLAB_HWCACHE_ALIGN |
+ SLAB_RECLAIM_ACCOUNT, NULL);
+ if (file_fsdata_cache == NULL)
+ return RETERR(-ENOMEM);
+ return 0;
+}
+
+/**
+ * reiser4_done_file_fsdata - delete cache of reiser4_file_fsdata
+ *
+ * This is called on reiser4 module unloading or system shutdown.
+ */
+void reiser4_done_file_fsdata(void)
+{
+ destroy_reiser4_cache(&file_fsdata_cache);
+}
+
+/**
+ * create_fsdata - allocate and initialize reiser4_file_fsdata
+ * @file: what to create file_fsdata for, may be NULL
+ *
+ * Allocates and initializes reiser4_file_fsdata structure.
+ */
+static reiser4_file_fsdata *create_fsdata(struct file *file)
+{
+ reiser4_file_fsdata *fsdata;
+
+ fsdata = kmem_cache_alloc(file_fsdata_cache,
+ reiser4_ctx_gfp_mask_get());
+ if (fsdata != NULL) {
+ memset(fsdata, 0, sizeof *fsdata);
+ fsdata->back = file;
+ INIT_LIST_HEAD(&fsdata->dir.linkage);
+ }
+ return fsdata;
+}
+
+/**
+ * free_fsdata - free reiser4_file_fsdata
+ * @fsdata: object to free
+ *
+ * Dual to create_fsdata(). Free reiser4_file_fsdata.
+ */
+static void free_fsdata(reiser4_file_fsdata *fsdata)
+{
+ BUG_ON(fsdata == NULL);
+ kmem_cache_free(file_fsdata_cache, fsdata);
+}
+
+/**
+ * reiser4_get_file_fsdata - get fs-specific file data
+ * @file: queried file
+ *
+ * Returns fs-specific data of @file. If it is NULL, allocates it and attaches
+ * to @file.
+ */
+reiser4_file_fsdata *reiser4_get_file_fsdata(struct file *file)
+{
+ assert("nikita-1603", file != NULL);
+
+ if (file->private_data == NULL) {
+ reiser4_file_fsdata *fsdata;
+ struct inode *inode;
+
+ fsdata = create_fsdata(file);
+ if (fsdata == NULL)
+ return ERR_PTR(RETERR(-ENOMEM));
+
+ inode = file_inode(file);
+ spin_lock_inode(inode);
+ if (file->private_data == NULL) {
+ file->private_data = fsdata;
+ fsdata = NULL;
+ }
+ spin_unlock_inode(inode);
+ if (fsdata != NULL)
+ /* other thread initialized ->fsdata */
+ kmem_cache_free(file_fsdata_cache, fsdata);
+ }
+ assert("nikita-2665", file->private_data != NULL);
+ return file->private_data;
+}
+
+/**
+ * free_file_fsdata_nolock - detach and free reiser4_file_fsdata
+ * @file:
+ *
+ * Detaches reiser4_file_fsdata from @file, removes reiser4_file_fsdata from
+ * readdir list, and frees it if it is not linked to a d_cursor object.
+ */
+static void free_file_fsdata_nolock(struct file *file)
+{
+ reiser4_file_fsdata *fsdata;
+
+ assert("", spin_inode_is_locked(file_inode(file)));
+ fsdata = file->private_data;
+ if (fsdata != NULL) {
+ list_del_init(&fsdata->dir.linkage);
+ if (fsdata->cursor == NULL)
+ free_fsdata(fsdata);
+ }
+ file->private_data = NULL;
+}
+
+/**
+ * reiser4_free_file_fsdata - detach from struct file and free reiser4_file_fsdata
+ * @file:
+ *
+ * Spinlocks inode and calls free_file_fsdata_nolock to do the work.
+ */
+void reiser4_free_file_fsdata(struct file *file)
+{
+ spin_lock_inode(file_inode(file));
+ free_file_fsdata_nolock(file);
+ spin_unlock_inode(file_inode(file));
+}
+
+/*
+ * Local variables:
+ * c-indentation-style: "K&R"
+ * mode-name: "LC"
+ * c-basic-offset: 8
+ * tab-width: 8
+ * fill-column: 79
+ * End:
+ */
diff -urN --no-dereference linux-5.10.2.orig/fs/reiser4/fsdata.h linux-5.10.2/fs/reiser4/fsdata.h
--- linux-5.10.2.orig/fs/reiser4/fsdata.h 1970-01-01 01:00:00.000000000 +0100
+++ linux-5.10.2/fs/reiser4/fsdata.h 2020-12-23 16:07:46.117813114 +0100
@@ -0,0 +1,203 @@
+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
+ * reiser4/README */
+
+#if !defined(__REISER4_FSDATA_H__)
+#define __REISER4_FSDATA_H__
+
+#include "debug.h"
+#include "kassign.h"
+#include "seal.h"
+#include "type_safe_hash.h"
+#include "plugin/file/file.h"
+#include "readahead.h"
+
+/*
+ * This header defines reiser4_dentry_fsdata and reiser4_file_fsdata -
+ * reiser4-specific data attached to dentries and to open file descriptors
+ * respectively; see the definitions and comments below.
+ */
+
+/*
+ * locking: the fields of the per-file-descriptor readdir_pos and ->f_pos are
+ * protected by ->i_mutex on the inode. Under this lock the following
+ * invariant holds:
+ *
+ *     the file descriptor is "looking" at the entry_no-th directory entry
+ *     from the beginning of the directory. This entry has key dir_entry_key
+ *     and is the pos-th entry within its duplicate-key sequence.
+ *
+ */
+
+/* logical position within directory */
+struct dir_pos {
+ /* key of directory entry (actually, part of a key sufficient to
+ identify directory entry) */
+ de_id dir_entry_key;
+ /* ordinal number of directory entry among all entries with the same
+ key. (Starting from 0.) */
+ unsigned pos;
+};
+
+struct readdir_pos {
+ /* f_pos corresponding to this readdir position */
+ __u64 fpos;
+ /* logical position within directory */
+ struct dir_pos position;
+ /* logical number of directory entry within
+ directory */
+ __u64 entry_no;
+};
+
+/*
+ * this is used to speed up lookups for directory entry: on initial call to
+ * ->lookup() seal and coord of directory entry (if found, that is) are stored
+ * in struct dentry and reused later to avoid tree traversals.
+ */
+struct de_location {
+ /* seal covering directory entry */
+ seal_t entry_seal;
+ /* coord of directory entry */
+ coord_t entry_coord;
+ /* ordinal number of directory entry among all entries with the same
+ key. (Starting from 0.) */
+ int pos;
+};
+
+/**
+ * reiser4_dentry_fsdata - reiser4-specific data attached to dentries
+ *
+ * This is allocated dynamically and released in d_op->d_release()
+ *
+ * Currently it only contains cached location (hint) of directory entry, but
+ * it is expected that other information will be accumulated here.
+ */
+struct reiser4_dentry_fsdata {
+ /*
+ * here will go fields filled by ->lookup() to speedup next
+ * create/unlink, like blocknr of znode with stat-data, or key of
+ * stat-data.
+ */
+ struct de_location dec;
+ int stateless; /* created through reiser4_decode_fh, needs
+ * special treatment in readdir. */
+};
+
+extern int reiser4_init_dentry_fsdata(void);
+extern void reiser4_done_dentry_fsdata(void);
+extern struct reiser4_dentry_fsdata *reiser4_get_dentry_fsdata(struct dentry *);
+extern void reiser4_free_dentry_fsdata(struct dentry *dentry);
+
+/**
+ * reiser4_file_fsdata - reiser4-specific data attached to file->private_data
+ *
+ * This is allocated dynamically and released in inode->i_fop->release
+ */
+typedef struct reiser4_file_fsdata {
+ /*
+ * pointer back to the struct file which this reiser4_file_fsdata is
+ * part of
+ */
+ struct file *back;
+ /* detached cursor for stateless readdir. */
+ struct dir_cursor *cursor;
+ /*
+ * We need both directory and regular file parts here, because there
+ * are file system objects that are files and directories.
+ */
+ struct {
+ /*
+ * position in directory. It is updated each time directory is
+ * modified
+ */
+ struct readdir_pos readdir;
+ /* head of this list is reiser4_inode->lists.readdir_list */
+ struct list_head linkage;
+ } dir;
+ /* hints to speed up operations with regular files: read and write. */
+ struct {
+ hint_t hint;
+ } reg;
+} reiser4_file_fsdata;
+
+extern int reiser4_init_file_fsdata(void);
+extern void reiser4_done_file_fsdata(void);
+extern reiser4_file_fsdata *reiser4_get_file_fsdata(struct file *);
+extern void reiser4_free_file_fsdata(struct file *);
+
+/*
+ * d_cursor is reiser4_file_fsdata not attached to a struct file. d_cursors
+ * are used to address the problem reiser4 has with readdir accesses via NFS.
+ * See plugin/file_ops_readdir.c for more details.
+ */
+struct d_cursor_key{
+ __u16 cid;
+ __u64 oid;
+};
+
+/*
+ * define structures d_cursor_hash_table and d_cursor_hash_link which are used
+ * to maintain the hash table of dir_cursors in reiser4's super block
+ */
+typedef struct dir_cursor dir_cursor;
+TYPE_SAFE_HASH_DECLARE(d_cursor, dir_cursor);
+
+struct dir_cursor {
+ int ref;
+ reiser4_file_fsdata *fsdata;
+
+ /* link to reiser4 super block hash table of cursors */
+ d_cursor_hash_link hash;
+
+ /*
+ * this is to link cursors to reiser4 super block's radix tree of
+	 * cursors if there is more than one cursor with the same objectid
+ */
+ struct list_head list;
+ struct d_cursor_key key;
+ struct d_cursor_info *info;
+ /* list of unused cursors */
+ struct list_head alist;
+};
+
+extern int reiser4_init_d_cursor(void);
+extern void reiser4_done_d_cursor(void);
+
+extern int reiser4_init_super_d_info(struct super_block *);
+extern void reiser4_done_super_d_info(struct super_block *);
+
+extern loff_t reiser4_get_dir_fpos(struct file *, loff_t);
+extern int reiser4_attach_fsdata(struct file *, loff_t *, struct inode *);
+extern void reiser4_detach_fsdata(struct file *);
+
+/* these are needed for "stateless" readdir. See plugin/file_ops_readdir.c for
+ more details */
+void reiser4_dispose_cursors(struct inode *inode);
+void reiser4_load_cursors(struct inode *inode);
+void reiser4_kill_cursors(struct inode *inode);
+void reiser4_adjust_dir_file(struct inode *dir, const struct dentry *de,
+ int offset, int adj);
+
+/*
+ * this structure is embedded into reiser4_super_info_data. It maintains
+ * d_cursors (detached readdir state). See plugin/file_ops_readdir.c for more
+ * details.
+ */
+struct d_cursor_info {
+ d_cursor_hash_table table;
+ struct radix_tree_root tree;
+};
+
+/* spinlock protecting readdir cursors */
+extern spinlock_t d_c_lock;
+
+/* __REISER4_FSDATA_H__ */
+#endif
+
+/*
+ * Local variables:
+ * c-indentation-style: "K&R"
+ * mode-name: "LC"
+ * c-basic-offset: 8
+ * tab-width: 8
+ * fill-column: 120
+ * End:
+ */
diff -urN --no-dereference linux-5.10.2.orig/fs/reiser4/init_super.c linux-5.10.2/fs/reiser4/init_super.c
--- linux-5.10.2.orig/fs/reiser4/init_super.c 1970-01-01 01:00:00.000000000 +0100
+++ linux-5.10.2/fs/reiser4/init_super.c 2020-12-23 16:07:46.117813114 +0100
@@ -0,0 +1,805 @@
+/* Copyright by Hans Reiser, 2003 */
+
+#include "super.h"
+#include "inode.h"
+#include "plugin/plugin_set.h"
+
+#include <linux/swap.h>
+
+/**
+ * reiser4_init_fs_info - allocate reiser4 specific super block info
+ * @super: super block of filesystem
+ *
+ * Allocates and initializes reiser4_super_info_data, attaches it to
+ * super->s_fs_info, and initializes structures maintaining d_cursors.
+ */
+int reiser4_init_fs_info(struct super_block *super)
+{
+ reiser4_super_info_data *sbinfo;
+
+ sbinfo = kzalloc(sizeof(reiser4_super_info_data),
+ reiser4_ctx_gfp_mask_get());
+ if (!sbinfo)
+ return RETERR(-ENOMEM);
+
+ super->s_fs_info = sbinfo;
+ super->s_op = NULL;
+
+ ON_DEBUG(INIT_LIST_HEAD(&sbinfo->all_jnodes));
+ ON_DEBUG(spin_lock_init(&sbinfo->all_guard));
+
+ mutex_init(&sbinfo->delete_mutex);
+ spin_lock_init(&(sbinfo->guard));
+
+ /* initialize per-super-block d_cursor resources */
+ reiser4_init_super_d_info(super);
+
+ /* initialize global tree lock */
+ rwlock_init(&(sbinfo->tree_lock));
+
+ return 0;
+}
+
+/**
+ * Release reiser4 specific super block info:
+ *
+ * release per-super-block d_cursor resources,
+ * free reiser4_super_info_data.
+ */
+void reiser4_done_fs_info(struct super_block *super)
+{
+ assert("zam-990", super->s_fs_info != NULL);
+
+ reiser4_done_super_d_info(super);
+ kfree(super->s_fs_info);
+ super->s_fs_info = NULL;
+ super->s_op = NULL;
+}
+
+/* type of option parseable by parse_option() */
+typedef enum {
+ /* value of option is arbitrary string */
+ OPT_STRING,
+
+ /*
+ * option specifies bit in a bitmask. When option is set - bit in
+ * sbinfo->fs_flags is set. Examples are bsdgroups, 32bittimes, mtflush,
+ * dont_load_bitmap, atomic_write.
+ */
+ OPT_BIT,
+
+ /*
+	 * value of option should conform to a scanf() format. Examples are
+ * tmgr.atom_max_size=N, tmgr.atom_max_age=N
+ */
+ OPT_FORMAT,
+
+ /*
+ * option can take one of predefined values. Example is onerror=panic or
+ * onerror=remount-ro
+ */
+ OPT_ONEOF,
+
+ /*
+	 * option takes one of the txmod plugin labels.
+ * Example is "txmod=journal" or "txmod=wa"
+ */
+ OPT_TXMOD,
+} opt_type_t;
+
+#if 0
+struct opt_bitmask_bit {
+ const char *bit_name;
+ int bit_nr;
+};
+#endif
+
+#define MAX_ONEOF_LIST 10
+
+/* description of option parseable by parse_option() */
+struct opt_desc {
+ /* option name.
+
+	   The parsed portion of the string has the form "name=value".
+ */
+ const char *name;
+ /* type of option */
+ opt_type_t type;
+ union {
+ /* where to store value of string option (type == OPT_STRING) */
+ char **string;
+ /* description of bits for bit option (type == OPT_BIT) */
+ struct {
+ int nr;
+ void *addr;
+ } bit;
+ /* description of format and targets for format option (type
+ == OPT_FORMAT) */
+ struct {
+ const char *format;
+ int nr_args;
+ void *arg1;
+ void *arg2;
+ void *arg3;
+ void *arg4;
+ } f;
+ struct {
+ int *result;
+ const char *list[MAX_ONEOF_LIST];
+ } oneof;
+ struct {
+ reiser4_txmod_id *result;
+ } txmod;
+ struct {
+ void *addr;
+ int nr_bits;
+ /* struct opt_bitmask_bit *bits; */
+ } bitmask;
+ } u;
+};
+
+/**
+ * parse_option - parse one option
+ * @opt_string: starting point of parsing
+ * @opt: option description
+ *
+ * foo=bar,
+ * ^ ^ ^
+ * | | +-- replaced to '\0'
+ * | +-- val_start
+ * +-- opt_string
+ * Figures out option type and handles option correspondingly.
+ */
+static int parse_option(char *opt_string, struct opt_desc *opt)
+{
+ char *val_start;
+ int result;
+ const char *err_msg;
+
+ /* NOTE-NIKITA think about using lib/cmdline.c functions here. */
+
+ val_start = strchr(opt_string, '=');
+ if (val_start != NULL) {
+ *val_start = '\0';
+ ++val_start;
+ }
+
+ err_msg = NULL;
+ result = 0;
+ switch (opt->type) {
+ case OPT_STRING:
+ if (val_start == NULL) {
+ err_msg = "String arg missing";
+ result = RETERR(-EINVAL);
+ } else
+ *opt->u.string = val_start;
+ break;
+ case OPT_BIT:
+ if (val_start != NULL)
+ err_msg = "Value ignored";
+ else
+ set_bit(opt->u.bit.nr, opt->u.bit.addr);
+ break;
+ case OPT_FORMAT:
+ if (val_start == NULL) {
+ err_msg = "Formatted arg missing";
+ result = RETERR(-EINVAL);
+ break;
+ }
+ if (sscanf(val_start, opt->u.f.format,
+ opt->u.f.arg1, opt->u.f.arg2, opt->u.f.arg3,
+ opt->u.f.arg4) != opt->u.f.nr_args) {
+ err_msg = "Wrong conversion";
+ result = RETERR(-EINVAL);
+ }
+ break;
+ case OPT_ONEOF:
+ {
+ int i = 0;
+
+ if (val_start == NULL) {
+ err_msg = "Value is missing";
+ result = RETERR(-EINVAL);
+ break;
+ }
+ err_msg = "Wrong option value";
+ result = RETERR(-EINVAL);
+ while (opt->u.oneof.list[i]) {
+ if (!strcmp(opt->u.oneof.list[i], val_start)) {
+ result = 0;
+ err_msg = NULL;
+ *opt->u.oneof.result = i;
+ break;
+ }
+ i++;
+ }
+ break;
+ }
+ break;
+ case OPT_TXMOD:
+ {
+ reiser4_txmod_id i = 0;
+
+ if (val_start == NULL) {
+ err_msg = "Value is missing";
+ result = RETERR(-EINVAL);
+ break;
+ }
+ err_msg = "Wrong option value";
+ result = RETERR(-EINVAL);
+ while (i < LAST_TXMOD_ID) {
+ if (!strcmp(txmod_plugins[i].h.label,
+ val_start)) {
+ result = 0;
+ err_msg = NULL;
+ *opt->u.txmod.result = i;
+ break;
+ }
+ i++;
+ }
+ break;
+ }
+ default:
+ wrong_return_value("nikita-2100", "opt -> type");
+ break;
+ }
+ if (err_msg != NULL) {
+ warning("nikita-2496", "%s when parsing option \"%s%s%s\"",
+ err_msg, opt->name, val_start ? "=" : "",
+ val_start ? : "");
+ }
+ return result;
+}
+
+/**
+ * parse_options - parse reiser4 mount options
+ * @opt_string: starting point
+ * @opts: array of option description
+ * @nr_opts: number of elements in @opts
+ *
+ * Parses comma separated list of reiser4 mount options.
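+ * For example, the string "tmgr.atom_max_age=300,dont_load_bitmap" sets the
+ * atom age limit to 300 seconds and turns on the dont_load_bitmap flag.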
+ */
+static int parse_options(char *opt_string, struct opt_desc *opts, int nr_opts)
+{
+ int result;
+
+ result = 0;
+ while ((result == 0) && opt_string && *opt_string) {
+ int j;
+ char *next;
+
+ next = strchr(opt_string, ',');
+ if (next != NULL) {
+ *next = '\0';
+ ++next;
+ }
+ for (j = 0; j < nr_opts; ++j) {
+ if (!strncmp(opt_string, opts[j].name,
+ strlen(opts[j].name))) {
+ result = parse_option(opt_string, &opts[j]);
+ break;
+ }
+ }
+ if (j == nr_opts) {
+ warning("nikita-2307", "Unrecognized option: \"%s\"",
+ opt_string);
+ /* traditionally, -EINVAL is returned on wrong mount
+ option */
+ result = RETERR(-EINVAL);
+ }
+ opt_string = next;
+ }
+ return result;
+}
+
+#define NUM_OPT(label, fmt, addr) \
+ { \
+ .name = (label), \
+ .type = OPT_FORMAT, \
+ .u = { \
+ .f = { \
+ .format = (fmt), \
+ .nr_args = 1, \
+ .arg1 = (addr), \
+ .arg2 = NULL, \
+ .arg3 = NULL, \
+ .arg4 = NULL \
+ } \
+ } \
+ }
+
+#define SB_FIELD_OPT(field, fmt) NUM_OPT(#field, fmt, &sbinfo->field)
+
+#define BIT_OPT(label, bitnr) \
+ { \
+ .name = label, \
+ .type = OPT_BIT, \
+ .u = { \
+ .bit = { \
+ .nr = bitnr, \
+ .addr = &sbinfo->fs_flags \
+ } \
+ } \
+ }
+
+#define MAX_NR_OPTIONS (30)
+
+#if REISER4_DEBUG
+# define OPT_ARRAY_CHECK(opt, array) \
+ if ((opt) > (array) + MAX_NR_OPTIONS) { \
+ warning("zam-1046", "opt array is overloaded"); break; \
+ }
+#else
+# define OPT_ARRAY_CHECK(opt, array) noop
+#endif
+
+#define PUSH_OPT(opt, array, ...) \
+do { \
+ struct opt_desc o = __VA_ARGS__; \
+ OPT_ARRAY_CHECK(opt, array); \
+ *(opt) ++ = o; \
+} while (0)
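+
+/*
+ * PUSH_OPT appends one struct opt_desc initializer to the options array;
+ * in debug builds OPT_ARRAY_CHECK warns when more than MAX_NR_OPTIONS
+ * entries are pushed.
+ */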
+
+static noinline void push_sb_field_opts(struct opt_desc **p,
+ struct opt_desc *opts,
+ reiser4_super_info_data *sbinfo)
+{
+#define PUSH_SB_FIELD_OPT(field, format) \
+ PUSH_OPT(*p, opts, SB_FIELD_OPT(field, format))
+ /*
+ * tmgr.atom_max_size=N
+ * Atoms containing more than N blocks will be forced to commit. N is
+ * decimal.
+ */
+ PUSH_SB_FIELD_OPT(tmgr.atom_max_size, "%u");
+ /*
+ * tmgr.atom_max_age=N
+ * Atoms older than N seconds will be forced to commit. N is decimal.
+ */
+ PUSH_SB_FIELD_OPT(tmgr.atom_max_age, "%u");
+ /*
+ * tmgr.atom_min_size=N
+ * In committing an atom to free dirty pages, force the atom less than
+ * N in size to fuse with another one.
+ */
+ PUSH_SB_FIELD_OPT(tmgr.atom_min_size, "%u");
+ /*
+ * tmgr.atom_max_flushers=N
+ * limit of concurrent flushers for one atom. 0 means no limit.
+ */
+ PUSH_SB_FIELD_OPT(tmgr.atom_max_flushers, "%u");
+ /*
+ * tree.cbk_cache_slots=N
+ * Number of slots in the cbk cache.
+ */
+ //PUSH_SB_FIELD_OPT(tree.cbk_cache.nr_slots, "%u");
+ /*
+ * If flush finds more than FLUSH_RELOCATE_THRESHOLD adjacent dirty
+ * leaf-level blocks it will force them to be relocated.
+ */
+ //PUSH_SB_FIELD_OPT(flush.relocate_threshold, "%u");
+ /*
+	 * If flush can find a block allocation closer than at most
+ * FLUSH_RELOCATE_DISTANCE from the preceder it will relocate to that
+ * position.
+ */
+ //PUSH_SB_FIELD_OPT(flush.relocate_distance, "%u");
+ /*
+ * If we have written this much or more blocks before encountering busy
+	 * If we have written this many or more blocks before encountering a
+	 * busy jnode in the flush list, abort flushing, hoping that next time
+	 * we get called this jnode will already be clean and we will save
+	 * some seeks.
+ */
+ //PUSH_SB_FIELD_OPT(flush.written_threshold, "%u");
+ /* The maximum number of nodes to scan left on a level during flush. */
+ //PUSH_SB_FIELD_OPT(flush.scan_maxnodes, "%u");
+ /* preferred IO size */
+ PUSH_SB_FIELD_OPT(optimal_io_size, "%u");
+ /* carry flags used for insertion of new nodes */
+ //PUSH_SB_FIELD_OPT(tree.carry.new_node_flags, "%u");
+ /* carry flags used for insertion of new extents */
+ //PUSH_SB_FIELD_OPT(tree.carry.new_extent_flags, "%u");
+ /* carry flags used for paste operations */
+ //PUSH_SB_FIELD_OPT(tree.carry.paste_flags, "%u");
+ /* carry flags used for insert operations */
+ //PUSH_SB_FIELD_OPT(tree.carry.insert_flags, "%u");
+
+#ifdef CONFIG_REISER4_BADBLOCKS
+ /*
+	 * Alternative master superblock location in case its original
+	 * location is not writable/accessible. This is an offset in BYTES.
+ */
+ PUSH_SB_FIELD_OPT(altsuper, "%lu");
+#endif
+}
+
+/**
+ * reiser4_init_super_data - initialize reiser4 private super block
+ * @super: super block to initialize
+ * @opt_string: list of reiser4 mount options
+ *
+ * Sets various reiser4 parameters to default values. Parses mount options and
+ * overwrites default settings.
+ */
+int reiser4_init_super_data(struct super_block *super, char *opt_string)
+{
+ int result;
+ struct opt_desc *opts, *p;
+ reiser4_super_info_data *sbinfo = get_super_private(super);
+
+ /* initialize super, export, dentry operations */
+ sbinfo->ops.super = reiser4_super_operations;
+ sbinfo->ops.export = reiser4_export_operations;
+ sbinfo->ops.dentry = reiser4_dentry_operations;
+ super->s_op = &sbinfo->ops.super;
+ super->s_export_op = &sbinfo->ops.export;
+
+ /* initialize transaction manager parameters to default values */
+ sbinfo->tmgr.atom_max_size = totalram_pages() / 4;
+ sbinfo->tmgr.atom_max_age = REISER4_ATOM_MAX_AGE / HZ;
+ sbinfo->tmgr.atom_min_size = 256;
+ sbinfo->tmgr.atom_max_flushers = ATOM_MAX_FLUSHERS;
+
+ sbinfo->optimal_io_size = REISER4_OPTIMAL_IO_SIZE;
+
+ /* initialize default readahead params */
+ sbinfo->ra_params.max = totalram_pages() / 4;
+ sbinfo->ra_params.flags = 0;
+
+ /* hard links for directories are not supported */
+ sbinfo->fs_flags |= (1 << REISER4_ADG);
+
+ /* allocate memory for structure describing reiser4 mount options */
+ opts = kmalloc(sizeof(struct opt_desc) * MAX_NR_OPTIONS,
+ reiser4_ctx_gfp_mask_get());
+ if (opts == NULL)
+ return RETERR(-ENOMEM);
+
+ /* initialize structure describing reiser4 mount options */
+ p = opts;
+
+ push_sb_field_opts(&p, opts, sbinfo);
+
+#define PUSH_BIT_OPT(name, bit) \
+	PUSH_OPT(p, opts, BIT_OPT(name, bit))
+
+	/* turn on BSD-style gid assignment */
+	PUSH_BIT_OPT("bsdgroups", REISER4_BSD_GID);
+ /* turn on 32 bit times */
+ PUSH_BIT_OPT("32bittimes", REISER4_32_BIT_TIMES);
+ /*
+ * Don't load all bitmap blocks at mount time, it is useful for
+ * machines with tiny RAM and large disks.
+ */
+ PUSH_BIT_OPT("dont_load_bitmap", REISER4_DONT_LOAD_BITMAP);
+ /* disable transaction commits during write() */
+ PUSH_BIT_OPT("atomic_write", REISER4_ATOMIC_WRITE);
+ /* enable issuing of discard requests */
+ PUSH_BIT_OPT("discard", REISER4_DISCARD);
+ /* disable hole punching at flush time */
+ PUSH_BIT_OPT("dont_punch_holes", REISER4_DONT_PUNCH_HOLES);
+
+ PUSH_OPT(p, opts,
+ {
+ /*
+ * tree traversal readahead parameters:
+		  * -o readahead=MAXNUM:FLAGS
+		  * MAXNUM - max number of nodes to request readahead for: -1UL
+		  * will set it to max_sane_readahead()
+		  * FLAGS - combination of bits: RA_ADJACENT_ONLY, RA_ALL_LEVELS,
+ * CONTINUE_ON_PRESENT
+ */
+ .name = "readahead",
+ .type = OPT_FORMAT,
+ .u = {
+ .f = {
+ .format = "%u:%u",
+ .nr_args = 2,
+ .arg1 = &sbinfo->ra_params.max,
+ .arg2 = &sbinfo->ra_params.flags,
+ .arg3 = NULL,
+ .arg4 = NULL
+ }
+ }
+ }
+ );
+
+ /* What to do in case of fs error */
+ PUSH_OPT(p, opts,
+ {
+ .name = "onerror",
+ .type = OPT_ONEOF,
+ .u = {
+ .oneof = {
+ .result = &sbinfo->onerror,
+ .list = {
+ "remount-ro", "panic", NULL
+ },
+ }
+ }
+ }
+ );
+#if 0
+ /*
+	 * What transaction model (journal, cow, etc.)
+	 * is used to commit transactions
+ */
+ PUSH_OPT(p, opts,
+ {
+ .name = "txmod",
+ .type = OPT_TXMOD,
+ .u = {
+ .txmod = {
+ .result = &sbinfo->txmod
+ }
+ }
+ }
+ );
+#endif
+ /* modify default settings to values set by mount options */
+ result = parse_options(opt_string, opts, p - opts);
+ kfree(opts);
+ if (result != 0)
+ return result;
+
+ /* correct settings to sanity values */
+ sbinfo->tmgr.atom_max_age *= HZ;
+ if (sbinfo->tmgr.atom_max_age <= 0)
+ /* overflow */
+ sbinfo->tmgr.atom_max_age = REISER4_ATOM_MAX_AGE;
+
+ /* round optimal io size up to 512 bytes */
+ sbinfo->optimal_io_size >>= VFS_BLKSIZE_BITS;
+ sbinfo->optimal_io_size <<= VFS_BLKSIZE_BITS;
+ if (sbinfo->optimal_io_size == 0) {
+ warning("nikita-2497", "optimal_io_size is too small");
+ return RETERR(-EINVAL);
+ }
+ return result;
+}
+
+/**
+ * reiser4_read_master - read reiser4 master super block
+ * @super: super block to fill
+ * @silent: if 0, print warnings
+ * @vol_uuid: buffer the volume uuid is copied to
+ *
+ * Reads the reiser4 master super block either from the predefined location or
+ * from the location specified by the altsuper mount option, and sets the
+ * block size on the super block.
+ */
+int reiser4_read_master(struct super_block *super, int silent, u8 *vol_uuid)
+{
+ struct reiser4_volume *vol;
+ struct buffer_head *master_bh;
+ struct reiser4_master_sb *master_sb;
+ reiser4_super_info_data *sbinfo;
+ unsigned long blocksize;
+
+ sbinfo = get_super_private(super);
+
+ read_super_block:
+
+#ifdef CONFIG_REISER4_BADBLOCKS
+ if (sbinfo->altsuper)
+ /*
+ * read reiser4 master super block at position specified by
+ * mount option
+ */
+ master_bh = sb_bread(super,
+ (sector_t)(sbinfo->altsuper / super->s_blocksize));
+ else
+#endif
+ /*
+	 * read reiser4 master super block at the 16th 4096-byte block
+	 * (REISER4_MAGIC_OFFSET)
+ */
+ master_bh = sb_bread(super,
+ (sector_t)(REISER4_MAGIC_OFFSET / super->s_blocksize));
+ if (!master_bh)
+ return RETERR(-EIO);
+
+ master_sb = (struct reiser4_master_sb *)master_bh->b_data;
+ /*
+ * check reiser4 magic string
+ */
+ if (!strncmp(master_sb->magic,
+ REISER4_SUPER_MAGIC_STRING,
+ sizeof(REISER4_SUPER_MAGIC_STRING))) {
+ /*
+ * reiser4 master super block contains filesystem blocksize
+ */
+ blocksize = master_get_block_size(master_sb);
+
+ if (blocksize != PAGE_SIZE) {
+ /*
+			 * currently reiser4's block size must be equal to
+			 * the page size
+ */
+ if (!silent)
+ warning("nikita-2609",
+ "%s: wrong block size %ld\n",
+ super->s_id,
+ blocksize);
+ brelse(master_bh);
+ return RETERR(-EINVAL);
+ }
+ if (blocksize != super->s_blocksize) {
+ /*
+ * filesystem uses different blocksize. Reread master
+ * super block with correct blocksize
+ */
+ brelse(master_bh);
+ if (!sb_set_blocksize(super, (int)blocksize))
+ return RETERR(-EINVAL);
+ goto read_super_block;
+ }
+ /*
+ * there should be a respective registered volume in the system
+ */
+ vol = reiser4_search_volume(master_sb->uuid);
+ if (!vol) {
+ warning("edward-1737",
+ "%s: volume is not registered", super->s_id);
+ goto error;
+ }
+ memcpy(vol_uuid, master_sb->uuid, 16);
+ brelse(master_bh);
+ return 0;
+ }
+ /* there is no reiser4 on the device */
+ if (!silent)
+ warning("nikita-2608",
+ "%s: wrong master super block magic", super->s_id);
+ error:
+ brelse(master_bh);
+ return RETERR(-EINVAL);
+}
+
+static struct {
+ reiser4_plugin_type type;
+ reiser4_plugin_id id;
+} default_plugins[PSET_LAST] = {
+ [PSET_FILE] = {
+ .type = REISER4_FILE_PLUGIN_TYPE,
+ .id = UNIX_FILE_PLUGIN_ID
+ },
+ [PSET_DIR] = {
+ .type = REISER4_DIR_PLUGIN_TYPE,
+ .id = HASHED_DIR_PLUGIN_ID
+ },
+ [PSET_HASH] = {
+ .type = REISER4_HASH_PLUGIN_TYPE,
+ .id = R5_HASH_ID
+ },
+ [PSET_FIBRATION] = {
+ .type = REISER4_FIBRATION_PLUGIN_TYPE,
+ .id = FIBRATION_DOT_O
+ },
+ [PSET_PERM] = {
+ .type = REISER4_PERM_PLUGIN_TYPE,
+ .id = NULL_PERM_ID
+ },
+ [PSET_FORMATTING] = {
+ .type = REISER4_FORMATTING_PLUGIN_TYPE,
+ .id = SMALL_FILE_FORMATTING_ID
+ },
+ [PSET_SD] = {
+ .type = REISER4_ITEM_PLUGIN_TYPE,
+ .id = STATIC_STAT_DATA_ID
+ },
+ [PSET_DIR_ITEM] = {
+ .type = REISER4_ITEM_PLUGIN_TYPE,
+ .id = COMPOUND_DIR_ID
+ },
+ [PSET_CIPHER] = {
+ .type = REISER4_CIPHER_PLUGIN_TYPE,
+ .id = NONE_CIPHER_ID
+ },
+ [PSET_DIGEST] = {
+ .type = REISER4_DIGEST_PLUGIN_TYPE,
+ .id = SHA256_32_DIGEST_ID
+ },
+ [PSET_COMPRESSION] = {
+ .type = REISER4_COMPRESSION_PLUGIN_TYPE,
+ .id = LZO1_COMPRESSION_ID
+ },
+ [PSET_COMPRESSION_MODE] = {
+ .type = REISER4_COMPRESSION_MODE_PLUGIN_TYPE,
+ .id = CONVX_COMPRESSION_MODE_ID
+ },
+ [PSET_CLUSTER] = {
+ .type = REISER4_CLUSTER_PLUGIN_TYPE,
+ .id = CLUSTER_64K_ID
+ },
+ [PSET_CREATE] = {
+ .type = REISER4_FILE_PLUGIN_TYPE,
+ .id = UNIX_FILE_PLUGIN_ID
+ }
+};
+
+/* access to default plugin table */
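+/* e.g. get_default_plugin(PSET_HASH) returns the R5_HASH_ID hash plugin */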
+reiser4_plugin *get_default_plugin(pset_member memb)
+{
+ return plugin_by_id(default_plugins[memb].type,
+ default_plugins[memb].id);
+}
+
+/**
+ * reiser4_init_root_inode - obtain inode of root directory
+ * @super: super block of filesystem
+ *
+ * Obtains the inode of the root directory (reading it from disk) and
+ * initializes its plugin set if it was not initialized.
+ */
+int reiser4_init_root_inode(struct super_block *super)
+{
+ int result = 0;
+ struct inode *inode;
+ reiser4_super_info_data *sbinfo = get_super_private(super);
+ reiser4_subvol *root_subv = get_meta_subvol();
+
+ inode = reiser4_iget(super,
+ root_subv->df_plug->root_dir_key(super),
+ FIND_EXACT, 0);
+ if (IS_ERR(inode))
+ return RETERR(PTR_ERR(inode));
+
+ super->s_root = d_make_root(inode);
+ if (!super->s_root) {
+ return RETERR(-ENOMEM);
+ }
+
+ super->s_root->d_op = &sbinfo->ops.dentry;
+
+ if (!is_inode_loaded(inode)) {
+ pset_member memb;
+ plugin_set *pset;
+
+ pset = reiser4_inode_data(inode)->pset;
+ for (memb = 0; memb < PSET_LAST; ++memb) {
+
+ if (aset_get(pset, memb) != NULL)
+ continue;
+
+ result = grab_plugin_pset(inode, NULL, memb);
+ if (result != 0)
+ break;
+
+ reiser4_inode_clr_flag(inode, REISER4_SDLEN_KNOWN);
+ }
+
+ if (result == 0) {
+ if (REISER4_DEBUG) {
+ for (memb = 0; memb < PSET_LAST; ++memb)
+ assert("nikita-3500",
+ aset_get(pset, memb) != NULL);
+ }
+ } else
+ warning("nikita-3448", "Cannot set plugins of root: %i",
+ result);
+ reiser4_iget_complete(inode);
+ /*
+		 * As the default pset kept in the root dir may have been
+		 * changed (its length is unknown), call update_sd
+ */
+ if (!reiser4_inode_get_flag(inode, REISER4_SDLEN_KNOWN)) {
+ result = reiser4_grab_space(inode_file_plugin(inode)->
+ estimate.update(inode),
+ BA_CAN_COMMIT,
+ root_subv);
+ if (result == 0)
+ result = reiser4_update_sd(inode);
+ all_grabbed2free();
+ }
+ }
+ super->s_maxbytes = MAX_LFS_FILESIZE;
+ return result;
+}
+
+/*
+ * Local variables:
+ * c-indentation-style: "K&R"
+ * mode-name: "LC"
+ * c-basic-offset: 8
+ * tab-width: 8
+ * fill-column: 79
+ * End:
+ */
diff -urN --no-dereference linux-5.10.2.orig/fs/reiser4/init_volume.c linux-5.10.2/fs/reiser4/init_volume.c
--- linux-5.10.2.orig/fs/reiser4/init_volume.c 1970-01-01 01:00:00.000000000 +0100
+++ linux-5.10.2/fs/reiser4/init_volume.c 2020-12-23 16:07:46.117813114 +0100
@@ -0,0 +1,1105 @@
+/*
+ Copyright (c) 2014-2020 Eduard O. Shishkin
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+/* Reiser4 logical volume initialization and activation */
+
+#include "debug.h"
+#include "super.h"
+#include "plugin/item/brick_symbol.h"
+#include "plugin/volume/volume.h"
+#include <linux/blkdev.h>
+
+DEFINE_MUTEX(reiser4_volumes_mutex);
+static LIST_HEAD(reiser4_volumes); /* list of registered volumes */
+
+#define MAX_STRIPE_BITS (63)
+
+/**
+ * Allocate and initialize a volume header.
+ *
+ * @uuid: unique volume ID;
+ * @vol_plug: volume plugin this volume is managed by;
+ * @dist_plug: plugin distributing stripes among bricks;
+ * @stripe_bits: defines size of stripe (minimal unit of distribution).
+ */
+static reiser4_volume *reiser4_alloc_volume(u8 *uuid,
+ volume_plugin *vol_plug,
+ distribution_plugin *dist_plug,
+ int stripe_bits)
+{
+ struct reiser4_volume *vol;
+
+ vol = kzalloc(sizeof(*vol), GFP_KERNEL);
+ if (!vol)
+ return NULL;
+ memcpy(vol->uuid, uuid, 16);
+ vol->vol_plug = vol_plug;
+ vol->dist_plug = dist_plug;
+ vol->stripe_bits = stripe_bits;
+
+ INIT_LIST_HEAD(&vol->list);
+ INIT_LIST_HEAD(&vol->subvols_list);
+ atomic_set(&vol->nr_origins, 0);
+ init_rwsem(&vol->volume_sem);
+ init_rwsem(&vol->brick_removal_sem);
+ return vol;
+}
+
+/**
+ * Allocate and initialize a brick header.
+ *
+ * @uuid: brick's external ID;
+ * @path: path to the brick's block device;
+ * @df_plug: disk format plugin this brick is managed by;
+ * @subvol_id: brick's internal ID;
+ * @mirror_id: 0 if the brick is the original, serial number of the replica
+ * otherwise;
+ * @num_replicas: total number of replicas
+ */
+struct reiser4_subvol *reiser4_alloc_subvol(u8 *uuid,
+ const char *path,
+ disk_format_plugin *df_plug,
+ u64 subvol_id,
+ u16 mirror_id,
+ u16 num_replicas)
+{
+ struct reiser4_subvol *subv;
+
+ subv = kzalloc(sizeof(*subv), GFP_KERNEL);
+ if (!subv)
+ return NULL;
+ memcpy(subv->uuid, uuid, 16);
+
+ INIT_LIST_HEAD(&subv->list);
+ __init_ch_sub(&subv->ch);
+
+ subv->name = kstrdup(path, GFP_KERNEL);
+ if (!subv->name) {
+ kfree(subv);
+ return NULL;
+ }
+ subv->df_plug = df_plug;
+ subv->id = subvol_id;
+ subv->mirror_id = mirror_id;
+ subv->num_replicas = num_replicas;
+ return subv;
+}
+
+/**
+ * Lookup volume by its ID.
+ * Pre-condition: @reiser4_volumes_mutex is down
+ */
+struct reiser4_volume *reiser4_search_volume(u8 *uuid)
+{
+ struct reiser4_volume *vol;
+
+ list_for_each_entry(vol, &reiser4_volumes, list) {
+ if (memcmp(uuid, vol->uuid, 16) == 0)
+ return vol;
+ }
+ return NULL;
+}
+
+/**
+ * Lookup brick by its external ID.
+ * Pre-condition: @reiser4_volumes_mutex is down
+ */
+static reiser4_subvol *reiser4_search_subvol(u8 *uuid,
+ struct list_head *where)
+{
+ reiser4_subvol *sub;
+
+ list_for_each_entry(sub, where, list) {
+ if (memcmp(uuid, sub->uuid, 16) == 0)
+ return sub;
+ }
+ return NULL;
+}
+
+static int check_volume_params(reiser4_volume *vol,
+ volume_plugin *vol_plug,
+ distribution_plugin *dist_plug,
+ int stripe_bits,
+ const char **what_differs)
+{
+ int ret = -EINVAL;
+
+ if (vol->vol_plug != vol_plug)
+ *what_differs = "volume plugins";
+ else if (vol->dist_plug != dist_plug)
+ *what_differs = "distribution plugins";
+ else if (vol->stripe_bits != stripe_bits)
+ *what_differs = "stripe sizes";
+ else
+ ret = 0;
+ return ret;
+}
+
+/**
+ * Register a brick.
+ * Returns:
+ * 0 - first time subvolume is seen
+ * 1 - subvolume already registered
+ * < 0 - error
+ *
+ * Pre-condition: @reiser4_volumes_mutex is down,
+ * all passed volume parameters are valid.
+ */
+static int reiser4_register_subvol(const char *path,
+ u8 *vol_uuid,
+ u8 *sub_uuid,
+ disk_format_plugin *df_plug,
+ volume_plugin *vol_plug,
+ distribution_plugin *dist_plug,
+ u16 mirror_id,
+ u16 num_replicas,
+ int stripe_bits,
+ u64 subvol_id,
+ reiser4_subvol **result,
+ reiser4_volume **vol)
+{
+ const char *what_differs;
+ struct reiser4_subvol *sub;
+
+ assert("edward-1964", vol != NULL);
+
+ *vol = reiser4_search_volume(vol_uuid);
+ if (*vol) {
+ int ret = check_volume_params(*vol,
+ vol_plug,
+ dist_plug,
+ stripe_bits,
+ &what_differs);
+ if (ret) {
+ /*
+ * Found, but not happy.
+			 * Most likely it is because the user specified
+ * wrong options when formatting bricks.
+ */
+ warning("edward-2317",
+ "%s: bricks w/ different %s in the same volume",
+ path, what_differs);
+ return ret;
+ }
+ sub = reiser4_search_subvol(sub_uuid, &(*vol)->subvols_list);
+ if (sub) {
+ if (result)
+ *result = sub;
+ return 1;
+ }
+ sub = reiser4_alloc_subvol(sub_uuid,
+ path,
+ df_plug,
+ subvol_id,
+ mirror_id, num_replicas);
+ if (!sub)
+ return -ENOMEM;
+ } else {
+ *vol = reiser4_alloc_volume(vol_uuid,
+ vol_plug,
+ dist_plug,
+ stripe_bits);
+ if (*vol == NULL)
+ return -ENOMEM;
+ sub = reiser4_alloc_subvol(sub_uuid,
+ path,
+ df_plug,
+ subvol_id,
+ mirror_id, num_replicas);
+ if (!sub) {
+ kfree(*vol);
+ return -ENOMEM;
+ }
+ list_add(&(*vol)->list, &reiser4_volumes);
+ }
+ list_add(&sub->list, &(*vol)->subvols_list);
+ if (result)
+ *result = sub;
+ notice("edward-1932", "brick %s has been registered", path);
+ return 0;
+}
+
+static void reiser4_free_volume(struct reiser4_volume *vol)
+{
+ assert("edward-1741", vol->conf == NULL);
+ kfree(vol);
+}
+
+/**
+ * Retrieve information about a registered volume.
+ * This is a REISER4_SCAN_DEV ioctl handler.
+ */
+int reiser4_volume_header(struct reiser4_vol_op_args *args)
+{
+ int idx = 0;
+ const struct reiser4_volume *vol;
+ const struct reiser4_volume *this = NULL;
+
+ mutex_lock(&reiser4_volumes_mutex);
+
+ list_for_each_entry(vol, &reiser4_volumes, list) {
+ if (idx == args->s.vol_idx) {
+ this = vol;
+ break;
+ }
+ idx ++;
+ }
+ if (!this) {
+ mutex_unlock(&reiser4_volumes_mutex);
+ args->error = -ENOENT;
+ return 0;
+ }
+ memcpy(args->u.vol.id, this->uuid, 16);
+ if (this->conf)
+ args->u.vol.fs_flags |= (1 << REISER4_ACTIVATED_VOL);
+
+ mutex_unlock(&reiser4_volumes_mutex);
+ return 0;
+}
+
+/**
+ * Retrieve information about a registered brick.
+ * This is a REISER4_SCAN_DEV ioctl handler.
+ *
+ * Pre-condition: @args contains uuid of the host volume and
+ * serial number of the brick in the list of volume's bricks.
+ */
+int reiser4_brick_header(struct reiser4_vol_op_args *args)
+{
+ int idx = 0;
+ const reiser4_volume *vol;
+ const reiser4_subvol *subv;
+ const reiser4_subvol *this = NULL;
+
+ mutex_lock(&reiser4_volumes_mutex);
+ vol = reiser4_search_volume(args->u.vol.id);
+ if (!vol) {
+ mutex_unlock(&reiser4_volumes_mutex);
+ args->error = -EINVAL;
+ return 0;
+ }
+ list_for_each_entry(subv, &vol->subvols_list, list) {
+ if (idx == args->s.brick_idx) {
+ this = subv;
+ break;
+ }
+		idx++;
+ }
+ if (!this) {
+ mutex_unlock(&reiser4_volumes_mutex);
+ args->error = -ENOENT;
+ return 0;
+ }
+ memcpy(args->u.brick.ext_id, this->uuid, 16);
+ strncpy(args->d.name, this->name, REISER4_PATH_NAME_MAX + 1);
+ mutex_unlock(&reiser4_volumes_mutex);
+ return 0;
+}
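+
+/*
+ * Illustrative only (sketch): bricks of a registered volume can be
+ * enumerated the same way as volumes above - pass the volume uuid in
+ * u.vol.id and step s.brick_idx from 0 upwards until args->error is
+ * -ENOENT; each call fills u.brick.ext_id and d.name of the brick.
+ */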
+
+/**
+ * Remove @subv from volume's list of registered subvolumes and release it.
+ * Pre-condition: @reiser4_volumes_mutex is down.
+ */
+static void unregister_subvol_locked(struct reiser4_subvol *subv)
+{
+ assert("edward-1742", subv->bdev == NULL);
+ assert("edward-1743", subv->apx == NULL);
+ assert("edward-1744", !subvol_is_set(subv, SUBVOL_ACTIVATED));
+ assert("edward-1745", list_empty_careful(&subv->ch.overwrite_set));
+ assert("edward-1746", list_empty_careful(&subv->ch.tx_list));
+ assert("edward-1747", list_empty_careful(&subv->ch.wander_map));
+
+ notice("edward-2312", "brick %s has been unregistered", subv->name);
+
+ list_del_init(&subv->list);
+ if (subv->name)
+ kfree(subv->name);
+ kfree(subv);
+}
+
+/**
+ * Find a brick in the set of registered bricks, remove it
+ * from the set and release it.
+ *
+ * This is called when removing a brick from a logical volume,
+ * and on error paths.
+ */
+void reiser4_unregister_subvol(struct reiser4_subvol *victim)
+{
+ struct reiser4_volume *vol, *vol_tmp;
+
+ mutex_lock(&reiser4_volumes_mutex);
+
+ list_for_each_entry_safe(vol, vol_tmp, &reiser4_volumes, list) {
+ struct reiser4_subvol *subv, *subv_tmp;
+ list_for_each_entry_safe(subv, subv_tmp,
+ &vol->subvols_list, list) {
+ if (subv == victim) {
+ unregister_subvol_locked(subv);
+ if (list_empty(&vol->subvols_list)) {
+ list_del(&vol->list);
+ reiser4_free_volume(vol);
+ }
+ goto out;
+ }
+ }
+ }
+ out:
+ mutex_unlock(&reiser4_volumes_mutex);
+}
+
+/**
+ * Find a brick in the list of registered bricks by name,
+ * remove it from the list and release it.
+ *
+ * This is a REISER4_SCAN_DEV ioctl handler.
+ */
+int reiser4_unregister_brick(struct reiser4_vol_op_args *args)
+{
+ int ret = 0;
+ struct reiser4_volume *vol, *vol_tmp;
+
+ mutex_lock(&reiser4_volumes_mutex);
+
+ list_for_each_entry_safe(vol, vol_tmp, &reiser4_volumes, list) {
+ struct reiser4_subvol *subv, *subv_tmp;
+ list_for_each_entry_safe(subv, subv_tmp,
+ &vol->subvols_list, list) {
+ if (!strncmp(args->d.name,
+ subv->name, strlen(subv->name))) {
+ if (subvol_is_set(subv, SUBVOL_ACTIVATED)) {
+ warning("edward-2314",
+ "Can not unregister activated brick %s",
+ subv->name);
+ ret = -EINVAL;
+ goto out;
+ }
+ unregister_subvol_locked(subv);
+ if (list_empty(&vol->subvols_list)) {
+ list_del(&vol->list);
+ reiser4_free_volume(vol);
+ }
+ goto out;
+ }
+ }
+ }
+ warning("edward-2313",
+ "Can not find registered brick %s", args->d.name);
+ ret = -EINVAL;
+ out:
+ mutex_unlock(&reiser4_volumes_mutex);
+ return ret;
+}
+
+/*
+ * Called on shutdown
+ */
+void reiser4_unregister_volumes(void)
+{
+ struct reiser4_volume *vol, *vol_tmp;
+ struct reiser4_subvol *sub, *sub_tmp;
+
+ mutex_lock(&reiser4_volumes_mutex);
+
+ list_for_each_entry_safe(vol, vol_tmp, &reiser4_volumes, list) {
+ list_for_each_entry_safe(sub, sub_tmp, &vol->subvols_list, list)
+ unregister_subvol_locked(sub);
+ assert("edward-2328", list_empty(&vol->subvols_list));
+ list_del(&vol->list);
+ reiser4_free_volume(vol);
+ }
+ assert("edward-2329", list_empty(&reiser4_volumes));
+
+ mutex_unlock(&reiser4_volumes_mutex);
+}
+
+/**
+ * read master super-block from disk and make its copy
+ */
+int reiser4_read_master_sb(struct block_device *bdev,
+ struct reiser4_master_sb *copy)
+{
+ struct page *page;
+ struct reiser4_master_sb *master;
+ /*
+ * read master super block
+ */
+ page = read_cache_page_gfp(bdev->bd_inode->i_mapping,
+ REISER4_MAGIC_OFFSET >> PAGE_SHIFT,
+ GFP_KERNEL);
+ if (IS_ERR_OR_NULL(page))
+ return -EINVAL;
+ master = kmap(page);
+ if (strncmp(master->magic,
+ REISER4_SUPER_MAGIC_STRING,
+ sizeof(REISER4_SUPER_MAGIC_STRING))) {
+ /*
+ * there is no reiser4 on the device
+ */
+ kunmap(page);
+ put_page(page);
+ return -EINVAL;
+ }
+ memcpy(copy, master, sizeof(*master));
+ kunmap(page);
+ put_page(page);
+ return 0;
+}
+
+/**
+ * Read master and format super-blocks from the device specified by @path
+ * and check their magics.
+ * If found, then check the parameters of the master and format super-blocks
+ * and try to register the brick associated with this device.
+ * On success store a pointer to the registered subvolume in @result and
+ * return 0. Otherwise return an error.
+ */
+int reiser4_scan_device(const char *path, fmode_t flags, void *holder,
+ reiser4_subvol **result, reiser4_volume **host)
+{
+ int ret;
+ u64 subv_id;
+ struct block_device *bdev;
+ struct reiser4_master_sb master;
+ u16 df_pid, dist_pid, vol_pid;
+ u8 stripe_bits = 0;
+ u16 mirror_id, nr_replicas;
+ disk_format_plugin *df_plug;
+ volume_plugin *vol_plug;
+ distribution_plugin *dist_plug = NULL;
+
+ mutex_lock(&reiser4_volumes_mutex);
+
+ bdev = blkdev_get_by_path(path, flags, holder);
+ if (IS_ERR(bdev)) {
+ ret = PTR_ERR(bdev);
+ goto out;
+ }
+ ret = reiser4_read_master_sb(bdev, &master);
+ if (ret)
+ goto bdev_put;
+
+ ret = -EINVAL;
+ df_pid = master_get_dformat_pid(&master);
+ df_plug = disk_format_plugin_by_unsafe_id(df_pid);
+ if (df_plug == NULL)
+ /* unknown disk format plugin */
+ goto bdev_put;
+
+ vol_pid = master_get_volume_pid(&master);
+ vol_plug = volume_plugin_by_unsafe_id(vol_pid);
+ if (!vol_plug)
+ /* unknown volume plugin */
+ goto bdev_put;
+
+ mirror_id = master_get_mirror_id(&master);
+ nr_replicas = master_get_num_replicas(&master);
+ if (mirror_id > nr_replicas) {
+ warning("edward-1739",
+ "%s: mirror id (%u) larger than number of replicas (%u)",
+ path, mirror_id, nr_replicas);
+ goto bdev_put;
+ }
+
+ dist_pid = master_get_distrib_pid(&master);
+ dist_plug = distribution_plugin_by_unsafe_id(dist_pid);
+ if (!dist_plug)
+ /* unknown distribution plugin */
+ goto bdev_put;
+
+ stripe_bits = master_get_stripe_bits(&master);
+	if (stripe_bits != 0 &&
+	    (stripe_bits < PAGE_SHIFT ||
+	     stripe_bits > MAX_STRIPE_BITS)) {
+		warning("edward-1814",
+			"bad stripe_bits value (%d)", stripe_bits);
+ goto bdev_put;
+ }
+ /*
+ * Now retrieve subvolume's internal ID from format super-block.
+	 * It is safe to do this before activating the subvolume, because
+	 * the internal ID never changes during the subvolume's life.
+	 * Thus, the format super-block always contains the current
+	 * internal ID (even before transaction replay).
+ */
+ ret = df_plug->extract_subvol_id(bdev, &subv_id);
+ if (ret)
+ goto bdev_put;
+ ret = reiser4_register_subvol(path,
+ master.uuid,
+ master.sub_uuid,
+ df_plug,
+ vol_plug,
+ dist_plug,
+ mirror_id,
+ nr_replicas,
+ stripe_bits,
+ subv_id,
+ result, host);
+ if (ret > 0)
+ /* ok, it was registered earlier */
+ ret = 0;
+ bdev_put:
+ blkdev_put(bdev, flags);
+ out:
+ mutex_unlock(&reiser4_volumes_mutex);
+ return ret;
+}
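+
+/*
+ * Sketch of the brick life cycle as implied by this file (an assumption
+ * based only on the functions defined here, not on their callers):
+ *
+ *   registration:  reiser4_scan_device() -> reiser4_register_subvol()
+ *   activation:    reiser4_activate_volume() -> reiser4_activate_subvol()
+ *   deactivation:  reiser4_deactivate_volume() -> reiser4_deactivate_subvol()
+ *   tear-down:     reiser4_unregister_brick() / reiser4_unregister_volumes()
+ */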
+
+/**
+ * Make sure that all replicas of the original subvolume
+ * @subv have been activated.
+ *
+ * Pre-condition: the disk format super-block of the subvolume
+ * has been found.
+ */
+int check_active_replicas(reiser4_subvol *subv)
+{
+ u32 repl_id;
+ lv_conf *conf;
+
+ assert("edward-2235", subv->super != NULL);
+ assert("edward-1748", !is_replica(subv));
+ assert("edward-1751", super_volume(subv->super) != NULL);
+
+ conf = super_conf(subv->super);
+
+ if (has_replicas(subv) &&
+ (conf == NULL || conf_mslot_at(conf, subv->id) == NULL)) {
+
+ warning("edward-1750",
+			"%s requires replicas, which are not registered.",
+ subv->name);
+ return -EINVAL;
+ }
+
+ __for_each_replica(subv, repl_id) {
+ reiser4_subvol *repl;
+
+ repl = super_mirror(subv->super, subv->id, repl_id);
+ if (repl == NULL) {
+ warning("edward-1752",
+ "%s: replica #%u is not registered.",
+ subv->name, repl_id);
+ return -EINVAL;
+ }
+ assert("edward-1965",
+ subvol_is_set(repl, SUBVOL_ACTIVATED));
+ }
+ return 0;
+}
+
+static void clear_subvol(reiser4_subvol *subv)
+{
+ subv->bdev = NULL;
+ subv->super = NULL;
+ subv->mode = 0;
+ subv->flags = 0;
+ subv->txmod = 0;
+}
+
+/*
+ * Initialize disk format 4.X.Y for a subvolume
+ * Pre-condition: subvolume @sub is registered
+ */
+int reiser4_activate_subvol(struct super_block *super,
+ reiser4_subvol *subv)
+{
+ int ret;
+ fmode_t mode = FMODE_READ | FMODE_EXCL;
+ reiser4_volume *vol = super_volume(super);
+
+ assert("edward-2309", vol != NULL);
+ assert("edward-2301", !subvol_is_set(subv, SUBVOL_ACTIVATED));
+
+ if (!(super->s_flags & SB_RDONLY))
+ mode |= FMODE_WRITE;
+
+ subv->bdev = blkdev_get_by_path(subv->name,
+ mode, get_reiser4_fs_type());
+ if (IS_ERR(subv->bdev))
+ return PTR_ERR(subv->bdev);
+
+ subv->mode = mode;
+ subv->super = super;
+
+ if (blk_queue_nonrot(bdev_get_queue(subv->bdev))) {
+ /*
+ * Solid state drive has been detected.
+ * Set Write-Anywhere transaction model
+ * for this subvolume
+ */
+ subv->flags |= (1 << SUBVOL_IS_NONROT_DEVICE);
+ subv->txmod = WA_TXMOD_ID;
+ }
+ if (is_replica(subv)) {
+ if (!vol->conf) {
+ /*
+ * This is a replica of meta-data brick.
+ * Allocate temporary config needed to
+ * activate original meta-data brick.
+ * This temporary config will be replaced
+ * with an actual one after replaying
+ * transactions on the meta-data brick
+ * (see read_check_volume_params()).
+ */
+ assert("edward-2310",
+ subv->id == METADATA_SUBVOL_ID);
+ vol->conf = alloc_lv_conf(1 /* one mslot */);
+ if (!vol->conf)
+ return -ENOMEM;
+ }
+ /*
+ * Nothing to do any more for replicas!
+ * Particularly, we are not entitled to
+ * replay journal on replicas (only on
+ * original bricks - it will also update
+ * replica blocks properly).
+ */
+ goto ok;
+ }
+ /*
+ * This is an original subvolume.
+ * Before calling ->init_format() make sure that
+ * all its replicas were activated.
+ */
+ ret = check_active_replicas(subv);
+ if (ret)
+ goto error;
+ ret = subv->df_plug->init_format(super, subv);
+ if (ret)
+ goto error;
+ ok:
+ printk("reiser4: brick %s activated\n", subv->name);
+ subv->flags |= (1 << SUBVOL_ACTIVATED);
+ return 0;
+ error:
+ blkdev_put(subv->bdev, subv->mode);
+ clear_subvol(subv);
+ return ret;
+}
+
+mirror_t *alloc_mslot(u32 nr_mirrors)
+{
+ return kzalloc(nr_mirrors * sizeof(mirror_t), GFP_KERNEL);
+}
+
+void free_mslot(slot_t slot)
+{
+ assert("edward-2229", slot != NULL);
+ kfree(slot);
+}
+
+lv_conf *alloc_lv_conf(u32 nr_mslots)
+{
+ lv_conf *ret;
+
+ ret = kzalloc(sizeof(*ret) + nr_mslots * sizeof(slot_t),
+ GFP_KERNEL);
+ if (ret)
+ ret->nr_mslots = nr_mslots;
+ return ret;
+}
+
+void free_lv_conf(lv_conf *conf)
+{
+ if (conf == NULL)
+ return;
+ if (conf->tab)
+ current_dist_plug()->r.free(conf->tab);
+ kfree(conf);
+}
+
+void free_mslot_at(lv_conf *conf, u64 idx)
+{
+ assert("edward-2231", conf != NULL);
+ assert("edward-2190", conf->mslots[idx] != NULL);
+
+ free_mslot(conf->mslots[idx]);
+ conf->mslots[idx] = NULL;
+}
+
+void release_lv_conf(reiser4_volume *vol, lv_conf *conf)
+{
+ u32 i;
+
+ assert("edward-2263", vol->conf == conf);
+
+ if (!conf)
+ return;
+ /*
+ * release distribution table
+ */
+ if (vol->dist_plug->r.done)
+ vol->dist_plug->r.done(&conf->tab);
+
+ assert("edward-2264", conf->tab == NULL);
+ /*
+ * release content of mslots
+ */
+ for (i = 0; i < conf->nr_mslots; i++)
+ if (conf->mslots[i])
+ free_mslot_at(conf, i);
+ free_lv_conf(conf);
+}
+
+/**
+ * Deactivate subvolume. Called during umount, or in error paths
+ */
+void reiser4_deactivate_subvol(struct super_block *super, reiser4_subvol *subv)
+{
+ assert("edward-1755", subvol_is_set(subv, SUBVOL_ACTIVATED));
+ assert("edward-1756", subv->bdev != NULL);
+ assert("edward-1757", subv->super != NULL);
+
+ if (!is_replica(subv)) {
+ subvol_check_block_counters(subv);
+ subv->df_plug->release_format(super, subv);
+ }
+ assert("edward-1758", list_empty_careful(&subv->ch.overwrite_set));
+ assert("edward-1759", list_empty_careful(&subv->ch.tx_list));
+ assert("edward-1760", list_empty_careful(&subv->ch.wander_map));
+
+ blkdev_put(subv->bdev, subv->mode);
+ clear_subvol(subv);
+ clear_bit(SUBVOL_ACTIVATED, &subv->flags);
+ printk("reiser4: brick %s deactivated\n", subv->name);
+}
+
+static void deactivate_subvolumes_cond(struct super_block *super,
+ int(*cond)(reiser4_subvol *))
+{
+ struct reiser4_subvol *subv;
+ reiser4_volume *vol = get_super_private(super)->vol;
+
+ list_for_each_entry(subv, &vol->subvols_list, list) {
+ if (!subvol_is_set(subv, SUBVOL_ACTIVATED)) {
+ /*
+ * subvolume is not active
+ */
+ assert("edward-1761", subv->super == NULL);
+ continue;
+ }
+ if (!cond(subv))
+ continue;
+ reiser4_deactivate_subvol(super, subv);
+ }
+}
+
+/**
+ * First we deactivate all non-replicas, as we need to have
+ * a complete set of active replicas for journal replay when
+ * deactivating original subvolumes.
+ */
+void __reiser4_deactivate_volume(struct super_block *super)
+{
+ int ret;
+ reiser4_subvol *subv;
+ reiser4_volume *vol = super_volume(super);
+ lv_conf *conf = vol->conf;
+
+ if (reiser4_volume_is_activated(super) && !sb_rdonly(super)) {
+ u32 orig_id;
+ for_each_mslot(conf, orig_id) {
+ if (!conf->mslots[orig_id])
+ continue;
+ subv = conf_origin(conf, orig_id);
+ if (!subvol_is_set(subv, SUBVOL_IS_ORPHAN)) {
+ ret = capture_brick_super(subv);
+ if (ret != 0)
+ warning("vs-898",
+ "Failed to capture superblock (%d)",
+ ret);
+ }
+ }
+ ret = txnmgr_force_commit_all(super, 1);
+ if (ret != 0)
+ warning("jmacd-74438",
+ "txn_force failed: %d", ret);
+ all_grabbed2free();
+ }
+ if (vol->vol_plug->done_volume)
+ vol->vol_plug->done_volume(vol);
+
+ deactivate_subvolumes_cond(super, is_origin);
+ deactivate_subvolumes_cond(super, is_replica);
+
+ if (vol->new_conf) {
+ assert("edward-2254",
+ reiser4_volume_has_incomplete_removal(super));
+ assert("edward-2255",
+ vol->new_conf->tab == vol->conf->tab);
+
+ vol->new_conf->tab = NULL;
+ free_lv_conf(vol->new_conf);
+ vol->new_conf = NULL;
+ }
+ vol->victim = NULL;
+
+ release_lv_conf(vol, vol->conf);
+ vol->conf = NULL;
+ vol->num_sgs_bits = 0;
+ atomic_set(&vol->nr_origins, 0);
+
+ assert("edward-2302", !list_empty(&vol->subvols_list));
+
+ list_for_each_entry(subv, &vol->subvols_list, list) {
+ assert("edward-1763", !subvol_is_set(subv, SUBVOL_ACTIVATED));
+ assert("edward-1764", subv->super == NULL);
+ assert("edward-1765", subv->bdev == NULL);
+ assert("edward-1766", subv->mode == 0);
+ }
+}
+
+/**
+ * Deactivate volume. Called during umount, or in error paths
+ */
+void reiser4_deactivate_volume(struct super_block *super)
+{
+ mutex_lock(&reiser4_volumes_mutex);
+ __reiser4_deactivate_volume(super);
+ mutex_unlock(&reiser4_volumes_mutex);
+}
+
+/**
+ * Set a pointer to activated subvolume @subv (original, or
+ * replica) at the respective slot in the table of activated
+ * subvolumes of logical volume @vol. Allocate column of the
+ * table, if needed.
+ */
+static int set_activated_subvol(reiser4_volume *vol, reiser4_subvol *subv)
+{
+ int ret = 0;
+ u64 orig_id = subv->id;
+ u16 mirr_id = subv->mirror_id;
+ lv_conf *conf = vol->conf;
+
+ assert("edward-2232", conf != NULL);
+
+ if (conf->mslots[orig_id] == NULL) {
+ /*
+ * slot is "empty". Allocate a "column" -
+ * array of pointers to mirrors
+ */
+ conf->mslots[orig_id] = alloc_mslot(1 + subv->num_replicas);
+ if (conf->mslots[orig_id] == NULL) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ }
+ if (conf->mslots[orig_id][mirr_id] != NULL) {
+ warning("edward-1767",
+ "wrong set of registered bricks: "
+ "%s and %s have identical mirror IDs (%llu,%u)",
+ conf->mslots[orig_id][mirr_id]->name,
+ subv->name,
+ orig_id, mirr_id);
+ ret = -EINVAL;
+ goto out;
+ }
+ conf->mslots[orig_id][mirr_id] = subv;
+ out:
+ return ret;
+}
+
+/**
+ * Activate all subvolumes of type specified by @cond
+ */
+static int activate_subvolumes_cond(struct super_block *super, u8 *vol_uuid,
+ int(*cond)(reiser4_subvol *))
+{
+ int ret;
+ struct reiser4_volume *vol;
+ struct reiser4_subvol *subv;
+ reiser4_super_info_data *info;
+
+ vol = reiser4_search_volume(vol_uuid);
+ if (!vol)
+ return -EINVAL;
+ info = get_super_private(super);
+ info->vol = vol;
+
+ list_for_each_entry(subv, &vol->subvols_list, list) {
+ if (list_empty_careful(&vol->subvols_list))
+ return 0;
+ if (!cond(subv))
+ continue;
+ if (subvol_is_set(subv, SUBVOL_ACTIVATED))
+ continue;
+ ret = reiser4_activate_subvol(super, subv);
+ if (ret)
+ return ret;
+ assert("edward-1769", subvol_is_set(subv, SUBVOL_ACTIVATED));
+
+ ret = set_activated_subvol(vol, subv);
+ if (ret)
+ return ret;
+ }
+ return 0;
+}
+
+static int is_meta_origin(reiser4_subvol *subv)
+{
+ return is_meta_brick_id(subv->id) && is_origin(subv);
+}
+
+static int is_meta_replica(reiser4_subvol *subv)
+{
+ return is_meta_brick_id(subv->id) && is_replica(subv);
+}
+
+static int volume_version_update(struct super_block *super)
+{
+ int ret = 0;
+ u32 orig_id;
+ reiser4_volume *vol = get_super_private(super)->vol;
+ lv_conf *conf = vol->conf;
+ int nr_to_update = 0;
+
+ for_each_mslot(conf, orig_id) {
+ reiser4_subvol *subv;
+ if (!conf->mslots[orig_id])
+ continue;
+ subv = conf_origin(conf, orig_id);
+ ret = subv->df_plug->version_update(super, subv);
+ if (ret < 0)
+ return ret;
+ nr_to_update += ret;
+ }
+ if (!nr_to_update)
+ return 0;
+ /* force write_logs immediately */
+ return force_commit_current_atom();
+}
+
+/**
+ * Activate all subvolumes (components) of asymmetric logical
+ * volume in a particular order.
+ * Handle all cases of incomplete registration (when not all
+ * components were registered in the system).
+ *
+ * @super: super-block associated with the logical volume;
+ * @vol_uuid: uuid of the logical volume.
+ */
+int reiser4_activate_volume(struct super_block *super, u8 *vol_uuid)
+{
+ int ret;
+ u32 orig_id;
+ u32 nr_origins = 0;
+ reiser4_volume *vol;
+ lv_conf *conf;
+
+ mutex_lock(&reiser4_volumes_mutex);
+ /*
+ * Order of activation (don't change it).
+ *
+ * Before activating an original brick we need to activate
+	 * all its replicas, because activation of an original brick is
+	 * followed by journal replay, and for every IO submitted
+	 * for the original brick we always immediately submit IOs
+	 * for all its replicas. In contrast to original bricks,
+ * replicas are activated without journal replay.
+ *
+ * Besides, we need to start from the replica of meta-data
+ * brick, which contains system information needed to activate
+ * other (data) bricks.
+ */
+ ret = activate_subvolumes_cond(super, vol_uuid, is_meta_replica);
+ if (ret)
+ goto deactivate;
+ ret = activate_subvolumes_cond(super, vol_uuid, is_meta_origin);
+ if (ret)
+ goto deactivate;
+ ret = activate_subvolumes_cond(super, vol_uuid, is_replica);
+ if (ret)
+ goto deactivate;
+ ret = activate_subvolumes_cond(super, vol_uuid, is_origin);
+ if (ret)
+ goto deactivate;
+ /*
+ * At this point all activated original bricks have complete
+ * sets of active replicas - it is guaranteed by calling
+ * check_active_replicas() when activating an original brick.
+ * Now make sure that the set of original bricks is complete.
+ */
+ vol = get_super_private(super)->vol;
+ assert("edward-2207", vol != NULL);
+
+ conf = vol->conf;
+ for_each_mslot(conf, orig_id) {
+ if (conf_mslot_at(conf, orig_id) && conf_origin(conf, orig_id)){
+ assert("edward-1773",
+ subvol_is_set(conf_origin(conf, orig_id),
+ SUBVOL_ACTIVATED));
+			nr_origins++;
+ }
+ }
+ if (nr_origins != atomic_read(&vol->nr_origins)) {
+ warning("edward-1772",
+ "%s: wrong set of registered bricks (found %u, expected %u)",
+ super->s_id, nr_origins, atomic_read(&vol->nr_origins));
+ ret = -EINVAL;
+ goto deactivate;
+ }
+ /*
+ * Identify activated subvolumes
+ */
+ if (!get_meta_subvol()) {
+ warning("edward-2298",
+ "%s: meta-data brick is not registered", super->s_id);
+ ret = -EINVAL;
+ goto deactivate;
+ }
+ for_each_data_mslot(conf, orig_id) {
+ reiser4_subvol *subv;
+ if (!conf_mslot_at(conf, orig_id) ||
+ !conf_origin(conf, orig_id))
+ continue;
+ subv = conf_origin(conf, orig_id);
+ if (!brick_identify(subv)) {
+ warning("edward-2299",
+ "%s: Brick %s doesn't match logical volume.",
+ super->s_id, subv->name);
+ ret = -EINVAL;
+ goto deactivate;
+ }
+ }
+ /*
+ * initialize logical volume after activating all subvolumes
+ */
+ if (vol->vol_plug->init_volume != NULL) {
+ ret = vol->vol_plug->init_volume(super, vol);
+ if (ret) {
+ warning("edward-1770",
+ "(%s): failed to init logical volume (%d)\n",
+ super->s_id, ret);
+ goto deactivate;
+ }
+ }
+ ret = volume_version_update(super);
+ if (ret)
+ goto deactivate;
+ reiser4_volume_set_activated(super);
+ goto out;
+ deactivate:
+ __reiser4_deactivate_volume(super);
+ out:
+ mutex_unlock(&reiser4_volumes_mutex);
+ return ret;
+}
+
+/*
+ Local variables:
+ c-indentation-style: "K&R"
+ mode-name: "LC"
+ c-basic-offset: 8
+ tab-width: 8
+ fill-column: 80
+ scroll-step: 1
+ End:
+*/
diff -urN --no-dereference linux-5.10.2.orig/fs/reiser4/inode.c linux-5.10.2/fs/reiser4/inode.c
--- linux-5.10.2.orig/fs/reiser4/inode.c 1970-01-01 01:00:00.000000000 +0100
+++ linux-5.10.2/fs/reiser4/inode.c 2020-12-23 16:07:46.118813129 +0100
@@ -0,0 +1,734 @@
+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
+ reiser4/README */
+
+/* Inode specific operations. */
+
+#include "forward.h"
+#include "debug.h"
+#include "key.h"
+#include "kassign.h"
+#include "coord.h"
+#include "seal.h"
+#include "dscale.h"
+#include "plugin/item/item.h"
+#include "plugin/security/perm.h"
+#include "plugin/plugin.h"
+#include "plugin/object.h"
+#include "znode.h"
+#include "vfs_ops.h"
+#include "inode.h"
+#include "super.h"
+#include "reiser4.h"
+
+#include <linux/fs.h>
+
+/* return reiser4-specific inode flags */
+static inline unsigned long *inode_flags(const struct inode *const inode)
+{
+ assert("nikita-2842", inode != NULL);
+ return &reiser4_inode_data(inode)->flags;
+}
+
+/* set reiser4-specific flag @f in @inode */
+void reiser4_inode_set_flag(struct inode *inode, reiser4_file_plugin_flags f)
+{
+ assert("nikita-2248", inode != NULL);
+ set_bit((int)f, inode_flags(inode));
+}
+
+/* clear reiser4-specific flag @f in @inode */
+void reiser4_inode_clr_flag(struct inode *inode, reiser4_file_plugin_flags f)
+{
+ assert("nikita-2250", inode != NULL);
+ clear_bit((int)f, inode_flags(inode));
+}
+
+/* true if reiser4-specific flag @f is set in @inode */
+int reiser4_inode_get_flag(const struct inode *inode,
+ reiser4_file_plugin_flags f)
+{
+ assert("nikita-2251", inode != NULL);
+ return test_bit((int)f, inode_flags(inode));
+}
+
+/* convert oid to inode number */
+ino_t oid_to_ino(oid_t oid)
+{
+ return (ino_t) oid;
+}
+
+/* convert oid to user visible inode number */
+ino_t oid_to_uino(oid_t oid)
+{
+ /* reiser4 object is uniquely identified by oid which is 64 bit
+ quantity. Kernel in-memory inode is indexed (in the hash table) by
+ 32 bit i_ino field, but this is not a problem, because there is a
+ way to further distinguish inodes with identical inode numbers
+ (find_actor supplied to iget()).
+
+ But user space expects unique 32 bit inode number. Obviously this
+ is impossible. Work-around is to somehow hash oid into user visible
+ inode number.
+ */
+ oid_t max_ino = (ino_t) ~0;
+
+ if (REISER4_INO_IS_OID || (oid <= max_ino))
+ return oid;
+ else
+ /* this is remotely similar to algorithm used to find next pid
+ to use for process: after wrap-around start from some
+ offset rather than from 0. Idea is that there are some long
+ living objects with which we don't want to collide.
+ */
+ return REISER4_UINO_SHIFT + ((oid - max_ino) & (max_ino >> 1));
+}
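+
+/*
+ * Worked example (illustrative only), assuming a 32-bit ino_t: max_ino is
+ * 0xffffffff, so an oid equal to max_ino + 5 maps to the user visible
+ * number REISER4_UINO_SHIFT + (5 & 0x7fffffff) = REISER4_UINO_SHIFT + 5,
+ * while any oid <= max_ino is returned unchanged.
+ */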
+
+/* check that "inode" is on reiser4 file-system */
+int is_reiser4_inode(const struct inode *inode/* inode queried */)
+{
+ return inode != NULL && is_reiser4_super(inode->i_sb);
+}
+
+/* Maximal length of a name that can be stored in directory @inode.
+
+   This is used in checks during file creation and lookup. */
+int reiser4_max_filename_len(const struct inode *inode/* inode queried */)
+{
+ assert("nikita-287", is_reiser4_inode(inode));
+ assert("nikita-1710", inode_dir_item_plugin(inode));
+ if (inode_dir_item_plugin(inode)->s.dir.max_name_len)
+ return inode_dir_item_plugin(inode)->s.dir.max_name_len(inode);
+ else
+ return 255;
+}
+
+#if REISER4_USE_COLLISION_LIMIT
+/* Maximal number of hash collisions for this directory. */
+int max_hash_collisions(const struct inode *dir/* inode queried */)
+{
+ assert("nikita-1711", dir != NULL);
+ return reiser4_inode_data(dir)->plugin.max_collisions;
+}
+#endif /* REISER4_USE_COLLISION_LIMIT */
+
+/* Install file, inode, and address_space operation on @inode, depending on
+ its mode. */
+int setup_inode_ops(struct inode *inode /* inode to initialize */ ,
+ reiser4_object_create_data * data /* parameters to create
+ * object */ )
+{
+ file_plugin *fplug;
+ dir_plugin *dplug;
+
+ fplug = inode_file_plugin(inode);
+ dplug = inode_dir_plugin(inode);
+
+ switch (inode->i_mode & S_IFMT) {
+ case S_IFSOCK:
+ case S_IFBLK:
+ case S_IFCHR:
+ case S_IFIFO:
+ {
+ dev_t rdev; /* to keep gcc happy */
+
+ assert("vs-46", fplug != NULL);
+ /* ugly hack with rdev */
+ if (data == NULL) {
+ rdev = inode->i_rdev;
+ inode->i_rdev = 0;
+ } else
+ rdev = data->rdev;
+ inode->i_blocks = 0;
+ assert("vs-42", fplug->h.id == SPECIAL_FILE_PLUGIN_ID);
+ inode->i_op = file_plugins[fplug->h.id].inode_ops;
+ /* initialize inode->i_fop and inode->i_rdev for block
+ and char devices */
+ init_special_inode(inode, inode->i_mode, rdev);
+ /* all address space operations are null */
+ inode->i_mapping->a_ops =
+ file_plugins[fplug->h.id].as_ops;
+ break;
+ }
+ case S_IFLNK:
+ assert("vs-46", fplug != NULL);
+ assert("vs-42", fplug->h.id == SYMLINK_FILE_PLUGIN_ID);
+ inode->i_op = file_plugins[fplug->h.id].inode_ops;
+ inode->i_fop = NULL;
+ /* all address space operations are null */
+ inode->i_mapping->a_ops = file_plugins[fplug->h.id].as_ops;
+ break;
+ case S_IFDIR:
+ assert("vs-46", dplug != NULL);
+ assert("vs-43", (dplug->h.id == HASHED_DIR_PLUGIN_ID ||
+ dplug->h.id == SEEKABLE_HASHED_DIR_PLUGIN_ID));
+ inode->i_op = dir_plugins[dplug->h.id].inode_ops;
+ inode->i_fop = dir_plugins[dplug->h.id].file_ops;
+ inode->i_mapping->a_ops = dir_plugins[dplug->h.id].as_ops;
+ break;
+ case S_IFREG:
+ assert("vs-46", fplug != NULL);
+ assert("vs-43",
+ fplug->h.id == UNIX_FILE_PLUGIN_ID ||
+ fplug->h.id == CRYPTCOMPRESS_FILE_PLUGIN_ID ||
+ fplug->h.id == STRIPED_FILE_PLUGIN_ID);
+ inode->i_op = file_plugins[fplug->h.id].inode_ops;
+ inode->i_fop = file_plugins[fplug->h.id].file_ops;
+ inode->i_mapping->a_ops = file_plugins[fplug->h.id].as_ops;
+ break;
+ default:
+ warning("nikita-291", "wrong file mode: %o for %llu",
+ inode->i_mode,
+ (unsigned long long)get_inode_oid(inode));
+ reiser4_make_bad_inode(inode);
+ return RETERR(-EINVAL);
+ }
+ return 0;
+}
+
+/* Initialize inode from disk data. Called with inode locked.
+ Return inode locked. */
+static int init_inode(struct inode *inode /* inode to initialise */ ,
+ coord_t *coord/* coord of stat data */)
+{
+ int result;
+ item_plugin *iplug;
+ void *body;
+ int length;
+ reiser4_inode *state;
+
+ assert("nikita-292", coord != NULL);
+ assert("nikita-293", inode != NULL);
+
+ coord_clear_iplug(coord);
+ result = zload(coord->node);
+ if (result)
+ return result;
+ iplug = item_plugin_by_coord(coord);
+ body = item_body_by_coord(coord);
+ length = item_length_by_coord(coord);
+
+ assert("nikita-295", iplug != NULL);
+ assert("nikita-296", body != NULL);
+ assert("nikita-297", length > 0);
+
+ /* inode is under I_LOCK now */
+
+ state = reiser4_inode_data(inode);
+ /* call stat-data plugin method to load sd content into inode */
+ result = iplug->s.sd.init_inode(inode, body, length);
+ set_plugin(&state->pset, PSET_SD, item_plugin_to_plugin(iplug));
+ if (result == 0) {
+ result = setup_inode_ops(inode, NULL);
+ if (result == 0 && inode->i_sb->s_root &&
+ inode->i_sb->s_root->d_inode)
+ result = finish_pset(inode);
+ }
+ zrelse(coord->node);
+ return result;
+}
+
+/**
+ * Read @inode from the disk.
+ * This is what was previously in reiserfs_read_inode2().
+ * Must be called with inode locked. Return inode still locked.
+ *
+ * @key: key of stat-data
+ * @bias: lookup bias -
+ * FIND_EXACT, if the key is precise,
+ * FIND_MAX_NOT_MORE_THAN, if we don't know ordering
+ * component of the key
+ */
+static int read_inode(struct inode *inode, const reiser4_key *key,
+ int bias, int silent)
+{
+ int ret;
+ lock_handle lh;
+ reiser4_inode *info;
+ coord_t coord;
+
+ assert("nikita-298", inode != NULL);
+ assert("nikita-1945", !is_inode_loaded(inode));
+
+ info = reiser4_inode_data(inode);
+ assert("nikita-300", info->locality_id != 0);
+
+ coord_init_zero(&coord);
+ init_lh(&lh);
+ /*
+ * locate stat-data in a tree and return znode locked
+ */
+ ret = lookup_sd(inode, ZNODE_READ_LOCK, &coord, &lh,
+ key, bias, silent);
+ assert("nikita-301", !is_inode_loaded(inode));
+ if (ret)
+ goto error;
+
+ if (bias == FIND_MAX_NOT_MORE_THAN) {
+ reiser4_key ikey;
+ /*
+ * check found coord -
+ * make sure that key of found item coincides
+ * with @key in all components except ordering
+ */
+ ret = zload(coord.node);
+ if (ret)
+ goto error;
+ unit_key_by_coord(&coord, &ikey);
+ zrelse(coord.node);
+
+ set_key_ordering((reiser4_key *)key, get_key_ordering(&ikey));
+
+ if (!keyeq(&ikey, key)) {
+ /* Stat-data killed by concurrent unlink */
+#if REISER4_DEBUG
+ warning("edward-2134",
+ "inode %llu: stat-data not found by extent",
+ (unsigned long long)get_inode_oid(inode));
+ reiser4_print_key("found", &ikey);
+#endif
+ ret = -ENOENT;
+ goto error;
+ }
+ }
+ /*
+ * load stat-data extensions into inode
+ */
+ ret = init_inode(inode, &coord);
+ if (ret)
+ goto error;
+
+ spin_lock_inode(inode);
+ reiser4_seal_init(&info->sd_seal, &coord, key);
+ info->sd_coord = coord;
+ spin_unlock_inode(inode);
+ /*
+ * call file plugin's method to initialize plugin
+ * specific part of inode
+ */
+ if (inode_file_plugin(inode)->init_inode_data)
+ inode_file_plugin(inode)->init_inode_data(inode, NULL, key, 0);
+ /*
+ * load detached directory cursors for
+ * stateless directory readers (NFS)
+ */
+ reiser4_load_cursors(inode);
+
+ /* check the inode for consistency */
+ ret = get_meta_subvol()->df_plug->check_open(inode);
+ /*
+ * lookup_sd() doesn't release coord because we want znode
+ * stay read-locked while stat-data fields are accessed in
+ * init_inode()
+ */
+ done_lh(&lh);
+ return 0;
+ error:
+ done_lh(&lh);
+ reiser4_make_bad_inode(inode);
+ return ret;
+}
+
+/* initialise new reiser4 inode being inserted into hash table. */
+static int init_locked_inode(struct inode *inode /* new inode */ ,
+ void *opaque /* key of stat data passed to
+ * the iget5_locked as cookie */)
+{
+ reiser4_key *key;
+
+ assert("nikita-1995", inode != NULL);
+ assert("nikita-1996", opaque != NULL);
+ key = opaque;
+ set_inode_oid(inode, get_key_objectid(key));
+ reiser4_inode_data(inode)->locality_id = get_key_locality(key);
+ return 0;
+}
+
+/* reiser4_inode_find_actor() - "find actor" supplied by reiser4 to
+ iget5_locked().
+
+ This function is called by iget5_locked() to distinguish reiser4 inodes
+ having the same inode numbers. Such inodes can only exist due to some error
+ condition. One of them should be bad. Inodes with identical inode numbers
+ (objectids) are distinguished by their packing locality.
+
+*/
+static int reiser4_inode_find_actor(struct inode *inode /* inode from hash table
+ * to check */ ,
+ void *opaque /* "cookie" passed to
+ * iget5_locked(). This
+ * is stat-data key */)
+{
+ reiser4_key *key;
+
+ key = opaque;
+ return
+ /* oid is unique, so first term is enough, actually. */
+ get_inode_oid(inode) == get_key_objectid(key) &&
+ /*
+ * also, locality should be checked, but locality is stored in
+ * the reiser4-specific part of the inode, and actor can be
+ * called against arbitrary inode that happened to be in this
+ * hash chain. Hence we first have to check that this is
+ * reiser4 inode at least. is_reiser4_inode() is probably too
+ * early to call, as inode may have ->i_op not yet
+ * initialised.
+ */
+ is_reiser4_super(inode->i_sb) &&
+ /*
+ * usually objectid is unique, but pseudo files use counter to
+ * generate objectid. All pseudo files are placed into special
+ * (otherwise unused) locality.
+ */
+ reiser4_inode_data(inode)->locality_id == get_key_locality(key);
+}
+
+/* hook for kmem_cache_create */
+void loading_init_once(reiser4_inode * info)
+{
+ mutex_init(&info->loading);
+}
+
+/* for reiser4_alloc_inode */
+void loading_alloc(reiser4_inode * info)
+{
+ assert("vs-1717", !mutex_is_locked(&info->loading));
+}
+
+/* for reiser4_destroy */
+void loading_destroy(reiser4_inode * info)
+{
+ assert("vs-1717a", !mutex_is_locked(&info->loading));
+}
+
+static void loading_begin(reiser4_inode * info)
+{
+ mutex_lock(&info->loading);
+}
+
+static void loading_end(reiser4_inode * info)
+{
+ mutex_unlock(&info->loading);
+}
+
+/**
+ * Obtain inode via iget5_locked, read from disk if necessary.
+ * This is a helper function, a la iget(), called by lookup_common() and
+ * reiser4_read_super(). Return the inode locked, or an error.
+ *
+ * @super: super block of filesystem
+ * @key: key of inode's stat-data
+ * @bias: lookup bias -
+ * FIND_EXACT, if the key is precise,
+ * FIND_MAX_NOT_MORE_THAN, if we don't know ordering component
+ * of the key
+ */
+struct inode *reiser4_iget(struct super_block *super, const reiser4_key *key,
+ lookup_bias bias, int silent)
+{
+ struct inode *inode;
+ int result;
+ reiser4_inode *info;
+
+ assert("nikita-302", super != NULL);
+ assert("nikita-303", key != NULL);
+
+ result = 0;
+
+ /* call iget(). Our ->read_inode() is dummy, so this will either
+ find inode in cache or return uninitialised inode */
+ inode = iget5_locked(super,
+ (unsigned long)get_key_objectid(key),
+ reiser4_inode_find_actor,
+ init_locked_inode, (reiser4_key *) key);
+ if (inode == NULL)
+ return ERR_PTR(RETERR(-ENOMEM));
+ if (is_bad_inode(inode)) {
+ warning("nikita-304", "Bad inode found");
+ reiser4_print_key("key", key);
+ iput(inode);
+ return ERR_PTR(RETERR(-EIO));
+ }
+
+ info = reiser4_inode_data(inode);
+
+	/* The reiser4 inode state bit REISER4_LOADED is used to distinguish a
+	   fully loaded and initialized inode from a just allocated one. If the
+	   REISER4_LOADED bit is not set, reiser4_iget() completes loading under
+	   info->loading. The place in reiser4 which uses a not yet initialized
+	   inode is the reiser4 repacker, see repacker-related functions in
+	   plugin/item/extent.c */
+ if (!is_inode_loaded(inode)) {
+ loading_begin(info);
+ if (!is_inode_loaded(inode)) {
+ /* locking: iget5_locked returns locked inode */
+ assert("nikita-1941", !is_inode_loaded(inode));
+ assert("nikita-1949",
+ reiser4_inode_find_actor(inode,
+ (reiser4_key *) key));
+ /* now, inode has objectid as ->i_ino and locality in
+ reiser4-specific part. This is enough for
+ read_inode() to read stat data from the disk */
+ result = read_inode(inode, key, bias, silent);
+ } else
+ loading_end(info);
+ }
+
+ if (inode->i_state & I_NEW)
+ unlock_new_inode(inode);
+
+ if (is_bad_inode(inode)) {
+ assert("vs-1717", result != 0);
+ loading_end(info);
+ iput(inode);
+ inode = ERR_PTR(result);
+ } else if (REISER4_DEBUG) {
+ reiser4_key found_key;
+
+ assert("vs-1717", result == 0);
+ build_sd_key(inode, &found_key);
+
+ if (bias == FIND_EXACT &&
+ !keyeq(build_sd_key(inode, &found_key), key)) {
+ warning("nikita-305", "Wrong key in sd");
+ reiser4_print_key("sought for", key);
+ reiser4_print_key("found", &found_key);
+ }
+ }
+ return inode;
+}
+
+/* reiser4_iget() may return a not fully initialized inode; this function
+ * should be called after reiser4-specific inode initialization is complete. */
+void reiser4_iget_complete(struct inode *inode)
+{
+ assert("zam-988", is_reiser4_inode(inode));
+
+ if (!is_inode_loaded(inode)) {
+ reiser4_inode_set_flag(inode, REISER4_LOADED);
+ loading_end(reiser4_inode_data(inode));
+ }
+}
+
+void reiser4_make_bad_inode(struct inode *inode)
+{
+ assert("nikita-1934", inode != NULL);
+
+ /* clear LOADED bit */
+ reiser4_inode_clr_flag(inode, REISER4_LOADED);
+ make_bad_inode(inode);
+ return;
+}
+
+file_plugin *inode_file_plugin(const struct inode *inode)
+{
+ assert("nikita-1997", inode != NULL);
+ return reiser4_inode_data(inode)->pset->file;
+}
+
+dir_plugin *inode_dir_plugin(const struct inode *inode)
+{
+ assert("nikita-1998", inode != NULL);
+ return reiser4_inode_data(inode)->pset->dir;
+}
+
+formatting_plugin *inode_formatting_plugin(const struct inode *inode)
+{
+ assert("nikita-2000", inode != NULL);
+ return reiser4_inode_data(inode)->pset->formatting;
+}
+
+hash_plugin *inode_hash_plugin(const struct inode *inode)
+{
+ assert("nikita-2001", inode != NULL);
+ return reiser4_inode_data(inode)->pset->hash;
+}
+
+fibration_plugin *inode_fibration_plugin(const struct inode *inode)
+{
+ assert("nikita-2001", inode != NULL);
+ return reiser4_inode_data(inode)->pset->fibration;
+}
+
+cipher_plugin *inode_cipher_plugin(const struct inode *inode)
+{
+ assert("edward-36", inode != NULL);
+ return reiser4_inode_data(inode)->pset->cipher;
+}
+
+compression_plugin *inode_compression_plugin(const struct inode *inode)
+{
+ assert("edward-37", inode != NULL);
+ return reiser4_inode_data(inode)->pset->compression;
+}
+
+compression_mode_plugin *inode_compression_mode_plugin(const struct inode *
+ inode)
+{
+ assert("edward-1330", inode != NULL);
+ return reiser4_inode_data(inode)->pset->compression_mode;
+}
+
+cluster_plugin *inode_cluster_plugin(const struct inode *inode)
+{
+ assert("edward-1328", inode != NULL);
+ return reiser4_inode_data(inode)->pset->cluster;
+}
+
+file_plugin *inode_create_plugin(const struct inode *inode)
+{
+ assert("edward-1329", inode != NULL);
+ return reiser4_inode_data(inode)->pset->create;
+}
+
+digest_plugin *inode_digest_plugin(const struct inode *inode)
+{
+ assert("edward-86", inode != NULL);
+ return reiser4_inode_data(inode)->pset->digest;
+}
+
+item_plugin *inode_sd_plugin(const struct inode *inode)
+{
+ assert("vs-534", inode != NULL);
+ return reiser4_inode_data(inode)->pset->sd;
+}
+
+item_plugin *inode_dir_item_plugin(const struct inode *inode)
+{
+ assert("vs-534", inode != NULL);
+ return reiser4_inode_data(inode)->pset->dir_item;
+}
+
+file_plugin *child_create_plugin(const struct inode *inode)
+{
+ assert("edward-1329", inode != NULL);
+ return reiser4_inode_data(inode)->hset->create;
+}
+
+void inode_set_extension(struct inode *inode, sd_ext_bits ext)
+{
+ reiser4_inode *state;
+
+ assert("nikita-2716", inode != NULL);
+ assert("nikita-2717", ext < LAST_SD_EXTENSION);
+ assert("nikita-3491", spin_inode_is_locked(inode));
+
+ state = reiser4_inode_data(inode);
+ state->extmask |= 1 << ext;
+ /* force re-calculation of stat-data length on next call to
+ update_sd(). */
+ reiser4_inode_clr_flag(inode, REISER4_SDLEN_KNOWN);
+}
+
+void inode_clr_extension(struct inode *inode, sd_ext_bits ext)
+{
+ reiser4_inode *state;
+
+ assert("vpf-1926", inode != NULL);
+ assert("vpf-1927", ext < LAST_SD_EXTENSION);
+ assert("vpf-1928", spin_inode_is_locked(inode));
+
+ state = reiser4_inode_data(inode);
+ state->extmask &= ~(1 << ext);
+ /* force re-calculation of stat-data length on next call to
+ update_sd(). */
+ reiser4_inode_clr_flag(inode, REISER4_SDLEN_KNOWN);
+}
+
+void inode_check_scale_nolock(struct inode *inode, __u64 old, __u64 new)
+{
+ assert("edward-1287", inode != NULL);
+ if (!dscale_fit(old, new))
+ reiser4_inode_clr_flag(inode, REISER4_SDLEN_KNOWN);
+ return;
+}
+
+void inode_check_scale(struct inode *inode, __u64 old, __u64 new)
+{
+ assert("nikita-2875", inode != NULL);
+ spin_lock_inode(inode);
+ inode_check_scale_nolock(inode, old, new);
+ spin_unlock_inode(inode);
+}
+
+/**
+ * initialize ->ordering field of inode. This field defines how file stat-data
+ * and body is ordered within a tree with respect to other objects within the
+ * same parent directory.
+ */
+void init_inode_ordering(struct inode *inode, reiser4_object_create_data *crd,
+ const reiser4_key *sd_key, int create)
+{
+ if (create) {
+ reiser4_key key;
+ struct inode *parent;
+
+ assert("edward-2210", crd != NULL);
+ parent = crd->parent;
+ assert("nikita-3224", inode_dir_plugin(parent) != NULL);
+ inode_dir_plugin(parent)->build_entry_key(parent,
+ &crd->dentry->d_name,
+ &key);
+ set_inode_ordering(inode, get_key_ordering(&key));
+ } else {
+ assert("edward-2211", sd_key != NULL);
+ set_inode_ordering(inode, get_key_ordering(sd_key));
+ }
+}
+
+znode *inode_get_vroot(struct inode *inode)
+{
+ reiser4_block_nr blk;
+ znode *result;
+
+ spin_lock_inode(inode);
+ blk = reiser4_inode_data(inode)->vroot;
+ spin_unlock_inode(inode);
+ if (!disk_addr_eq(&UBER_TREE_ADDR, &blk))
+ result = zlook(meta_subvol_tree(), &blk);
+ else
+ result = NULL;
+ return result;
+}
+
+void inode_set_vroot(struct inode *inode, znode *vroot)
+{
+ spin_lock_inode(inode);
+ reiser4_inode_data(inode)->vroot = *znode_get_block(vroot);
+ spin_unlock_inode(inode);
+}
+
+#if REISER4_DEBUG
+
+void reiser4_inode_invariant(const struct inode *inode)
+{
+ assert("nikita-3077", spin_inode_is_locked(inode));
+}
+
+int inode_has_no_jnodes(reiser4_inode * r4_inode)
+{
+ return radix_tree_empty(jnode_tree_by_reiser4_inode(r4_inode)) &&
+ r4_inode->nr_jnodes == 0;
+}
+
+#endif
+
+/* true if directory is empty (only contains dot and dotdot) */
+/* FIXME: shouldn't it be dir plugin method? */
+int is_dir_empty(const struct inode *dir)
+{
+ assert("nikita-1976", dir != NULL);
+
+ /* rely on our method to maintain directory i_size being equal to the
+ number of entries. */
+ return dir->i_size <= 2 ? 0 : RETERR(-ENOTEMPTY);
+}
+
+/* Make Linus happy.
+ Local variables:
+ c-indentation-style: "K&R"
+ mode-name: "LC"
+ c-basic-offset: 8
+ tab-width: 8
+ fill-column: 120
+ End:
+*/
diff -urN --no-dereference linux-5.10.2.orig/fs/reiser4/inode.h linux-5.10.2/fs/reiser4/inode.h
--- linux-5.10.2.orig/fs/reiser4/inode.h 1970-01-01 01:00:00.000000000 +0100
+++ linux-5.10.2/fs/reiser4/inode.h 2020-12-23 16:07:46.118813129 +0100
@@ -0,0 +1,506 @@
+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
+ reiser4/README */
+
+/* Inode functions. */
+
+#if !defined(__REISER4_INODE_H__)
+#define __REISER4_INODE_H__
+
+#include "forward.h"
+#include "debug.h"
+#include "key.h"
+#include "seal.h"
+#include "plugin/plugin.h"
+#include "plugin/file/cryptcompress.h"
+#include "plugin/file/file.h"
+#include "plugin/dir/dir.h"
+#include "plugin/plugin_set.h"
+#include "plugin/security/perm.h"
+#include "vfs_ops.h"
+#include "jnode.h"
+#include "fsdata.h"
+
+#include <linux/types.h> /* for __u?? , ino_t */
+#include <linux/fs.h> /* for struct super_block, struct
+ * rw_semaphore, etc */
+#include <linux/spinlock.h>
+#include <asm/types.h>
+
+/* reiser4-specific inode flags. They are "transient" and are not
+   supposed to be stored on disk. Used to track the "state" of
+   an inode.
+*/
+typedef enum {
+ /* this is light-weight inode, inheriting some state from its
+ parent */
+ REISER4_LIGHT_WEIGHT = 0,
+ /* stat data wasn't yet created */
+ REISER4_NO_SD = 1,
+ /* internal immutable flag. Currently is only used
+ to avoid race condition during file creation.
+ See comment in create_object(). */
+ REISER4_IMMUTABLE = 2,
+ /* inode was read from storage */
+ REISER4_LOADED = 3,
+ /* this bit is set for symlinks. inode->i_private points to target
+ name of symlink. */
+ REISER4_GENERIC_PTR_USED = 4,
+ /* set if size of stat-data item for this inode is known. If this is
+ * set we can avoid recalculating size of stat-data on each update. */
+ REISER4_SDLEN_KNOWN = 5,
+ /* reiser4_inode->crypt points to the crypto stat */
+ REISER4_CRYPTO_STAT_LOADED = 6,
+ /* cryptcompress_inode_data points to the secret key */
+ REISER4_SECRET_KEY_INSTALLED = 7,
+ /* File (possibly) has pages corresponding to the tail items, that
+ * were created by ->readpage. It is set by mmap_unix_file() and
+ * sendfile_unix_file(). This bit is inspected by write_unix_file and
+ * kill-hook of tail items. It is never cleared once set. This bit is
+ * modified and inspected under i_mutex. */
+ REISER4_HAS_MMAP = 8,
+ REISER4_FAKE_IMODE_ONDISK = 9,
+ REISER4_PART_MIXED = REISER4_FAKE_IMODE_ONDISK,
+ REISER4_FILE_IMMOBILE = REISER4_FAKE_IMODE_ONDISK,
+ REISER4_PART_IN_CONV = 10,
+ REISER4_FILE_IN_CONVERSION = 11,
+ REISER4_FILE_IN_MIGRATION = 12,
+} reiser4_file_plugin_flags;
+
+/* state associated with each inode.
+ reiser4 inode.
+
+ NOTE-NIKITA In 2.5 kernels it is not necessary that all file-system inodes
+ be of the same size. File-system allocates inodes by itself through
+ s_op->allocate_inode() method. So, it is possible to adjust size of inode
+ at the time of its creation.
+
+ Invariants involving parts of this data-type:
+
+ [inode->eflushed]
+
+*/
+
+typedef struct reiser4_inode reiser4_inode;
+/* return pointer to reiser4-specific part of inode */
+static inline reiser4_inode *reiser4_inode_data(const struct inode *inode
+ /* inode queried */ );
+
+#if BITS_PER_LONG == 64
+
+#define REISER4_INO_IS_OID (1)
+typedef struct {;
+} oid_hi_t;
+
+/* BITS_PER_LONG == 64 */
+#else
+
+#define REISER4_INO_IS_OID (0)
+typedef __u32 oid_hi_t;
+
+/* BITS_PER_LONG == 64 */
+#endif
+
+struct reiser4_inode {
+ /* spin lock protecting fields of this structure. */
+ spinlock_t guard;
+ /* main plugin set that control the file
+ (see comments in plugin/plugin_set.c) */
+ plugin_set *pset;
+ /* plugin set for inheritance
+ (see comments in plugin/plugin_set.c) */
+ plugin_set *hset;
+ /* high 32 bits of object id */
+ oid_hi_t oid_hi;
+ /* seal for stat-data */
+ seal_t sd_seal;
+ /* locality id for this file */
+ oid_t locality_id;
+#if REISER4_LARGE_KEY
+ __u64 ordering;
+#endif
+ /* coord of stat-data in sealed node */
+ coord_t sd_coord;
+	/* bit-mask of stat-data extensions used by this file */
+ __u64 extmask;
+ /* bitmask of non-default plugins for this inode */
+ __u16 plugin_mask;
+ /* bitmask of set heir plugins for this inode. */
+ __u16 heir_mask;
+ union {
+ struct list_head readdir_list;
+ struct list_head not_used;
+ } lists;
+ /* per-inode flags. Filled by values of reiser4_file_plugin_flags */
+ unsigned long flags;
+ union {
+ /* fields specific to unix_file plugin */
+ struct unix_file_info unix_file_info;
+ /* fields specific to cryptcompress file plugin */
+ struct cryptcompress_info cryptcompress_info;
+ } file_plugin_data;
+
+ /* this semaphore is to serialize readers and writers of @pset->file
+ * when file plugin conversion is enabled
+ */
+ struct rw_semaphore conv_sem;
+
+	/* tree of jnodes. Phantom jnodes (ones not attached to any atom) are
+ tagged in that tree by EFLUSH_TAG_ANONYMOUS */
+ struct radix_tree_root jnodes_tree;
+#if REISER4_DEBUG
+ /* number of unformatted node jnodes of this file in jnode hash table */
+ unsigned long nr_jnodes;
+#endif
+ /* block number of virtual root for this object. See comment above
+ * fs/reiser4/search.c:handle_vroot() */
+ reiser4_block_nr vroot;
+ struct mutex loading;
+};
+
+void loading_init_once(reiser4_inode *);
+void loading_alloc(reiser4_inode *);
+void loading_destroy(reiser4_inode *);
+
+struct reiser4_inode_object {
+ /* private part */
+ reiser4_inode p;
+ /* generic fields not specific to reiser4, but used by VFS */
+ struct inode vfs_inode;
+};
+
+/* return pointer to the reiser4 specific portion of @inode */
+static inline reiser4_inode *reiser4_inode_data(const struct inode *inode
+ /* inode queried */ )
+{
+ assert("nikita-254", inode != NULL);
+ return &container_of(inode, struct reiser4_inode_object, vfs_inode)->p;
+}
+
+static inline struct inode *inode_by_reiser4_inode(const reiser4_inode *
+ r4_inode /* inode queried */
+ )
+{
+ return &container_of(r4_inode, struct reiser4_inode_object,
+ p)->vfs_inode;
+}
+
+/*
+ * reiser4 inodes are identified by 64bit object-id (oid_t), but in struct
+ * inode ->i_ino field is of type ino_t (long) that can be either 32 or 64
+ * bits.
+ *
+ * If ->i_ino is 32 bits we store remaining 32 bits in reiser4 specific part
+ * of inode, otherwise whole oid is stored in i_ino.
+ *
+ * Wrappers below ([sg]et_inode_oid()) are used to hide this difference.
+ */
+
+#define OID_HI_SHIFT (sizeof(ino_t) * 8)
+
+#if REISER4_INO_IS_OID
+
+static inline oid_t get_inode_oid(const struct inode *inode)
+{
+ return inode->i_ino;
+}
+
+static inline void set_inode_oid(struct inode *inode, oid_t oid)
+{
+ inode->i_ino = oid;
+}
+
+/* REISER4_INO_IS_OID */
+#else
+
+static inline oid_t get_inode_oid(const struct inode *inode)
+{
+ return
+ ((__u64) reiser4_inode_data(inode)->oid_hi << OID_HI_SHIFT) |
+ inode->i_ino;
+}
+
+static inline void set_inode_oid(struct inode *inode, oid_t oid)
+{
+ assert("nikita-2519", inode != NULL);
+ inode->i_ino = (ino_t) (oid);
+ reiser4_inode_data(inode)->oid_hi = (oid) >> OID_HI_SHIFT;
+ assert("nikita-2521", get_inode_oid(inode) == (oid));
+}
+
+/* REISER4_INO_IS_OID */
+#endif
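+
+/*
+ * Example (illustrative only): with a 32-bit ino_t, set_inode_oid() on
+ * oid 0x0000000a00000001ULL stores 0x00000001 in inode->i_ino and
+ * 0x0000000a in reiser4_inode_data(inode)->oid_hi; get_inode_oid()
+ * reassembles the original 64-bit value from the two halves.
+ */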
+
+static inline oid_t get_inode_locality(const struct inode *inode)
+{
+ return reiser4_inode_data(inode)->locality_id;
+}
+
+#if REISER4_LARGE_KEY
+static inline __u64 get_inode_ordering(const struct inode *inode)
+{
+ return reiser4_inode_data(inode)->ordering;
+}
+
+static inline void set_inode_ordering(const struct inode *inode, __u64 ordering)
+{
+ reiser4_inode_data(inode)->ordering = ordering;
+}
+
+#else
+
+#define get_inode_ordering(inode) (0)
+#define set_inode_ordering(inode, val) noop
+
+#endif
+
+/* return inode in which @uf_info is embedded */
+static inline struct inode *
+unix_file_info_to_inode(const struct unix_file_info *uf_info)
+{
+ return &container_of(uf_info, struct reiser4_inode_object,
+ p.file_plugin_data.unix_file_info)->vfs_inode;
+}
+
+extern ino_t oid_to_ino(oid_t oid) __attribute__ ((const));
+extern ino_t oid_to_uino(oid_t oid) __attribute__ ((const));
+
+#if REISER4_DEBUG
+extern void reiser4_inode_invariant(const struct inode *inode);
+extern int inode_has_no_jnodes(reiser4_inode *);
+#else
+#define reiser4_inode_invariant(inode) noop
+#endif
+
+static inline int spin_inode_is_locked(const struct inode *inode)
+{
+ assert_spin_locked(&reiser4_inode_data(inode)->guard);
+ return 1;
+}
+
+/**
+ * spin_lock_inode - lock reiser4_inode' embedded spinlock
+ * @inode: inode to lock
+ *
+ * In debug mode it checks that lower priority locks are not held and
+ * increments reiser4_context's lock counters on which lock ordering checking
+ * is based.
+ */
+static inline void spin_lock_inode(struct inode *inode)
+{
+ assert("", LOCK_CNT_NIL(spin_locked));
+ /* check lock ordering */
+ assert_spin_not_locked(&d_c_lock);
+
+ spin_lock(&reiser4_inode_data(inode)->guard);
+
+ LOCK_CNT_INC(spin_locked_inode);
+ LOCK_CNT_INC(spin_locked);
+
+ reiser4_inode_invariant(inode);
+}
+
+/**
+ * spin_unlock_inode - unlock reiser4_inode' embedded spinlock
+ * @inode: inode to unlock
+ *
+ * In debug mode it checks that spinlock is held and decrements
+ * reiser4_context's lock counters on which lock ordering checking is based.
+ */
+static inline void spin_unlock_inode(struct inode *inode)
+{
+ assert_spin_locked(&reiser4_inode_data(inode)->guard);
+ assert("nikita-1375", LOCK_CNT_GTZ(spin_locked_inode));
+ assert("nikita-1376", LOCK_CNT_GTZ(spin_locked));
+
+ reiser4_inode_invariant(inode);
+
+ LOCK_CNT_DEC(spin_locked_inode);
+ LOCK_CNT_DEC(spin_locked);
+
+ spin_unlock(&reiser4_inode_data(inode)->guard);
+}
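+
+/*
+ * Typical usage (sketch): fields of reiser4_inode are updated under this
+ * guard, e.g.
+ *
+ *	spin_lock_inode(inode);
+ *	reiser4_inode_data(inode)->vroot = *znode_get_block(vroot);
+ *	spin_unlock_inode(inode);
+ *
+ * as done by inode_set_vroot() in inode.c.
+ */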
+
+extern znode *inode_get_vroot(struct inode *inode);
+extern void inode_set_vroot(struct inode *inode, znode * vroot);
+
+extern int reiser4_max_filename_len(const struct inode *inode);
+extern int max_hash_collisions(const struct inode *dir);
+extern void reiser4_unlock_inode(struct inode *inode);
+extern int is_reiser4_inode(const struct inode *inode);
+extern int setup_inode_ops(struct inode *inode, reiser4_object_create_data *);
+extern struct inode *reiser4_iget(struct super_block *super, const reiser4_key *key,
+ lookup_bias bias, int silent);
+extern void reiser4_iget_complete(struct inode *inode);
+extern void reiser4_inode_set_flag(struct inode *inode,
+ reiser4_file_plugin_flags f);
+extern void reiser4_inode_clr_flag(struct inode *inode,
+ reiser4_file_plugin_flags f);
+extern int reiser4_inode_get_flag(const struct inode *inode,
+ reiser4_file_plugin_flags f);
+
+/* has inode been initialized? */
+static inline int
+is_inode_loaded(const struct inode *inode/* inode queried */)
+{
+ assert("nikita-1120", inode != NULL);
+ return reiser4_inode_get_flag(inode, REISER4_LOADED);
+}
+
+extern file_plugin *inode_file_plugin(const struct inode *inode);
+extern dir_plugin *inode_dir_plugin(const struct inode *inode);
+extern formatting_plugin *inode_formatting_plugin(const struct inode *inode);
+extern hash_plugin *inode_hash_plugin(const struct inode *inode);
+extern fibration_plugin *inode_fibration_plugin(const struct inode *inode);
+extern cipher_plugin *inode_cipher_plugin(const struct inode *inode);
+extern digest_plugin *inode_digest_plugin(const struct inode *inode);
+extern compression_plugin *inode_compression_plugin(const struct inode *inode);
+extern compression_mode_plugin *inode_compression_mode_plugin(const struct inode
+ *inode);
+extern cluster_plugin *inode_cluster_plugin(const struct inode *inode);
+extern file_plugin *inode_create_plugin(const struct inode *inode);
+extern item_plugin *inode_sd_plugin(const struct inode *inode);
+extern item_plugin *inode_dir_item_plugin(const struct inode *inode);
+extern file_plugin *child_create_plugin(const struct inode *inode);
+
+extern void reiser4_make_bad_inode(struct inode *inode);
+
+extern void inode_set_extension(struct inode *inode, sd_ext_bits ext);
+extern void inode_clr_extension(struct inode *inode, sd_ext_bits ext);
+extern void inode_check_scale(struct inode *inode, __u64 old, __u64 new);
+extern void inode_check_scale_nolock(struct inode *inode, __u64 old, __u64 new);
+
+#define INODE_SET_SIZE(i, value) \
+({ \
+ struct inode *__i; \
+ typeof(value) __v; \
+ \
+ __i = (i); \
+ __v = (value); \
+ inode_check_scale(__i, __i->i_size, __v); \
+ i_size_write(__i, __v); \
+})
+
+/*
+ * update field @field in inode @i to contain value @value.
+ */
+#define INODE_SET_FIELD(i, field, value) \
+({ \
+ struct inode *__i; \
+ typeof(value) __v; \
+ \
+ __i = (i); \
+ __v = (value); \
+ inode_check_scale(__i, __i->field, __v); \
+ __i->field = __v; \
+})
+
+#define INODE_INC_FIELD(i, field) \
+({ \
+ struct inode *__i; \
+ \
+ __i = (i); \
+ inode_check_scale(__i, __i->field, __i->field + 1); \
+ ++ __i->field; \
+})
+
+#define INODE_DEC_FIELD(i, field) \
+({ \
+ struct inode *__i; \
+ \
+ __i = (i); \
+ inode_check_scale(__i, __i->field, __i->field - 1); \
+ -- __i->field; \
+})
+
+/*
+ * Update the i_nlink field of inode @i via the corresponding VFS helpers.
+ */
+#define INODE_SET_NLINK(i, value) \
+({ \
+ struct inode *__i; \
+ typeof(value) __v; \
+ \
+ __i = (i); \
+ __v = (value); \
+ inode_check_scale(__i, __i->i_nlink, __v); \
+ set_nlink(__i, __v); \
+})
+
+#define INODE_INC_NLINK(i) \
+ ({ \
+ struct inode *__i; \
+ \
+ __i = (i); \
+ inode_check_scale(__i, __i->i_nlink, __i->i_nlink + 1); \
+ inc_nlink(__i); \
+})
+
+#define INODE_DROP_NLINK(i) \
+ ({ \
+ struct inode *__i; \
+ \
+ __i = (i); \
+ inode_check_scale(__i, __i->i_nlink, __i->i_nlink - 1); \
+ drop_nlink(__i); \
+})
+
+#define INODE_CLEAR_NLINK(i) \
+ ({ \
+ struct inode *__i; \
+ \
+ __i = (i); \
+ inode_check_scale(__i, __i->i_nlink, 0); \
+ clear_nlink(__i); \
+})
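+
+/*
+ * Usage note (illustrative): these wrappers are meant to be used instead
+ * of bare i_size_write()/set_nlink()/inc_nlink() etc., so that
+ * inode_check_scale() can drop REISER4_SDLEN_KNOWN whenever the scaled
+ * stat-data representation of the field changes, e.g.
+ *
+ *	INODE_INC_NLINK(parent);
+ *
+ * rather than inc_nlink(parent).
+ */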
+
+
+static inline void inode_add_blocks(struct inode *inode, __u64 blocks)
+{
+ inode_add_bytes(inode, blocks << inode->i_blkbits);
+}
+
+static inline void inode_sub_blocks(struct inode *inode, __u64 blocks)
+{
+ inode_sub_bytes(inode, blocks << inode->i_blkbits);
+}
+
+
+/* See comment before reiser4_readdir_common() for description. */
+static inline struct list_head *get_readdir_list(const struct inode *inode)
+{
+ return &reiser4_inode_data(inode)->lists.readdir_list;
+}
+
+extern void init_inode_ordering(struct inode *inode,
+ reiser4_object_create_data *crd,
+ const reiser4_key *sd_key, int create);
+
+static inline struct radix_tree_root *jnode_tree_by_inode(struct inode *inode)
+{
+ return &reiser4_inode_data(inode)->jnodes_tree;
+}
+
+static inline struct radix_tree_root *jnode_tree_by_reiser4_inode(reiser4_inode
+ *r4_inode)
+{
+ return &r4_inode->jnodes_tree;
+}
+
+#if REISER4_DEBUG
+extern void print_inode(const char *prefix, const struct inode *i);
+#endif
+
+int is_dir_empty(const struct inode *);
+
+/* __REISER4_INODE_H__ */
+#endif
+
+/* Make Linus happy.
+ Local variables:
+ c-indentation-style: "K&R"
+ mode-name: "LC"
+ c-basic-offset: 8
+ tab-width: 8
+ fill-column: 120
+ End:
+*/
diff -urN --no-dereference linux-5.10.2.orig/fs/reiser4/ioctl.h linux-5.10.2/fs/reiser4/ioctl.h
--- linux-5.10.2.orig/fs/reiser4/ioctl.h 1970-01-01 01:00:00.000000000 +0100
+++ linux-5.10.2/fs/reiser4/ioctl.h 2020-12-23 16:07:46.118813129 +0100
@@ -0,0 +1,187 @@
+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
+ * reiser4/README */
+
+#if !defined(__REISER4_IOCTL_H__)
+#define __REISER4_IOCTL_H__
+
+#include <linux/fs.h>
+
+/*
+ * ioctl(2) command used to "unpack" a reiser4 file, that is, convert it into
+ * extents and fix it in this state. This is used by applications that rely on
+ *
+ * . files being block aligned, and
+ *
+ * . files never migrating on disk
+ *
+ * for example, boot loaders (LILO) need this.
+ *
+ * This ioctl should be used as
+ *
+ * result = ioctl(fd, REISER4_IOC_UNPACK);
+ *
+ * The file behind the fd descriptor will be converted to extents (if
+ * necessary), and its stat-data will be updated so that it will never be
+ * converted back into tails again.
+ */
+
+/*
+ * On-line per-volume and per-subvolume flags.
+ * They are set up for a (re)mount session and are not stored on disk.
+ */
+typedef enum {
+ /*
+	 * True if this file system doesn't support hard-links (multiple names)
+	 * for directories: this is the default UNIX behavior.
+	 *
+	 * If hard-links on directories are not allowed, the file system is an
+	 * acyclic directed graph (modulo dot and dotdot, of course).
+ *
+ * This is used by reiser4_link().
+ */
+ REISER4_ADG = 0,
+ /* if set, bsd gid assignment is supported. */
+ REISER4_BSD_GID = 2,
+ /* [mac]_time are 32 bit in inode */
+ REISER4_32_BIT_TIMES = 3,
+	/* don't load all bitmap blocks at mount time */
+ REISER4_DONT_LOAD_BITMAP = 5,
+ /* enforce atomicity during write(2) */
+ REISER4_ATOMIC_WRITE = 6,
+ /* enable issuing of discard requests */
+ REISER4_DISCARD = 8,
+ /* disable hole punching at flush time */
+ REISER4_DONT_PUNCH_HOLES = 9,
+ /* volume is ready for regular operations */
+ REISER4_ACTIVATED_VOL = 10,
+ /* volume is in unbalanced state */
+ REISER4_UNBALANCED_VOL = 12,
+	/* this flag indicates that a volume operation was
+	   interrupted for some reason (e.g. a system crash)
+	   and should be completed later in some context */
+ REISER4_INCOMPLETE_BRICK_REMOVAL = 13,
+ /* proxy-subvolume is active */
+ REISER4_PROXY_ENABLED = 15,
+ /* proxy subvolume accepts IO requests */
+ REISER4_PROXY_IO = 16
+} reiser4_fs_flag;
+
+typedef enum {
+ /* set if all nodes in internal tree have the same
+ * node layout plugin. See znode_guess_plugin() */
+ SUBVOL_ONE_NODE_PLUGIN = 0,
+ /* set if subvolume lives on a solid state drive */
+ SUBVOL_IS_NONROT_DEVICE = 1,
+ /* set if subvol is registered */
+ SUBVOL_REGISTERED = 2,
+ /* set if subvol is activated */
+ SUBVOL_ACTIVATED = 3,
+ /* set if brick is used for data storage and participates
+ in regular data distribution */
+ SUBVOL_HAS_DATA_ROOM = 4,
+ /* set if subvolume is not included in volume configuration
+ and doesn't accept any IOs */
+ SUBVOL_IS_ORPHAN = 5,
+	/* set if brick was scheduled for removal. It may be non-empty
+	   and may still accept IOs */
+ SUBVOL_TO_BE_REMOVED = 6,
+ /* set if brick is used for data storage, but doesn't
+ participate in regular data distribution */
+ SUBVOL_IS_PROXY = 7
+} reiser4_subvol_flag;
+
+#define REISER4_PATH_NAME_MAX 3900 /* FIXME: make it more precise */
+
+typedef enum {
+ REISER4_INVALID_OPT,
+ REISER4_REGISTER_BRICK,
+ REISER4_UNREGISTER_BRICK,
+ REISER4_LIST_BRICKS,
+ REISER4_VOLUME_HEADER,
+ REISER4_BRICK_HEADER,
+ REISER4_PRINT_VOLUME,
+ REISER4_PRINT_BRICK,
+ REISER4_RESIZE_BRICK,
+ REISER4_ADD_BRICK,
+ REISER4_REMOVE_BRICK,
+ REISER4_SCALE_VOLUME,
+ REISER4_BALANCE_VOLUME,
+ REISER4_ADD_PROXY,
+ REISER4_MIGRATE_FILE,
+ REISER4_SET_FILE_IMMOBILE,
+ REISER4_CLR_FILE_IMMOBILE,
+ REISER4_FINISH_REMOVAL,
+ REISER4_RESTORE_REGULAR_DST
+} reiser4_vol_op;
+
+typedef enum {
+ COMPLETE_WITH_BALANCE = 0x1
+} reiser4_vol_op_flags;
+
+struct reiser4_volume_stat
+{
+ u8 id[16]; /* unique ID */
+ u32 nr_bricks; /* total number of bricks in the volume */
+ u32 bricks_in_dsa; /* number of bricks in DSA */
+ u16 vpid; /* volume plugin ID */
+ u16 dpid; /* distribution plugin ID */
+ u16 stripe_bits; /* logarithm of stripe size */
+ u16 nr_sgs_bits; /* logarithm of number of hash space segments */
+ u64 fs_flags; /* the same as the one of private super-block */
+ u32 nr_mslots; /* number of slots */
+ u32 nr_volinfo_blocks; /* Total number of blocks in the set
+ where volume configuration is stored */
+};
+
+struct reiser4_brick_stat
+{
+ u64 int_id; /* ordered number, 0 means meta-data brick */
+ u8 ext_id[16]; /* external unique ID */
+ u16 nr_replicas; /* number of replicas */
+ u64 subv_flags; /* per-subvolume on-line flags */
+ u64 block_count; /* total number of blocks on the device */
+ u64 data_capacity; /* "weight" of the brick in data storage array */
+ u64 blocks_used; /* number of blocks used by data and meta-data */
+ u64 system_blocks; /* minimal number of blocks, which are occupied by
+ system data (super-blocks, bitmap blocks, etc) */
+ u64 volinfo_addr; /* disk address of the first block of a portion
+ of volume configuration stored on this brick */
+};
+
+struct reiser4_vol_op_args
+{
+ reiser4_vol_op opcode;
+ int error;
+ u64 new_capacity;
+ u64 flags;
+ union {
+ u64 brick_idx; /* index of brick in logical volume */
+ u64 vol_idx; /* serial num of volume in the list of volumes */
+ u64 val;
+ }s;
+ union {
+ char name[REISER4_PATH_NAME_MAX + 1];
+ }d;
+ struct {
+ struct reiser4_volume_stat vol;
+ struct reiser4_brick_stat brick;
+ }u;
+};
+
+#define REISER4_IOC_UNPACK _IOW(0xCD, 1, long)
+#define REISER4_IOC_VOLUME _IOWR(0xCD, 2, struct reiser4_vol_op_args)
+#define REISER4_IOC_SCAN_DEV _IOWR(0xCD, 3, struct reiser4_vol_op_args)
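+
+/*
+ * A minimal user-space sketch of issuing a volume operation (illustrative
+ * only; error handling is omitted, "/mnt/r4" is an arbitrary mount point,
+ * and the exact semantics of each opcode are defined by the volume code,
+ * not by this header):
+ *
+ *	struct reiser4_vol_op_args args;
+ *	int fd = open("/mnt/r4", O_RDONLY);
+ *
+ *	memset(&args, 0, sizeof(args));
+ *	args.opcode = REISER4_PRINT_VOLUME;
+ *	ioctl(fd, REISER4_IOC_VOLUME, &args);
+ *	// presumably the volume status is then available in args.u.vol
+ */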
+
+/* __REISER4_IOCTL_H__ */
+#endif
+
+/* Make Linus happy.
+ Local variables:
+ c-indentation-style: "K&R"
+ mode-name: "LC"
+ c-basic-offset: 8
+ tab-width: 8
+ fill-column: 120
+ scroll-step: 1
+ End:
+*/
diff -urN --no-dereference linux-5.10.2.orig/fs/reiser4/jnode.c linux-5.10.2/fs/reiser4/jnode.c
--- linux-5.10.2.orig/fs/reiser4/jnode.c 1970-01-01 01:00:00.000000000 +0100
+++ linux-5.10.2/fs/reiser4/jnode.c 2020-12-23 16:07:46.118813129 +0100
@@ -0,0 +1,1969 @@
+/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by
+ * reiser4/README */
+/* Jnode manipulation functions. */
+/* Jnode is an entity used to track blocks with data and meta-data in reiser4.
+
+ In particular, jnodes are used to track transactional information
+   associated with each block. Each znode contains a jnode as its ->zjnode field.
+
+ Jnode stands for either Josh or Journal node.
+*/
+
+/*
+ * Taxonomy.
+ *
+ * Jnode represents block containing data or meta-data. There are jnodes
+ * for:
+ *
+ *     unformatted blocks (jnodes proper). There are plans, however, to
+ *     have a handle per extent unit rather than one per unformatted
+ *     block, because there are so many of them.
+ *
+ * For bitmaps. Each bitmap is actually represented by two jnodes--one
+ *     for working data and another for "commit" data, together forming a bnode.
+ *
+ * For io-heads. These are used by log writer.
+ *
+ * For formatted nodes (znode). See comment at the top of znode.c for
+ * details specific to the formatted nodes (znodes).
+ *
+ * Node data.
+ *
+ * Jnode provides access to the data of node it represents. Data are
+ * stored in a page. Page is kept in a page cache. This means, that jnodes
+ * are highly interconnected with page cache and VM internals.
+ *
+ * jnode has a pointer to page (->pg) containing its data. Pointer to data
+ * themselves is cached in ->data field to avoid frequent calls to
+ * page_address().
+ *
+ * jnode and page are attached to each other by jnode_attach_page(). This
+ * function places pointer to jnode in set_page_private(), sets PG_private
+ * flag and increments page counter.
+ *
+ * Opposite operation is performed by page_clear_jnode().
+ *
+ * jnode->pg is protected by jnode spin lock, and page->private is
+ * protected by page lock. See comment at the top of page_cache.c for
+ * more.
+ *
+ * page can be detached from jnode for two reasons:
+ *
+ *     . jnode is removed from a tree (file is truncated, or a formatted
+ *     node is removed by balancing).
+ *
+ * . during memory pressure, VM calls ->releasepage() method
+ * (reiser4_releasepage()) to evict page from memory.
+ *
+ * (there, of course, is also umount, but this is special case we are not
+ * concerned with here).
+ *
+ * To protect jnode page from eviction, one calls jload() function that
+ * "pins" page in memory (loading it if necessary), increments
+ * jnode->d_count, and kmap()s page. Page is unpinned through call to
+ * jrelse().
+ *
+ * Jnode life cycle.
+ *
+ * jnode is created, placed in hash table, and, optionally, in per-inode
+ * radix tree. Page can be attached to jnode, pinned, released, etc.
+ *
+ * When jnode is captured into atom its reference counter is
+ * increased. While being part of an atom, jnode can be "early
+ * flushed". This means that as part of flush procedure, jnode is placed
+ * into "relocate set", and its page is submitted to the disk. After io
+ * completes, page can be detached, then loaded again, re-dirtied, etc.
+ *
+ *     A thread acquires a reference to a jnode by calling jref() and releases
+ *     it by jput(). When the last reference is removed, the jnode is still
+ *     retained in memory (cached) if it has a page attached, _unless_ it is
+ *     scheduled for destruction (has JNODE_HEARD_BANSHEE bit set).
+ *
+ * Tree read-write lock was used as "existential" lock for jnodes. That is,
+ * jnode->x_count could be changed from 0 to 1 only under tree write lock,
+ *     that is, the tree lock protected unreferenced jnodes, stored in the hash
+ *     table, from recycling.
+ *
+ *     This resulted in high contention on the tree lock, because jref()/jput()
+ *     is a frequent operation. To ameliorate this problem, RCU is used: when
+ *     jput() is just about to release the last reference on a jnode it sets
+ *     the JNODE_RIP bit on it, and then proceeds with jnode destruction
+ *     (removing the jnode from the hash
+ * table, cbk_cache, detaching page, etc.). All places that change jnode
+ * reference counter from 0 to 1 (jlookup(), zlook(), zget(), and
+ * cbk_cache_scan_slots()) check for JNODE_RIP bit (this is done by
+ * jnode_rip_check() function), and pretend that nothing was found in hash
+ * table if bit is set.
+ *
+ *     jput() defers the actual return of the jnode into the slab cache to some
+ *     later time (by call_rcu()); this guarantees that other threads can safely
+ *     continue working with a JNODE_RIP-ped jnode.
+ *
+ */
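+
+/*
+ * A typical access pattern, pieced together from the description above
+ * (a sketch only, error handling omitted; oid and index stand for the
+ * object id and page index of the block of interest):
+ *
+ *	jnode *node = jlookup(oid, index);	// takes an x-reference
+ *	if (node != NULL) {
+ *		if (jload(node) == 0) {		// pins page, takes a d-reference
+ *			... access node->data ...
+ *			jrelse(node);		// drops the d-reference
+ *		}
+ *		jput(node);			// drops the x-reference
+ *	}
+ */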
+
+#include "reiser4.h"
+#include "debug.h"
+#include "dformat.h"
+#include "jnode.h"
+#include "plugin/plugin_header.h"
+#include "plugin/plugin.h"
+#include "txnmgr.h"
+/*#include "jnode.h"*/
+#include "znode.h"
+#include "tree.h"
+#include "tree_walk.h"
+#include "super.h"
+#include "inode.h"
+#include "page_cache.h"
+
+#include <asm/uaccess.h> /* UML needs this for PAGE_OFFSET */
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/pagemap.h>
+#include <linux/swap.h>
+#include <linux/fs.h> /* for struct address_space */
+#include <linux/writeback.h> /* for inode_wb_list_lock */
+
+static struct kmem_cache *_jnode_slab = NULL;
+
+static void jnode_set_type(jnode *node, jnode_type type);
+static int jdelete(jnode *node);
+static int jnode_try_drop(jnode *node);
+static int jnode_start_read(jnode *node, struct page *page);
+
+#if REISER4_DEBUG
+static int jnode_invariant(jnode *node, int tlocked, int jlocked);
+#endif
+
+/* true if valid page is attached to jnode */
+static inline int jnode_is_parsed(jnode * node)
+{
+ return JF_ISSET(node, JNODE_PARSED);
+}
+
+/* hash table support */
+
+/* compare two jnode keys for equality. Used by hash-table macros */
+static inline int jnode_key_eq(const struct jnode_key *k1,
+ const struct jnode_key *k2)
+{
+ assert("nikita-2350", k1 != NULL);
+ assert("nikita-2351", k2 != NULL);
+
+ return (k1->index == k2->index && k1->objectid == k2->objectid);
+}
+
+/* Hash jnode by its key (inode plus offset). Used by hash-table macros */
+static inline __u32 jnode_key_hashfn(j_hash_table * table,
+ const struct jnode_key *key)
+{
+ assert("nikita-2352", key != NULL);
+ assert("nikita-3346", IS_POW(table->_buckets));
+
+	/* yes, this is a remarkably simple (if not stupid) hash function. */
+ return (key->objectid + key->index) & (table->_buckets - 1);
+}
+
+/* The hash table definition */
+#define KMALLOC(size) reiser4_vmalloc(size)
+#define KFREE(ptr, size) vfree(ptr)
+TYPE_SAFE_HASH_DEFINE(j, jnode, struct jnode_key, key.j, link.j,
+ jnode_key_hashfn, jnode_key_eq);
+#undef KFREE
+#undef KMALLOC
+
+int reiser4_jnodes_init(void)
+{
+ return j_hash_init(&get_current_super_private()->jhash_table, 16384);
+}
+
+/* call this to destroy jnode hash table. This is called during umount. */
+int reiser4_jnodes_done(void)
+{
+ j_hash_table *jtable;
+ jnode *node;
+ jnode *next;
+ /*
+ * Scan hash table and free all jnodes.
+ */
+ jtable = &get_current_super_private()->jhash_table;
+ if (jtable->_table) {
+ for_all_in_htable(jtable, j, node, next) {
+ assert("nikita-2361", !atomic_read(&node->x_count));
+ jdrop(node);
+ }
+ j_hash_done(&get_current_super_private()->jhash_table);
+ }
+ return 0;
+}
+
+/**
+ * init_jnodes - create jnode cache
+ *
+ * Initializes slab cache jnodes. It is part of reiser4 module initialization.
+ */
+int init_jnodes(void)
+{
+ assert("umka-168", _jnode_slab == NULL);
+
+ _jnode_slab = kmem_cache_create("jnode", sizeof(jnode), 0,
+ SLAB_HWCACHE_ALIGN |
+ SLAB_RECLAIM_ACCOUNT, NULL);
+ if (_jnode_slab == NULL)
+ return RETERR(-ENOMEM);
+
+ return 0;
+}
+
+/**
+ * done_jnodes - delete jnode cache
+ *
+ * This is called on reiser4 module unloading or system shutdown.
+ */
+void done_jnodes(void)
+{
+ destroy_reiser4_cache(&_jnode_slab);
+}
+
+#if REISER4_DEBUG
+void jnode_init_tail(jnode *node)
+{
+ reiser4_super_info_data *sbinfo;
+
+ sbinfo = get_current_super_private();
+ spin_lock_irq(&sbinfo->all_guard);
+ list_add(&node->jnodes, &sbinfo->all_jnodes);
+ spin_unlock_irq(&sbinfo->all_guard);
+}
+#else
+#define jnode_init_tail(node) noop
+#endif /* REISER4_DEBUG */
+
+/* Initialize a jnode. */
+void jnode_init(jnode *node, reiser4_subvol *subv, jnode_type type)
+{
+ assert("edward-2398", is_in_reiser4_context());
+
+ memset(node, 0, sizeof(jnode));
+ ON_DEBUG(node->magic = JMAGIC);
+ jnode_set_type(node, type);
+ atomic_set(&node->d_count, 0);
+ atomic_set(&node->x_count, 0);
+ spin_lock_init(&node->guard);
+ spin_lock_init(&node->load);
+ node->atom = NULL;
+ node->subvol = subv;
+ node->super = reiser4_get_current_sb();
+ INIT_LIST_HEAD(&node->capture_link);
+ init_waitqueue_head(&node->wait_jload);
+ ASSIGN_NODE_LIST(node, NOT_CAPTURED);
+ jnode_init_tail(node);
+}
+
+#if REISER4_DEBUG
+/*
+ * Remove jnode from ->all_jnodes list.
+ */
+static void jnode_done(jnode *node)
+{
+ reiser4_super_info_data *sbinfo;
+
+ sbinfo = get_super_private(jnode_get_super(node));
+
+ spin_lock_irq(&sbinfo->all_guard);
+ assert("nikita-2422", !list_empty(&node->jnodes));
+ list_del_init(&node->jnodes);
+ spin_unlock_irq(&sbinfo->all_guard);
+}
+#endif
+
+/* return already existing jnode of page */
+jnode *jnode_by_page(struct page *pg)
+{
+ assert("nikita-2400", PageLocked(pg));
+ assert("nikita-2068", PagePrivate(pg));
+ assert("nikita-2067", jprivate(pg) != NULL);
+ return jprivate(pg);
+}
+
+/* exported functions to allocate/free jnode objects outside this file */
+jnode *jalloc(void)
+{
+ jnode *jal = kmem_cache_alloc(_jnode_slab, reiser4_ctx_gfp_mask_get());
+ return jal;
+}
+
+/* return jnode back to the slab allocator */
+inline void jfree(jnode * node)
+{
+ assert("nikita-2663", (list_empty_careful(&node->capture_link) &&
+ NODE_LIST(node) == NOT_CAPTURED));
+ assert("nikita-3222", list_empty(&node->jnodes));
+ assert("nikita-3221", jnode_page(node) == NULL);
+
+ /* not yet phash_jnode_destroy(node); */
+
+ kmem_cache_free(_jnode_slab, node);
+}
+
+/*
+ * This function is supplied as RCU callback. It actually frees jnode when
+ * last reference to it is gone.
+ */
+static void jnode_free_actor(struct rcu_head *head)
+{
+ jnode *node;
+ jnode_type jtype;
+
+ node = container_of(head, jnode, rcu);
+ jtype = jnode_get_type(node);
+
+ ON_DEBUG(jnode_done(node));
+
+ switch (jtype) {
+ case JNODE_IO_HEAD:
+ case JNODE_BITMAP:
+ case JNODE_VOLINFO_HEAD:
+ case JNODE_UNFORMATTED_BLOCK:
+ jfree(node);
+ break;
+ case JNODE_FORMATTED_BLOCK:
+ zfree(JZNODE(node));
+ break;
+ default:
+ wrong_return_value("nikita-3197", "Wrong jnode type");
+ }
+}
+
+/*
+ * Free a jnode. Post a callback to be executed later through RCU when all
+ * references to @node are released.
+ */
+static inline void jnode_free(jnode * node, jnode_type jtype)
+{
+ /*assert("nikita-3219", list_empty(&node->rcu.list)); */
+ call_rcu(&node->rcu, jnode_free_actor);
+}
+
+/* allocate new unformatted jnode */
+static jnode *jnew_unformatted(void)
+{
+ jnode *jal;
+
+ jal = jalloc();
+ if (jal == NULL)
+ return NULL;
+
+ jnode_init(jal, NULL, JNODE_UNFORMATTED_BLOCK);
+ jal->key.j.mapping = NULL;
+ jal->key.j.index = (unsigned long)-1;
+ jal->key.j.objectid = 0;
+ return jal;
+}
+
+/* look for jnode with given mapping and offset within hash table */
+jnode *jlookup(oid_t objectid, unsigned long index)
+{
+ struct jnode_key jkey;
+ jnode *node;
+
+ jkey.objectid = objectid;
+ jkey.index = index;
+
+ /*
+ * hash table is _not_ protected by any lock during lookups. All we
+ * have to do is to disable preemption to keep RCU happy.
+ */
+
+ rcu_read_lock();
+ node = j_hash_find(&get_current_super_private()->jhash_table, &jkey);
+ if (node != NULL) {
+ /* protect @node from recycling */
+ jref(node);
+ assert("nikita-2955", jnode_invariant(node, 0, 0));
+ node = jnode_rip_check(node);
+ }
+ rcu_read_unlock();
+ return node;
+}
+
+/* per inode radix tree of jnodes is protected by tree's read write spin lock */
+static jnode *jfind_nolock(struct address_space *mapping, unsigned long index)
+{
+ assert("vs-1694", mapping->host != NULL);
+
+ return radix_tree_lookup(jnode_tree_by_inode(mapping->host), index);
+}
+
+jnode *jfind(struct address_space *mapping, unsigned long index)
+{
+ reiser4_tree *tree;
+ jnode *node;
+
+ assert("vs-1694", mapping->host != NULL);
+ tree = meta_subvol_tree();
+
+ read_lock_tree();
+ node = jfind_nolock(mapping, index);
+ if (node != NULL)
+ jref(node);
+ read_unlock_tree();
+ return node;
+}
+
+static void inode_attach_jnode(jnode * node)
+{
+ struct inode *inode;
+ reiser4_inode *info;
+ struct radix_tree_root *rtree;
+
+ assert("zam-1043", node->key.j.mapping != NULL);
+ inode = node->key.j.mapping->host;
+ info = reiser4_inode_data(inode);
+ rtree = jnode_tree_by_reiser4_inode(info);
+ if (radix_tree_empty(rtree)) {
+ /* prevent inode from being pruned when it has jnodes attached
+ to it */
+ xa_lock_irq(&inode->i_data.i_pages);
+ inode->i_data.nrpages++;
+ xa_unlock_irq(&inode->i_data.i_pages);
+ }
+ assert("zam-1049",
+ equi(!radix_tree_empty(rtree), info->nr_jnodes != 0));
+ check_me("zam-1045",
+ !radix_tree_insert(rtree, node->key.j.index, node));
+ ON_DEBUG(info->nr_jnodes++);
+}
+
+static void inode_detach_jnode(jnode * node)
+{
+ struct inode *inode;
+ reiser4_inode *info;
+ struct radix_tree_root *rtree;
+
+ assert("zam-1044", node->key.j.mapping != NULL);
+ inode = node->key.j.mapping->host;
+ info = reiser4_inode_data(inode);
+ rtree = jnode_tree_by_reiser4_inode(info);
+
+ assert("zam-1051", info->nr_jnodes != 0);
+ assert("zam-1052", !radix_tree_empty(rtree));
+ ON_DEBUG(info->nr_jnodes--);
+
+ /* delete jnode from inode's radix tree of jnodes */
+ check_me("zam-1046", radix_tree_delete(rtree, node->key.j.index));
+ if (radix_tree_empty(rtree)) {
+ /* inode can be pruned now */
+ xa_lock_irq(&inode->i_data.i_pages);
+ inode->i_data.nrpages--;
+ xa_unlock_irq(&inode->i_data.i_pages);
+ }
+}
+
+/* put jnode into the hash table (where it can be found by flush, which does not
+   know the mapping) and into the inode's tree of jnodes (where it can be found,
+   hopefully faster, in places where the mapping is known). Currently it is used
+   by fs/reiser4/plugin/item/extent_file_ops.c:index_extent_jnode when a new
+   jnode is created */
+static void
+hash_unformatted_jnode(jnode * node, struct address_space *mapping,
+ unsigned long index)
+{
+ j_hash_table *jtable;
+
+ assert("vs-1446", jnode_is_unformatted(node));
+ assert("vs-1442", node->key.j.mapping == 0);
+ assert("vs-1443", node->key.j.objectid == 0);
+ assert("vs-1444", node->key.j.index == (unsigned long)-1);
+
+ node->key.j.mapping = mapping;
+ node->key.j.objectid = get_inode_oid(mapping->host);
+ node->key.j.index = index;
+
+ jtable = &get_current_super_private()->jhash_table;
+
+ /* race with some other thread inserting jnode into the hash table is
+ * impossible, because we keep the page lock. */
+ /*
+ * following assertion no longer holds because of RCU: it is possible
+ * jnode is in the hash table, but with JNODE_RIP bit set.
+ */
+ /* assert("nikita-3211", j_hash_find(jtable, &node->key.j) == NULL); */
+ j_hash_insert_rcu(jtable, node);
+ inode_attach_jnode(node);
+}
+
+static void unhash_unformatted_node_nolock(jnode *node)
+{
+ assert("vs-1683", node->key.j.mapping != NULL);
+ assert("vs-1684",
+ node->key.j.objectid ==
+ get_inode_oid(node->key.j.mapping->host));
+
+ /* remove jnode from hash-table */
+ j_hash_remove_rcu(&get_super_private(jnode_get_super(node))->jhash_table, node);
+ inode_detach_jnode(node);
+ node->key.j.mapping = NULL;
+ node->key.j.index = (unsigned long)-1;
+ node->key.j.objectid = 0;
+}
+
+/* remove jnode from hash table and from inode's tree of jnodes. This is used in
+ reiser4_invalidatepage and in kill_hook_extent -> truncate_inode_jnodes ->
+ reiser4_uncapture_jnode */
+void unhash_unformatted_jnode(jnode *node)
+{
+ assert("vs-1445", jnode_is_unformatted(node));
+
+ __write_lock_tree(get_super_private(jnode_get_super(node)));
+ unhash_unformatted_node_nolock(node);
+ __write_unlock_tree(get_super_private(jnode_get_super(node)));
+}
+
+/*
+ * search hash table for a jnode with given oid and index. If not found,
+ * allocate new jnode, insert it, and also insert into radix tree for the
+ * given inode/mapping.
+ */
+static jnode *find_get_jnode(struct address_space *mapping,
+ oid_t oid, unsigned long index)
+{
+ jnode *result;
+ jnode *shadow;
+ int preload;
+ reiser4_super_info_data *info;
+
+ info = get_super_private(mapping->host->i_sb);
+ result = jnew_unformatted();
+
+ if (unlikely(result == NULL))
+ return ERR_PTR(RETERR(-ENOMEM));
+
+ preload = radix_tree_preload(reiser4_ctx_gfp_mask_get());
+ if (preload != 0)
+ return ERR_PTR(preload);
+
+ __write_lock_tree(info);
+ shadow = jfind_nolock(mapping, index);
+ if (likely(shadow == NULL)) {
+ /* add new jnode to hash table and inode's radix tree of
+ * jnodes */
+ jref(result);
+ hash_unformatted_jnode(result, mapping, index);
+ } else {
+ /* jnode is found in inode's radix tree of jnodes */
+ jref(shadow);
+ jnode_free(result, JNODE_UNFORMATTED_BLOCK);
+ assert("vs-1498", shadow->key.j.mapping == mapping);
+ result = shadow;
+ }
+ __write_unlock_tree(info);
+
+ assert("nikita-2955",
+ ergo(result != NULL, jnode_invariant(result, 0, 0)));
+ radix_tree_preload_end();
+ return result;
+}
+
+/* jget() (a la zget() but for unformatted nodes). Returns (and possibly
+ creates) jnode corresponding to page @pg. jnode is attached to page and
+ inserted into jnode hash-table. */
+jnode *do_jget(struct page *pg)
+{
+ /*
+ * There are two ways to create jnode: starting with pre-existing page
+ * and without page.
+ *
+ * When page already exists, jnode is created
+ * (jnode_of_page()->do_jget()) under page lock. This is done in
+ * ->writepage(), or when capturing anonymous page dirtied through
+ * mmap.
+ *
+ * Jnode without page is created by index_extent_jnode().
+ *
+ */
+
+ jnode *result;
+ oid_t oid = get_inode_oid(pg->mapping->host);
+
+ assert("umka-176", pg != NULL);
+ assert("nikita-2394", PageLocked(pg));
+
+ result = jprivate(pg);
+ if (likely(result != NULL))
+ return jref(result);
+
+ /* check hash-table first */
+ result = jfind(pg->mapping, pg->index);
+ if (unlikely(result != NULL)) {
+ spin_lock_jnode(result);
+ jnode_attach_page(result, pg);
+ spin_unlock_jnode(result);
+ result->key.j.mapping = pg->mapping;
+ return result;
+ }
+
+ /* since page is locked, jnode should be allocated with GFP_NOFS flag */
+ reiser4_ctx_gfp_mask_force(GFP_NOFS);
+ result = find_get_jnode(pg->mapping, oid, pg->index);
+ if (unlikely(IS_ERR(result)))
+ return result;
+ /* attach jnode to page */
+ spin_lock_jnode(result);
+ jnode_attach_page(result, pg);
+ spin_unlock_jnode(result);
+ return result;
+}
+
+/**
+ * jnode_of_page - return jnode for @pg, creating it if necessary.
+ *
+ * @pg must be locked by the caller. The resulting jnode is attached to the
+ * page and inserted into the jnode hash table if it was not there yet.
+ */
+jnode *jnode_of_page(struct page *pg)
+{
+ jnode *result;
+
+ assert("nikita-2394", PageLocked(pg));
+
+ result = do_jget(pg);
+
+ if (REISER4_DEBUG && !IS_ERR(result)) {
+ assert("nikita-3210", result == jprivate(pg));
+ assert("nikita-2046", jnode_page(jprivate(pg)) == pg);
+ if (jnode_is_unformatted(jprivate(pg))) {
+ assert("nikita-2364",
+ jprivate(pg)->key.j.index == pg->index);
+ assert("nikita-2367",
+ jprivate(pg)->key.j.mapping == pg->mapping);
+ assert("nikita-2365",
+ jprivate(pg)->key.j.objectid ==
+ get_inode_oid(pg->mapping->host));
+ assert("vs-1200",
+ jprivate(pg)->key.j.objectid ==
+ pg->mapping->host->i_ino);
+ assert("nikita-2356",
+ jnode_is_unformatted(jnode_by_page(pg)));
+ }
+ assert("nikita-2956", jnode_invariant(jprivate(pg), 0, 0));
+ }
+ return result;
+}
+
+/* attach page to jnode: set ->pg pointer in jnode, and ->private one in the
+ * page.*/
+void jnode_attach_page(jnode * node, struct page *pg)
+{
+ assert("nikita-2060", node != NULL);
+ assert("nikita-2061", pg != NULL);
+
+ assert("nikita-2050", jprivate(pg) == 0ul);
+ assert("nikita-2393", !PagePrivate(pg));
+ assert("vs-1741", node->pg == NULL);
+
+ assert("nikita-2396", PageLocked(pg));
+ assert_spin_locked(&(node->guard));
+
+ get_page(pg);
+ set_page_private(pg, (unsigned long)node);
+ node->pg = pg;
+ SetPagePrivate(pg);
+}
+
+/* Dual to jnode_attach_page: break a binding between page and jnode */
+void page_clear_jnode(struct page *page, jnode * node)
+{
+ assert("nikita-2425", PageLocked(page));
+ assert_spin_locked(&(node->guard));
+ assert("nikita-2428", PagePrivate(page));
+
+ assert("nikita-3551", !PageWriteback(page));
+
+ JF_CLR(node, JNODE_PARSED);
+ set_page_private(page, 0ul);
+ ClearPagePrivate(page);
+ node->pg = NULL;
+ put_page(page);
+}
+
+#if 0
+/* it is only used in one place to handle error */
+void
+page_detach_jnode(struct page *page, struct address_space *mapping,
+ unsigned long index)
+{
+ assert("nikita-2395", page != NULL);
+
+ lock_page(page);
+ if ((page->mapping == mapping) && (page->index == index)
+ && PagePrivate(page)) {
+ jnode *node;
+
+ node = jprivate(page);
+ spin_lock_jnode(node);
+ page_clear_jnode(page, node);
+ spin_unlock_jnode(node);
+ }
+ unlock_page(page);
+}
+#endif /* 0 */
+
+/* return @node page locked.
+
+ Locking ordering requires that one first takes page lock and afterwards
+ spin lock on node attached to this page. Sometimes it is necessary to go in
+ the opposite direction. This is done through standard trylock-and-release
+ loop.
+*/
+static struct page *jnode_lock_page(jnode * node)
+{
+ struct page *page;
+
+ assert("nikita-2052", node != NULL);
+ assert("nikita-2401", LOCK_CNT_NIL(spin_locked_jnode));
+
+ while (1) {
+
+ spin_lock_jnode(node);
+ page = jnode_page(node);
+ if (page == NULL)
+ break;
+
+ /* no need to get_page( page ) here, because page cannot
+ be evicted from memory without detaching it from jnode and
+ this requires spin lock on jnode that we already hold.
+ */
+ if (trylock_page(page)) {
+ /* We won a lock on jnode page, proceed. */
+ break;
+ }
+
+ /* Page is locked by someone else. */
+ get_page(page);
+ spin_unlock_jnode(node);
+ wait_on_page_locked(page);
+ /* it is possible that page was detached from jnode and
+ returned to the free pool, or re-assigned while we were
+ waiting on locked bit. This will be rechecked on the next
+ loop iteration.
+ */
+ put_page(page);
+
+ /* try again */
+ }
+ return page;
+}
+
+static struct page *__jnode_get_page_locked(jnode * node, gfp_t gfp_flags);
+
+/**
+ * Load jnode's data into memory and parse it.
+ * In the case of IO errors (original device has died, etc), or if
+ * parsing failed for some reasons (bitrot, etc), restart IO against
+ * replica devices and parse the results.
+ *
+ * Pre- and post-conditions: @node is spin-locked
+ */
+static int __jload_gfp_failover(jnode *node,
+ gfp_t gfp_flags,
+ int do_kmap /* true if page should be
+ kmap-ped on success */)
+{
+ int ret = 0;
+ u32 mirr_id;
+ struct page *page;
+ int first_iter = 1;
+ reiser4_subvol *orig = jnode_get_subvol(node);
+
+ page = __jnode_get_page_locked(node, gfp_flags);
+ if (unlikely(IS_ERR(page)))
+ return PTR_ERR(page);
+
+ spin_unlock_jnode(node);
+
+ __for_each_mirror(orig, mirr_id) {
+ if (!first_iter) {
+ lock_page(page);
+ node->subvol = current_mirror(orig->id, mirr_id);
+ }
+ first_iter = 0;
+ ret = jnode_start_read(node, page);
+ if (unlikely(ret != 0))
+ break;
+ wait_on_page_locked(page);
+ if (unlikely(!PageUptodate(page))) {
+ warning("edward-1810", "Can't load block %llu on %s.",
+ *jnode_get_block(node),
+ node->subvol->name);
+ ret = RETERR(-EIO);
+ goto load_from_replica;
+ }
+ node->data = kmap(page);
+ ret = jnode_ops(node)->parse(node);
+ if (likely(ret == 0))
+ break;
+ ret = RETERR(-EIO);
+ warning("edward-1811", "Block %llu on %s looks corrupted.\n",
+ *jnode_get_block(node),
+ node->subvol->name);
+ ClearPageUptodate(page);
+ kunmap(page);
+ load_from_replica:
+ if (mirr_id < orig->num_replicas)
+ notice("edward-1812",
+ "Loading from replica device %s.",
+ current_mirror(orig->id,
+ mirr_id + 1)->name);
+ }
+ /*
+ * TODO: Correct "fixable" errors here (the case of failed parsing).
+ * That is, issue read IOs with correct content against devices with
+ * "problematic" blocks.
+ */
+ spin_lock_jnode(node);
+ /*
+ * set back the original subvolume
+ */
+ node->subvol = orig;
+ if (ret == 0 && !do_kmap)
+ kunmap(page);
+ return ret;
+}
+
+/**
+ * Check if someone has already started to load @node.
+ * If so, then wait for completion and return the result of that attempt
+ * of loading. Otherwise, try to load it by yourself.
+ */
+static int jload_gfp_failover(jnode *node, gfp_t gfp_flags, int do_kmap)
+{
+ int result;
+
+ assert("nikita-2466", node != NULL);
+
+ spin_lock_jnode(node);
+
+ if (JF_ISSET(node, JNODE_LOADING_IN_PROGRESS)) {
+ spin_unlock_jnode(node);
+ wait_event(node->wait_jload,
+ !JF_ISSET(node, JNODE_LOADING_IN_PROGRESS));
+ spin_lock_jnode(node);
+
+ if (likely(JF_ISSET(node, JNODE_PARSED)))
+ result = 0;
+ else {
+ BUG_ON(!JF_ISSET(node, JNODE_PARSING_FAILED));
+ result = RETERR(-EIO);
+ }
+ }
+ else if (JF_ISSET(node, JNODE_PARSED))
+ result = 0;
+ else if (JF_ISSET(node, JNODE_PARSING_FAILED))
+ result = RETERR(-EIO);
+ else {
+ JF_SET(node, JNODE_LOADING_IN_PROGRESS);
+ result = __jload_gfp_failover(node, gfp_flags, do_kmap);
+ if (likely(result == 0))
+ JF_SET(node, JNODE_PARSED);
+ else
+ JF_SET(node, JNODE_PARSING_FAILED);
+ JF_CLR(node, JNODE_LOADING_IN_PROGRESS);
+ wake_up(&node->wait_jload);
+ }
+
+ spin_unlock_jnode(node);
+ return result;
+}
+
+/**
+ * Lock a page attached to jnode, create and attach page to jnode
+ * if it had no one.
+ *
+ * Pre-condition: @node is spin-locked
+ */
+static struct page *__jnode_get_page_locked(jnode * node, gfp_t gfp_flags)
+{
+ struct page *page = jnode_page(node);
+
+ if (page == NULL) {
+ spin_unlock_jnode(node);
+ page = find_or_create_page(jnode_get_mapping(node),
+ jnode_get_index(node), gfp_flags);
+ if (page == NULL) {
+ spin_lock_jnode(node);
+ return ERR_PTR(RETERR(-ENOMEM));
+ }
+ } else {
+ if (trylock_page(page))
+ return page;
+ get_page(page);
+ spin_unlock_jnode(node);
+ lock_page(page);
+ assert("nikita-3134", page->mapping == jnode_get_mapping(node));
+ }
+ spin_lock_jnode(node);
+ if (!jnode_page(node))
+ jnode_attach_page(node, page);
+ put_page(page);
+ assert("zam-894", jnode_page(node) == page);
+ return page;
+}
+
+static struct page *jnode_get_page_locked(jnode *node, gfp_t gfp_flags)
+{
+ struct page *page;
+
+ spin_lock_jnode(node);
+ page = __jnode_get_page_locked(node, gfp_flags);
+ spin_unlock_jnode(node);
+ return page;
+}
+
+/* Start read operation for jnode's page if page is not up-to-date. */
+static int jnode_start_read(jnode * node, struct page *page)
+{
+ assert("zam-893", PageLocked(page));
+
+ if (PageUptodate(page)) {
+ unlock_page(page);
+ return 0;
+ }
+ return reiser4_page_io(page, node, READ, reiser4_ctx_gfp_mask_get());
+}
+
+#if REISER4_DEBUG
+static void check_jload(jnode * node, struct page *page)
+{
+ if (jnode_is_znode(node)) {
+ znode *z = JZNODE(node);
+
+ if (znode_is_any_locked(z)) {
+ assert("nikita-3253",
+ z->nr_items ==
+ node_plugin_by_node(z)->num_of_items(z));
+ kunmap(page);
+ }
+ assert("nikita-3565", znode_invariant(z));
+ }
+}
+#else
+#define check_jload(node, page) noop
+#endif
+
+/**
+ * prefetch jnode to speed up next call to jload. Call this when you are going
+ * to call jload() shortly. This will bring appropriate portion of jnode into
+ * CPU cache
+ */
+void jload_prefetch(jnode * node)
+{
+ prefetchw(&node->x_count);
+}
+
+/**
+ * Load jnode's data into memory
+ */
+int jload_gfp(jnode *node /* node to load */ ,
+ gfp_t gfp_flags /* allocation flags */ ,
+ int do_kmap /* true if page should be kmapped */)
+{
+ struct page *page;
+ int result = 0;
+ int parsed;
+
+ assert("nikita-3010", reiser4_schedulable());
+
+ prefetchw(&node->pg);
+ /*
+ * taking d-reference implies taking x-reference
+ */
+ jref(node);
+ /*
+ * acquiring d-reference to @jnode and check for JNODE_PARSED bit
+ * should be atomic, otherwise there is a race against
+ * reiser4_releasepage().
+ */
+ spin_lock(&(node->load));
+ add_d_ref(node);
+ parsed = jnode_is_parsed(node);
+ spin_unlock(&(node->load));
+
+ if (unlikely(!parsed)) {
+ result = jload_gfp_failover(node, gfp_flags, do_kmap);
+ if (unlikely(result != 0))
+ goto failed;
+ page = jnode_page(node);
+ } else {
+ page = jnode_page(node);
+ if (do_kmap)
+ node->data = kmap(page);
+ }
+ check_jload(node, page);
+
+ if (!is_writeout_mode())
+ /*
+ * We do not mark pages active if jload is called as a part of
+ * jnode_flush() or reiser4_write_logs(). Both jnode_flush()
+		 * and write_logs() add no value to cached data; there is no
+		 * sense in marking pages as active when they go to disk, it
+		 * just confuses VM scanning routines because a clean page
+		 * could be moved out from the inactive list as a result of this
+ * mark_page_accessed() call.
+ */
+ mark_page_accessed(page);
+ return 0;
+failed:
+ jrelse_tail(node);
+ return result;
+
+}
+
+/* start asynchronous reading for given jnode's page. */
+int jstartio(jnode * node)
+{
+ struct page *page;
+
+ page = jnode_get_page_locked(node, reiser4_ctx_gfp_mask_get());
+ if (IS_ERR(page))
+ return PTR_ERR(page);
+
+ return jnode_start_read(node, page);
+}
+
+/* Initialize a node by calling appropriate plugin instead of reading
+ * node from disk as in jload(). */
+int jinit_new(jnode * node, gfp_t gfp_flags)
+{
+ struct page *page;
+ int result;
+
+ jref(node);
+ add_d_ref(node);
+
+ page = jnode_get_page_locked(node, gfp_flags);
+ if (IS_ERR(page)) {
+ result = PTR_ERR(page);
+ goto failed;
+ }
+ SetPageUptodate(page);
+ unlock_page(page);
+
+ node->data = kmap(page);
+
+ if (!jnode_is_parsed(node)) {
+ jnode_plugin *jplug = jnode_ops(node);
+ assert("edward-1973", jplug != NULL);
+ assert("edward-1974", jplug->init != NULL);
+
+ spin_lock_jnode(node);
+ result = jplug->init(node);
+ spin_unlock_jnode(node);
+ if (result) {
+ kunmap(page);
+ goto failed;
+ }
+ JF_SET(node, JNODE_PARSED);
+ }
+ return 0;
+failed:
+ jrelse(node);
+ return result;
+}
+
+/* release a reference to jnode acquired by jload(), decrement ->d_count */
+void jrelse_tail(jnode * node/* jnode to release references to */)
+{
+ assert("nikita-489", atomic_read(&node->d_count) > 0);
+ atomic_dec(&node->d_count);
+ /* release reference acquired in jload_gfp() or jinit_new() */
+ if (jnode_is_unformatted(node) || jnode_is_znode(node))
+ LOCK_CNT_DEC(d_refs);
+ jput(node);
+}
+
+/* drop reference to node data. When last reference is dropped, data are
+ unloaded. */
+void jrelse(jnode * node/* jnode to release references to */)
+{
+ struct page *page;
+
+ assert("nikita-487", node != NULL);
+ assert_spin_not_locked(&(node->guard));
+
+ page = jnode_page(node);
+ if (likely(page != NULL)) {
+ /*
+ * it is safe not to lock jnode here, because at this point
+ * @node->d_count is greater than zero (if jrelse() is used
+ * correctly, that is). JNODE_PARSED may be not set yet, if,
+ * for example, we got here as a result of error handling path
+ * in jload(). Anyway, page cannot be detached by
+ * reiser4_releasepage(). truncate will invalidate page
+ * regardless, but this should not be a problem.
+ */
+ kunmap(page);
+ }
+ jrelse_tail(node);
+}
+
+/* called from jput() to wait for io completion */
+static void jnode_finish_io(jnode * node)
+{
+ struct page *page;
+
+ assert("nikita-2922", node != NULL);
+
+ spin_lock_jnode(node);
+ page = jnode_page(node);
+ if (page != NULL) {
+ get_page(page);
+ spin_unlock_jnode(node);
+ wait_on_page_writeback(page);
+ put_page(page);
+ } else
+ spin_unlock_jnode(node);
+}
+
+/*
+ * This is called by jput() when the last reference to a jnode is released. This
+ * is a separate function, because we want the fast path of jput() to be inline
+ * and, therefore, small.
+ */
+void jput_final(jnode *node)
+{
+ int r_i_p;
+
+ /* A fast check for keeping node in cache. We always keep node in cache
+ * if its page is present and node was not marked for deletion */
+ if (jnode_page(node) != NULL && !JF_ISSET(node, JNODE_HEARD_BANSHEE)) {
+ rcu_read_unlock();
+ return;
+ }
+ r_i_p = !JF_TEST_AND_SET(node, JNODE_RIP);
+ /*
+ * if r_i_p is true, we were first to set JNODE_RIP on this node. In
+ * this case it is safe to access node after unlock.
+ */
+ rcu_read_unlock();
+ if (r_i_p) {
+ jnode_finish_io(node);
+ if (JF_ISSET(node, JNODE_HEARD_BANSHEE))
+ /* node is removed from the tree. */
+ jdelete(node);
+ else
+ jnode_try_drop(node);
+ }
+ /* if !r_i_p some other thread is already killing it */
+}
+
+int jwait_io(jnode * node, int rw)
+{
+ struct page *page;
+ int result;
+
+ assert("zam-448", jnode_page(node) != NULL);
+
+ page = jnode_page(node);
+
+ result = 0;
+ if (rw == READ) {
+ wait_on_page_locked(page);
+ } else {
+ assert("nikita-2227", rw == WRITE);
+ wait_on_page_writeback(page);
+ }
+ if (PageError(page))
+ result = RETERR(-EIO);
+
+ return result;
+}
+
+/*
+ * jnode types and plugins.
+ *
+ * jnode by itself is a "base type". There are several different jnode
+ * flavors, called "jnode types" (see jnode_type for a list). Sometimes code
+ * has to do different things based on jnode type. In the standard reiser4 way
+ * this is done by having jnode plugin (see fs/reiser4/plugin.h:jnode_plugin).
+ *
+ * Functions below deal with jnode types and define methods of jnode plugin.
+ *
+ */
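+
+/*
+ * For example (a sketch; jnode_get_mapping() further down in this file uses
+ * exactly this dispatch), instead of switching on the jnode type, callers go
+ * through the plugin:
+ *
+ *	struct address_space *map = jnode_ops(node)->mapping(node);
+ *	unsigned long idx = jnode_ops(node)->index(node);
+ */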
+
+/* set jnode type. This is done during jnode initialization. */
+static void jnode_set_type(jnode * node, jnode_type type)
+{
+ static unsigned long type_to_mask[] = {
+ [JNODE_UNFORMATTED_BLOCK] = 1,
+ [JNODE_FORMATTED_BLOCK] = 0,
+ [JNODE_BITMAP] = 2,
+ [JNODE_IO_HEAD] = 6,
+ [JNODE_VOLINFO_HEAD] = 4
+ };
+
+ assert("zam-647", type < LAST_JNODE_TYPE);
+ assert("nikita-2815", !jnode_is_loaded(node));
+ assert("nikita-3386", node->state == 0);
+
+ node->state |= (type_to_mask[type] << JNODE_TYPE_1);
+}
+
+/* ->init() method of jnode plugin for jnodes that don't require plugin
+ * specific initialization. */
+static int init_noinit(jnode * node UNUSED_ARG)
+{
+ return 0;
+}
+
+/* ->parse() method of jnode plugin for jnodes that don't require plugin
+ * specific parsing. */
+static int parse_noparse(jnode * node UNUSED_ARG)
+{
+ return 0;
+}
+
+/* ->mapping() method for unformatted jnode */
+struct address_space *mapping_jnode(const jnode * node)
+{
+ struct address_space *map;
+
+ assert("nikita-2713", node != NULL);
+
+ /* mapping is stored in jnode */
+
+ map = node->key.j.mapping;
+ assert("nikita-2714", map != NULL);
+ assert("nikita-2897", is_reiser4_inode(map->host));
+ assert("nikita-2715", get_inode_oid(map->host) == node->key.j.objectid);
+ return map;
+}
+
+/* ->index() method for unformatted jnodes */
+unsigned long index_jnode(const jnode * node)
+{
+ /* index is stored in jnode */
+ return node->key.j.index;
+}
+
+/* ->remove() method for unformatted jnodes */
+static inline void remove_jnode(jnode *node)
+{
+ /* remove jnode from hash table and radix tree */
+ if (node->key.j.mapping)
+ unhash_unformatted_node_nolock(node);
+}
+
+/* ->mapping() method for znodes */
+static struct address_space *mapping_znode(const jnode * node)
+{
+ /* all znodes belong to fake inode */
+ return reiser4_get_super_fake(jnode_get_super(node))->i_mapping;
+}
+
+/* ->index() method for znodes */
+static unsigned long index_znode(const jnode * node)
+{
+ unsigned long addr;
+ assert("nikita-3317", (1 << znode_shift_order) < sizeof(znode));
+
+ /* index of znode is just its address (shifted) */
+ addr = (unsigned long)node;
+ return (addr - PAGE_OFFSET) >> znode_shift_order;
+}
+
+/* ->mapping() method for bitmap jnode */
+static struct address_space *mapping_bitmap(const jnode * node)
+{
+ /* all bitmap blocks belong to special bitmap inode */
+ return get_super_private(jnode_get_super(node))->bitmap->i_mapping;
+}
+
+/* ->index() method for jnodes that are indexed by address */
+static unsigned long index_is_address(const jnode * node)
+{
+ unsigned long ind;
+
+ ind = (unsigned long)node;
+ return ind - PAGE_OFFSET;
+}
+
+/* resolve race with jput */
+jnode *jnode_rip_sync(jnode *node)
+{
+ /*
+ * This is used as part of RCU-based jnode handling.
+ *
+ * jlookup(), zlook(), zget(), and cbk_cache_scan_slots() have to work
+ * with unreferenced jnodes (ones with ->x_count == 0). Hash table is
+ * not protected during this, so concurrent thread may execute
+ * zget-set-HEARD_BANSHEE-zput, or somehow else cause jnode to be
+ * freed in jput_final(). To avoid such races, jput_final() sets
+ * JNODE_RIP on jnode (under tree lock). All places that work with
+ * unreferenced jnodes call this function. It checks for JNODE_RIP bit
+	 * (first without taking the tree lock), and if this bit is set, releases
+	 * the reference acquired by the current thread and returns NULL.
+ *
+ * As a result, if jnode is being concurrently freed, NULL is returned
+ * and caller should pretend that jnode wasn't found in the first
+ * place.
+ *
+ * Otherwise it's safe to release "rcu-read-lock" and continue with
+ * jnode.
+ */
+ if (unlikely(JF_ISSET(node, JNODE_RIP))) {
+ read_lock_tree();
+ if (JF_ISSET(node, JNODE_RIP)) {
+ dec_x_ref(node);
+ node = NULL;
+ }
+ read_unlock_tree();
+ }
+ return node;
+}
+
+reiser4_key *jnode_build_key(const jnode *node, reiser4_key *key)
+{
+ loff_t off;
+ struct inode *inode;
+ file_plugin *fplug;
+
+ assert("nikita-3092", node != NULL);
+ assert("nikita-3093", key != NULL);
+ assert("nikita-3094", jnode_is_unformatted(node));
+
+ off = ((loff_t) index_jnode(node)) << PAGE_SHIFT;
+ inode = mapping_jnode(node)->host;
+ fplug = inode_file_plugin(inode);
+
+ assert("zam-1007", fplug != NULL);
+ assert("zam-1008", fplug->build_body_key != NULL);
+
+ fplug->build_body_key(inode, off, key);
+ return key;
+}
+
+/* ->parse() method for formatted nodes */
+static int parse_znode(jnode * node)
+{
+ return zparse(JZNODE(node));
+}
+
+/* ->delete() method for formatted nodes */
+static void delete_znode(jnode *node)
+{
+ znode *z;
+#if REISER4_DEBUG
+ reiser4_super_info_data *sbinfo;
+
+ assert("edward-2023", jnode_get_subvol(node) != NULL);
+
+ sbinfo = get_super_private(jnode_get_super(node));
+
+ assert_rw_write_locked(&(sbinfo->tree_lock));
+#endif
+ assert("vs-898", JF_ISSET(node, JNODE_HEARD_BANSHEE));
+
+ z = JZNODE(node);
+ assert("vs-899", z->c_count == 0);
+
+ /* delete znode from sibling list. */
+ sibling_list_remove(z);
+ znode_remove(z);
+}
+
+/*
+ * ->remove() method for formatted nodes
+ */
+static int remove_znode(jnode *node)
+{
+ znode *z;
+#if REISER4_DEBUG
+ reiser4_super_info_data *sbinfo;
+
+ assert("edward-2024", jnode_get_subvol(node) != NULL);
+
+ sbinfo = get_super_private(jnode_get_super(node));
+ assert_rw_write_locked(&(sbinfo->tree_lock));
+#endif
+ z = JZNODE(node);
+
+ if (z->c_count == 0) {
+ /* detach znode from sibling list. */
+ sibling_list_drop(z);
+ /* this is called with tree spin-lock held, so call
+ znode_remove() directly (rather than znode_lock_remove()). */
+ znode_remove(z);
+ return 0;
+ }
+ return RETERR(-EBUSY);
+}
+
+/* ->init() method for formatted nodes */
+int init_znode(jnode * node)
+{
+ znode *z;
+
+ z = JZNODE(node);
+ /* call node plugin to do actual initialization */
+ z->nr_items = 0;
+ return z->nplug->init(z);
+}
+
+/*
+ * Setup jnode plugin methods for various jnode types.
+ */
+jnode_plugin jnode_plugins[LAST_JNODE_TYPE] = {
+ [JNODE_UNFORMATTED_BLOCK] = {
+ .h = {
+ .type_id = REISER4_JNODE_PLUGIN_TYPE,
+ .id = JNODE_UNFORMATTED_BLOCK,
+ .pops = NULL,
+ .label = "unformatted",
+ .desc = "unformatted node",
+ .linkage = {NULL, NULL}
+ },
+ .init = init_noinit,
+ .parse = parse_noparse,
+ .mapping = mapping_jnode,
+ .index = index_jnode
+ },
+ [JNODE_FORMATTED_BLOCK] = {
+ .h = {
+ .type_id = REISER4_JNODE_PLUGIN_TYPE,
+ .id = JNODE_FORMATTED_BLOCK,
+ .pops = NULL,
+ .label = "formatted",
+ .desc = "formatted tree node",
+ .linkage = {NULL, NULL}
+ },
+ .init = init_znode,
+ .parse = parse_znode,
+ .mapping = mapping_znode,
+ .index = index_znode
+ },
+ [JNODE_BITMAP] = {
+ .h = {
+ .type_id = REISER4_JNODE_PLUGIN_TYPE,
+ .id = JNODE_BITMAP,
+ .pops = NULL,
+ .label = "bitmap",
+ .desc = "bitmap node",
+ .linkage = {NULL, NULL}
+ },
+ .init = init_noinit,
+ .parse = parse_noparse,
+ .mapping = mapping_bitmap,
+ .index = index_is_address
+ },
+ [JNODE_IO_HEAD] = {
+ .h = {
+ .type_id = REISER4_JNODE_PLUGIN_TYPE,
+ .id = JNODE_IO_HEAD,
+ .pops = NULL,
+ .label = "io head",
+ .desc = "io head",
+ .linkage = {NULL, NULL}
+ },
+ .init = init_noinit,
+ .parse = parse_noparse,
+ .mapping = mapping_bitmap,
+ .index = index_is_address
+ },
+ [JNODE_VOLINFO_HEAD] = {
+ .h = {
+ .type_id = REISER4_JNODE_PLUGIN_TYPE,
+ .id = JNODE_VOLINFO_HEAD,
+ .pops = NULL,
+ .label = "volinfo",
+ .desc = "volinfo head",
+ .linkage = {NULL, NULL}
+ },
+ .init = init_noinit,
+ .parse = parse_noparse,
+ .mapping = mapping_bitmap,
+ .index = index_is_address
+ }
+};
+
+/*
+ * jnode destruction.
+ *
+ * Thread may use a jnode after it acquired a reference to it. References are
+ * counted in ->x_count field. Reference protects jnode from being
+ * recycled. This is different from protecting jnode data (that are stored in
+ * jnode page) from being evicted from memory. Data are protected by jload()
+ * and released by jrelse().
+ *
+ * If thread already possesses a reference to the jnode it can acquire another
+ * one through jref(). Initial reference is obtained (usually) by locating
+ * jnode in some indexing structure that depends on jnode type: formatted
+ * nodes are kept in global hash table, where they are indexed by block
+ * number, and also in the cbk cache. Unformatted jnodes are also kept in hash
+ * table, which is indexed by oid and offset within file, and in per-inode
+ * radix tree.
+ *
+ * Reference to jnode is released by jput(). If last reference is released,
+ * jput_final() is called. This function determines whether jnode has to be
+ * deleted (this happens when corresponding node is removed from the file
+ * system, jnode is marked with JNODE_HEARD_BANSHEE bit in this case), or it
+ * should be just "removed" (deleted from memory).
+ *
+ * Jnode destruction is an especially delicate dance because of locking and RCU.
+ */
+
+/*
+ * Returns true if jnode cannot be removed right now. This check is called
+ * under tree lock. If it returns true, jnode is irrevocably committed to be
+ * deleted/removed.
+ */
+static inline int jnode_is_busy(const jnode * node, jnode_type jtype)
+{
+ /* if other thread managed to acquire a reference to this jnode, don't
+ * free it. */
+ if (atomic_read(&node->x_count) > 0)
+ return 1;
+ /* also, don't free znode that has children in memory */
+ if (jtype == JNODE_FORMATTED_BLOCK && JZNODE(node)->c_count > 0)
+ return 1;
+ return 0;
+}
+
+/*
+ * this is called as part of removing jnode. Based on jnode type, call
+ * corresponding function that removes jnode from indices and returns it back
+ * to the appropriate slab (through RCU).
+ */
+static inline void jnode_remove(jnode *node, jnode_type jtype)
+{
+ switch (jtype) {
+ case JNODE_UNFORMATTED_BLOCK:
+ remove_jnode(node);
+ break;
+ case JNODE_IO_HEAD:
+ case JNODE_BITMAP:
+ break;
+ case JNODE_VOLINFO_HEAD:
+ break;
+ case JNODE_FORMATTED_BLOCK:
+ remove_znode(node);
+ break;
+ default:
+ wrong_return_value("nikita-3196", "Wrong jnode type");
+ }
+}
+
+/*
+ * this is called as part of deleting jnode. Based on jnode type, call
+ * corresponding function that removes jnode from indices and returns it back
+ * to the appropriate slab (through RCU).
+ *
+ * This differs from jnode_remove() only for formatted nodes---for them
+ * sibling list handling is different for removal and deletion.
+ */
+static inline void jnode_delete(jnode *node, jnode_type jtype)
+{
+ switch (jtype) {
+ case JNODE_UNFORMATTED_BLOCK:
+ remove_jnode(node);
+ break;
+ case JNODE_IO_HEAD:
+ case JNODE_BITMAP:
+ break;
+ case JNODE_FORMATTED_BLOCK:
+ delete_znode(node);
+ break;
+ case JNODE_VOLINFO_HEAD:
+ default:
+ wrong_return_value("nikita-3195", "Wrong jnode type");
+ }
+}
+
+#if REISER4_DEBUG
+/*
+ * remove jnode from the debugging list of all jnodes hanging off super-block.
+ */
+void jnode_list_remove(jnode * node)
+{
+ reiser4_super_info_data *sbinfo;
+
+ sbinfo = get_super_private(jnode_get_super(node));
+
+ spin_lock_irq(&sbinfo->all_guard);
+ assert("nikita-2422", !list_empty(&node->jnodes));
+ list_del_init(&node->jnodes);
+ spin_unlock_irq(&sbinfo->all_guard);
+}
+#endif
+
+/*
+ * this is called by jput_final() to remove jnode when last reference to it is
+ * released.
+ */
+static int jnode_try_drop(jnode *node)
+{
+ int result;
+ jnode_type jtype;
+ reiser4_super_info_data *sbinfo;
+
+ assert("nikita-2491", node != NULL);
+ assert("nikita-2583", JF_ISSET(node, JNODE_RIP));
+
+ sbinfo = get_super_private(jnode_get_super(node));
+ jtype = jnode_get_type(node);
+
+ spin_lock_jnode(node);
+ __write_lock_tree(sbinfo);
+ /*
+ * if jnode has a page---leave it alone. Memory pressure will
+ * eventually kill page and jnode.
+ */
+ if (jnode_page(node) != NULL) {
+ __write_unlock_tree(sbinfo);
+ spin_unlock_jnode(node);
+ JF_CLR(node, JNODE_RIP);
+ return RETERR(-EBUSY);
+ }
+
+ /* re-check ->x_count under tree lock. */
+ result = jnode_is_busy(node, jtype);
+ if (result == 0) {
+ assert("nikita-2582", !JF_ISSET(node, JNODE_HEARD_BANSHEE));
+ assert("jmacd-511/b", atomic_read(&node->d_count) == 0);
+
+ spin_unlock_jnode(node);
+		/* no page and no references---dispatch it. */
+ jnode_remove(node, jtype);
+ __write_unlock_tree(sbinfo);
+ jnode_free(node, jtype);
+ } else {
+ /* busy check failed: reference was acquired by concurrent
+ * thread. */
+ __write_unlock_tree(sbinfo);
+ spin_unlock_jnode(node);
+ JF_CLR(node, JNODE_RIP);
+ }
+ return result;
+}
+
+/* jdelete() -- Delete jnode from the tree and file system */
+static int jdelete(jnode *node /* jnode to finish with */)
+{
+ struct page *page;
+ int result;
+ jnode_type jtype;
+ reiser4_super_info_data *info;
+
+ assert("nikita-467", node != NULL);
+ assert("nikita-2531", JF_ISSET(node, JNODE_RIP));
+
+ jtype = jnode_get_type(node);
+
+ page = jnode_lock_page(node);
+ assert_spin_locked(&(node->guard));
+
+ info = get_super_private(jnode_get_super(node));
+
+ __write_lock_tree(info);
+ /* re-check ->x_count under tree lock. */
+ result = jnode_is_busy(node, jtype);
+ if (likely(!result)) {
+ assert("nikita-2123", JF_ISSET(node, JNODE_HEARD_BANSHEE));
+ assert("jmacd-511", atomic_read(&node->d_count) == 0);
+
+ /* detach page */
+ if (page != NULL) {
+ /*
+ * FIXME this is racy against jnode_extent_write().
+ */
+ page_clear_jnode(page, node);
+ }
+ spin_unlock_jnode(node);
+ /* goodbye */
+ jnode_delete(node, jtype);
+ __write_unlock_tree(info);
+ jnode_free(node, jtype);
+ /* @node is no longer valid pointer */
+ if (page != NULL)
+ reiser4_drop_page(page);
+ } else {
+ /* busy check failed: reference was acquired by concurrent
+ * thread. */
+ JF_CLR(node, JNODE_RIP);
+ __write_unlock_tree(info);
+ spin_unlock_jnode(node);
+ if (page != NULL)
+ unlock_page(page);
+ }
+ return result;
+}
+
+/**
+ * This function frees jnode "if possible".
+ * In particular, [dcx]_count has to be 0 (where applicable).
+ *
+ * @node: the jnode to drop.
+ *
+ * Return value:
+ * -EBUSY: failed to drop jnode, because there are still references to it
+ * 0: successfully dropped jnode
+ */
+int jdrop(jnode *node)
+{
+ struct page *page;
+ jnode_type jtype;
+ int result;
+ reiser4_super_info_data *sbinfo;
+
+ sbinfo = get_super_private(jnode_get_super(node));
+
+ assert_rw_not_read_locked(&(sbinfo->tree_lock));
+ assert_rw_not_write_locked(&(sbinfo->tree_lock));
+ assert("nikita-2403", !JF_ISSET(node, JNODE_HEARD_BANSHEE));
+
+ jtype = jnode_get_type(node);
+
+ page = jnode_lock_page(node);
+ assert_spin_locked(&(node->guard));
+
+ __write_lock_tree(sbinfo);
+
+ /* re-check ->x_count under tree lock. */
+ result = jnode_is_busy(node, jtype);
+ if (!result) {
+ assert("nikita-2488", page == jnode_page(node));
+ assert("nikita-2533", atomic_read(&node->d_count) == 0);
+ if (page != NULL) {
+ assert("nikita-2126", !PageDirty(page));
+ assert("nikita-2127", PageUptodate(page));
+ assert("nikita-2181", PageLocked(page));
+ page_clear_jnode(page, node);
+ }
+ spin_unlock_jnode(node);
+ jnode_remove(node, jtype);
+ __write_unlock_tree(sbinfo);
+ jnode_free(node, jtype);
+ if (page != NULL)
+ reiser4_drop_page(page);
+ } else {
+ /* busy check failed: reference was acquired by concurrent
+ * thread. */
+ JF_CLR(node, JNODE_RIP);
+ __write_unlock_tree(sbinfo);
+ spin_unlock_jnode(node);
+ if (page != NULL)
+ unlock_page(page);
+ }
+ return result;
+}
+
+/* IO head jnode implementation. The io heads are simple jnodes with limited
+   functionality (these jnodes are not in any hash table), used just for
+   reading from and writing to disk. */
+
+jnode *reiser4_alloc_io_head(const reiser4_block_nr *block,
+ reiser4_subvol *subv)
+{
+ jnode *jal = jalloc();
+
+ if (jal != NULL) {
+ jnode_init(jal, subv, JNODE_IO_HEAD);
+ jnode_set_block(jal, block);
+ }
+
+ jref(jal);
+
+ return jal;
+}
+
+void reiser4_drop_io_head(jnode * node)
+{
+ assert("zam-648", jnode_get_type(node) == JNODE_IO_HEAD);
+
+ jput(node);
+ jdrop(node);
+}
+
+jnode *reiser4_alloc_volinfo_head(const reiser4_block_nr *block,
+ reiser4_subvol *subv)
+{
+ jnode *jal = jalloc();
+
+ if (jal != NULL) {
+ jnode_init(jal, subv, JNODE_VOLINFO_HEAD);
+ jnode_set_block(jal, block);
+ }
+ jref(jal);
+
+ return jal;
+}
+
+void reiser4_drop_volinfo_head(jnode *node)
+{
+ assert("edward-1834",
+ jnode_get_type(node) == JNODE_VOLINFO_HEAD);
+
+ jput(node);
+ jdrop(node);
+}
+
+/* protect jnode data from being released by reiser4_releasepage() */
+void pin_jnode_data(jnode * node)
+{
+ assert("zam-671", jnode_page(node) != NULL);
+ get_page(jnode_page(node));
+}
+
+/* make jnode data free-able again */
+void unpin_jnode_data(jnode * node)
+{
+ assert("zam-672", jnode_page(node) != NULL);
+ put_page(jnode_page(node));
+}
+
+struct address_space *jnode_get_mapping(const jnode * node)
+{
+ return jnode_ops(node)->mapping(node);
+}
+
+#if REISER4_DEBUG
+/* debugging aid: jnode invariant */
+int jnode_invariant_f(const jnode * node, char const **msg)
+{
+#define _ergo(ant, con) \
+ ((*msg) = "{" #ant "} ergo {" #con "}", ergo((ant), (con)))
+#define _check(exp) ((*msg) = #exp, (exp))
+
+ return _check(node != NULL) &&
+ /* [jnode-queued] */
+ /* only relocated node can be queued, except that when znode
+ * is being deleted, its JNODE_RELOC bit is cleared */
+ _ergo(JF_ISSET(node, JNODE_FLUSH_QUEUED),
+ JF_ISSET(node, JNODE_RELOC) ||
+ JF_ISSET(node, JNODE_HEARD_BANSHEE)) &&
+ _check(node->jnodes.prev != NULL) &&
+ _check(node->jnodes.next != NULL) &&
+ /* [jnode-dirty] invariant */
+	       /* a dirty jnode is part of an atom */
+ _ergo(JF_ISSET(node, JNODE_DIRTY), node->atom != NULL) &&
+ /* [jnode-oid] invariant */
+ /* for unformatted node ->objectid and ->mapping fields are
+ * consistent */
+ _ergo(jnode_is_unformatted(node) && node->key.j.mapping != NULL,
+ node->key.j.objectid ==
+ get_inode_oid(node->key.j.mapping->host)) &&
+ /* [jnode-atom-valid] invariant */
+ /* node atom has valid state */
+ _ergo(node->atom != NULL, node->atom->stage != ASTAGE_INVALID) &&
+ /* [jnode-page-binding] invariant */
+ /* if node points to page, it points back to node */
+ _ergo(node->pg != NULL, jprivate(node->pg) == node) &&
+ /* [jnode-refs] invariant */
+ /* only referenced jnode can be loaded */
+	    _check(atomic_read(&node->x_count) >= atomic_read(&node->d_count));
+}
+
+static const char *jnode_type_name(jnode_type type)
+{
+ switch (type) {
+ case JNODE_UNFORMATTED_BLOCK:
+ return "unformatted";
+ case JNODE_FORMATTED_BLOCK:
+ return "formatted";
+ case JNODE_BITMAP:
+ return "bitmap";
+ case JNODE_IO_HEAD:
+ return "io head";
+ case JNODE_VOLINFO_HEAD:
+ return "volinfo";
+ case LAST_JNODE_TYPE:
+ return "last";
+ default:{
+ static char unknown[30];
+
+ sprintf(unknown, "unknown %i", type);
+ return unknown;
+ }
+ }
+}
+
+#define jnode_state_name(node, flag) \
+ (JF_ISSET((node), (flag)) ? ((#flag "|")+6) : "")
+
+/* debugging aid: output human readable information about @node */
+static void info_jnode(const char *prefix /* prefix to print */ ,
+ const jnode * node/* node to print */)
+{
+ assert("umka-068", prefix != NULL);
+
+ if (node == NULL) {
+ printk("%s: null\n", prefix);
+ return;
+ }
+
+ printk
+ ("%s: %p: state: %lx: [%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s], level: %i,"
+ " block: %s, d_count: %d, x_count: %d, "
+ "pg: %p, atom: %p, lock: %i:%i, type: %s, ", prefix, node,
+ node->state,
+ jnode_state_name(node, JNODE_PARSED),
+ jnode_state_name(node, JNODE_HEARD_BANSHEE),
+ jnode_state_name(node, JNODE_LEFT_CONNECTED),
+ jnode_state_name(node, JNODE_RIGHT_CONNECTED),
+ jnode_state_name(node, JNODE_ORPHAN),
+ jnode_state_name(node, JNODE_CREATED),
+ jnode_state_name(node, JNODE_RELOC),
+ jnode_state_name(node, JNODE_OVRWR),
+ jnode_state_name(node, JNODE_DIRTY),
+ jnode_state_name(node, JNODE_IS_DYING),
+ jnode_state_name(node, JNODE_RIP),
+ jnode_state_name(node, JNODE_MISSED_IN_CAPTURE),
+ jnode_state_name(node, JNODE_WRITEBACK),
+ jnode_state_name(node, JNODE_DKSET),
+ jnode_state_name(node, JNODE_REPACK),
+ jnode_state_name(node, JNODE_CLUSTER_PAGE),
+ jnode_get_level(node), sprint_address(jnode_get_block(node)),
+ atomic_read(&node->d_count), atomic_read(&node->x_count),
+ jnode_page(node), node->atom, 0, 0,
+ jnode_type_name(jnode_get_type(node)));
+ if (jnode_is_unformatted(node)) {
+ printk("inode: %llu, index: %lu, ",
+ node->key.j.objectid, node->key.j.index);
+ }
+}
+
+/* debugging aid: check jnode invariant and panic if it doesn't hold */
+static int jnode_invariant(jnode *node, int tlocked, int jlocked)
+{
+ char const *failed_msg;
+ int result;
+
+ assert("umka-063312", node != NULL);
+
+ if (!jlocked && !tlocked)
+ spin_lock_jnode((jnode *) node);
+ if (!tlocked)
+ read_lock_tree();
+ result = jnode_invariant_f(node, &failed_msg);
+ if (!result) {
+ info_jnode("corrupted node", node);
+ warning("jmacd-555", "Condition %s failed", failed_msg);
+ }
+ if (!tlocked)
+ read_unlock_tree();
+ if (!jlocked && !tlocked)
+ spin_unlock_jnode((jnode *) node);
+ return result;
+}
+
+#endif /* REISER4_DEBUG */
+
+/* Make Linus happy.
+ Local variables:
+ c-indentation-style: "K&R"
+ mode-name: "LC"
+ c-basic-offset: 8
+ tab-width: 8
+ fill-column: 80
+ End:
+*/
diff -urN --no-dereference linux-5.10.2.orig/fs/reiser4/jnode.h linux-5.10.2/fs/reiser4/jnode.h
--- linux-5.10.2.orig/fs/reiser4/jnode.h 1970-01-01 01:00:00.000000000 +0100
+++ linux-5.10.2/fs/reiser4/jnode.h 2020-12-23 16:07:46.118813129 +0100
@@ -0,0 +1,726 @@
+/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by
+ * reiser4/README */
+
+/* Declaration of jnode. See jnode.c for details. */
+
+#ifndef __JNODE_H__
+#define __JNODE_H__
+
+#include "forward.h"
+#include "type_safe_hash.h"
+#include "txnmgr.h"
+#include "key.h"
+#include "debug.h"
+#include "dformat.h"
+#include "page_cache.h"
+#include "context.h"
+
+#include "plugin/plugin.h"
+
+#include <linux/fs.h>
+#include <linux/mm.h>
+#include <linux/spinlock.h>
+#include <asm/atomic.h>
+#include <linux/bitops.h>
+#include <linux/list.h>
+#include <linux/rcupdate.h>
+
+/* declare hash table of jnodes (jnodes proper, that is, unformatted
+ nodes) */
+TYPE_SAFE_HASH_DECLARE(j, jnode);
+
+/* declare hash table of znodes */
+TYPE_SAFE_HASH_DECLARE(z, znode);
+
+struct jnode_key {
+ __u64 objectid;
+ unsigned long index;
+ struct address_space *mapping;
+};
+
+/*
+ Jnode is the "base class" of other nodes in reiser4. It is also happens to
+ be exactly the node we use for unformatted tree nodes.
+
+ Jnode provides following basic functionality:
+
+ . reference counting and indexing.
+
+ . integration with page cache. Jnode has ->pg reference to which page can
+ be attached.
+
+ . interface to transaction manager. It is jnode that is kept in transaction
+ manager lists, attached to atoms, etc. (NOTE-NIKITA one may argue that this
+ means, there should be special type of jnode for inode.)
+
+ Locking:
+
+ Spin lock: the following fields are protected by the per-jnode spin lock:
+
+ ->state
+ ->atom
+ ->capture_link
+
+ The following fields are protected by the global tree lock:
+
+ ->link
+ ->key.z (content of ->key.z is only changed in znode_rehash())
+ ->key.j
+
+ Atomic counters
+
+ ->x_count
+ ->d_count
+
+ ->pg, and ->data are protected by spin lock for unused jnode and are
+ immutable for used jnode (one for which fs/reiser4/vfs_ops.c:releasable()
+ is false).
+
+ ->tree is immutable after creation
+
+ Unclear
+
+ ->blocknr: should be under jnode spin-lock, but current interface is based
+ on passing of block address.
+
+ If you ever need to spin lock two nodes at once, do this in "natural"
+ memory order: lock the znode with the lower address first. (See
+ lock_two_nodes().)
+
+ Invariants involving this data-type:
+
+ [jnode-dirty]
+ [jnode-refs]
+ [jnode-oid]
+ [jnode-queued]
+ [jnode-atom-valid]
+ [jnode-page-binding]
+*/
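+
+/*
+ * Illustration only (a sketch of the ordering rule above, not reiser4 code):
+ * spin locking two jnodes @a and @b in "natural" memory order would look
+ * roughly like this:
+ *
+ *	if (a < b) {
+ *		spin_lock_jnode(a);
+ *		spin_lock_jnode(b);
+ *	} else {
+ *		spin_lock_jnode(b);
+ *		spin_lock_jnode(a);
+ *	}
+ */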
+
+struct jnode {
+#if REISER4_DEBUG
+#define JMAGIC 0x52654973 /* "ReIs" */
+ int magic;
+#endif
+ /* FIRST CACHE LINE (16 bytes): data used by jload */
+
+ /* jnode's state: bitwise flags from the reiser4_jnode_state enum. */
+ /* 0 */ unsigned long state;
+
+ /* lock, protecting jnode's fields. */
+ /* 4 */ spinlock_t load;
+
+ /* counter of references to jnode itself. Increased on jref().
+ Decreased on jput().
+ */
+ /* 8 */ atomic_t x_count;
+
+ /* counter of references to jnode's data. Pin data page(s) in
+ memory while this is greater than 0. Increased on jload().
+ Decreased on jrelse().
+ */
+ /* 12 */ atomic_t d_count;
+
+ /* SECOND CACHE LINE: data used by hash table lookups */
+
+ /* 16 */ union {
+ /* znodes are hashed by block number */
+ reiser4_block_nr z;
+ /* unformatted nodes are hashed by mapping plus offset */
+ struct jnode_key j;
+ } key;
+
+ /* THIRD CACHE LINE */
+
+ /* 32 */ union {
+ /* pointers to maintain hash-table */
+ z_hash_link z;
+ j_hash_link j;
+ } link;
+
+ /* pointer to jnode page. */
+ /* 36 */ struct page *pg;
+ /*
+ * Pointer to node's content.
+ * This is page_address(node->pg) when page is attached to the jnode
+ */
+ /* 40 */ void *data;
+
+ /* Subvolume, where IO is going to/from.
+ The pair (subvol, blocknr) defines "IO address"
+ */
+ /* 44 */ struct reiser4_subvol *subvol;
+
+ /* FOURTH CACHE LINE: atom related fields */
+
+ /* 48 */ spinlock_t guard;
+
+ /* atom the block is in, if any */
+ /* 52 */ txn_atom *atom;
+
+ /* capture list */
+ /* 56 */ struct list_head capture_link;
+
+ /* FIFTH CACHE LINE */
+
+ /* 64 */ struct rcu_head rcu;
+ /* crosses cache line */
+
+ /* SIXTH CACHE LINE */
+
+ /* the real blocknr (where io is going to/from) */
+ /* 80 */ reiser4_block_nr blocknr;
+	/* Parent item type; unformatted and CRC jnodes need it for
+ * offset => key conversion. */
+ /* NOTE: this parent_item_id looks like jnode type. */
+ /* 88 */ reiser4_plugin_id parent_item_id;
+ /* wait on JNODE_LOADING_IN_PROGRESS flag */
+ /* 92 */ wait_queue_head_t wait_jload;
+ /* 116 */ struct super_block *super;
+#if REISER4_DEBUG
+ /* list of all jnodes for debugging purposes. */
+ struct list_head jnodes;
+ /* how many times this jnode was written in one transaction */
+ int written;
+ /* this indicates which atom's list the jnode is on */
+ atom_list list;
+#endif
+} __attribute__ ((aligned(16)));
+
+/*
+ * jnode types. Enumeration of existing jnode types.
+ */
+typedef enum {
+ JNODE_UNFORMATTED_BLOCK, /* unformatted block */
+ JNODE_FORMATTED_BLOCK, /* formatted block, znode */
+ JNODE_BITMAP, /* bitmap */
+ JNODE_IO_HEAD, /* jnode representing a block in the
+ * wandering log */
+ JNODE_VOLINFO_HEAD, /* jnode representing a block of logical
+ volume system information */
+ LAST_JNODE_TYPE
+} jnode_type;
+
+/* jnode states */
+typedef enum {
+ /* jnode's page is loaded and data checked */
+ JNODE_PARSED = 0,
+	/* node was deleted, but not all locks on it have been released
+	   yet. This node is empty and is going to be removed from the
+	   tree shortly. */
+ JNODE_HEARD_BANSHEE = 1,
+ /* left sibling pointer is valid */
+ JNODE_LEFT_CONNECTED = 2,
+ /* right sibling pointer is valid */
+ JNODE_RIGHT_CONNECTED = 3,
+
+ /* znode was just created and doesn't yet have a pointer from
+ its parent */
+ JNODE_ORPHAN = 4,
+
+ /* this node was created by its transaction and has not been assigned
+ a block address. */
+ JNODE_CREATED = 5,
+
+ /* this node is currently relocated */
+ JNODE_RELOC = 6,
+ /* this node is currently wandered */
+ JNODE_OVRWR = 7,
+
+ /* this znode has been modified */
+ JNODE_DIRTY = 8,
+
+ /* znode lock is being invalidated */
+ JNODE_IS_DYING = 9,
+
+ /* THIS PLACE IS INTENTIONALLY LEFT BLANK */
+
+ /* jnode is queued for flushing. */
+ JNODE_FLUSH_QUEUED = 12,
+
+ /* In the following bits jnode type is encoded. */
+ JNODE_TYPE_1 = 13,
+ JNODE_TYPE_2 = 14,
+ JNODE_TYPE_3 = 15,
+
+ /* jnode is being destroyed */
+ JNODE_RIP = 16,
+
+	/* znode was not captured during locking (this might be so because
+	   ->level != LEAF_LEVEL and lock_mode == READ_LOCK) */
+ JNODE_MISSED_IN_CAPTURE = 17,
+
+ /* write is in progress */
+ JNODE_WRITEBACK = 18,
+
+	/* indicates that someone has already started to load the jnode;
+	   other processes should wait on this flag */
+ JNODE_LOADING_IN_PROGRESS = 19,
+
+ /* delimiting keys are already set for this znode. */
+ JNODE_DKSET = 20,
+
+	/* when this bit is set, the page and the jnode cannot be disconnected */
+ JNODE_WRITE_PREPARED = 21,
+
+ JNODE_CLUSTER_PAGE = 22,
+	/* Jnode is marked for repacking, which means that reiser4 flush and
+	 * the block allocator should process this node in a special way */
+ JNODE_REPACK = 23,
+ /* node should be converted by flush in squalloc phase */
+ JNODE_CONVERTIBLE = 24,
+ /* jnode parsing failed */
+ JNODE_PARSING_FAILED = 25,
+ /*
+ * When jnode is dirtied for the first time in given transaction,
+	 * do_jnode_make_dirty() checks whether this jnode can possibly become
+	 * a member of the overwrite set. If so, this bit is set and one block is
+ * reserved in the ->flush_reserved space of atom.
+ *
+ * This block is "used" (and JNODE_FLUSH_RESERVED bit is cleared) when
+ *
+ * (1) flush decides that we want this block to go into relocate
+ * set after all.
+ *
+ * (2) wandering log is allocated (by log writer)
+ *
+ * (3) extent is allocated
+ *
+ */
+ JNODE_FLUSH_RESERVED = 29
+} reiser4_jnode_state;
+
+/* Macros for accessing the jnode state. */
+
+static inline void JF_CLR(jnode * j, int f)
+{
+ assert("unknown-1", j->magic == JMAGIC);
+ clear_bit(f, &j->state);
+}
+static inline int JF_ISSET(const jnode * j, int f)
+{
+ assert("unknown-2", j->magic == JMAGIC);
+ return test_bit(f, &((jnode *) j)->state);
+}
+static inline void JF_SET(jnode * j, int f)
+{
+ assert("unknown-3", j->magic == JMAGIC);
+ set_bit(f, &j->state);
+}
+
+static inline int JF_TEST_AND_SET(jnode * j, int f)
+{
+ assert("unknown-4", j->magic == JMAGIC);
+ return test_and_set_bit(f, &j->state);
+}
+
+static inline void spin_lock_jnode(jnode *node)
+{
+ /* check that spinlocks of lower priorities are not held */
+ assert("", (LOCK_CNT_NIL(rw_locked_tree) &&
+ LOCK_CNT_NIL(spin_locked_txnh) &&
+ LOCK_CNT_NIL(spin_locked_zlock) &&
+ LOCK_CNT_NIL(rw_locked_dk) &&
+ LOCK_CNT_LT(spin_locked_jnode, 2)));
+
+ spin_lock(&(node->guard));
+
+ LOCK_CNT_INC(spin_locked_jnode);
+ LOCK_CNT_INC(spin_locked);
+}
+
+static inline void spin_unlock_jnode(jnode *node)
+{
+ assert_spin_locked(&(node->guard));
+ assert("nikita-1375", LOCK_CNT_GTZ(spin_locked_jnode));
+ assert("nikita-1376", LOCK_CNT_GTZ(spin_locked));
+
+ LOCK_CNT_DEC(spin_locked_jnode);
+ LOCK_CNT_DEC(spin_locked);
+
+ spin_unlock(&(node->guard));
+}
+
+static inline int jnode_is_in_deleteset(const jnode * node)
+{
+ return JF_ISSET(node, JNODE_RELOC);
+}
+
+extern int init_jnodes(void);
+extern void done_jnodes(void);
+
+/* Jnode routines */
+extern jnode *jalloc(void);
+extern void jfree(jnode * node) NONNULL;
+extern jnode *jclone(jnode *);
+extern jnode *jlookup(oid_t objectid, unsigned long ind) NONNULL;
+extern jnode *jfind(struct address_space *, unsigned long index) NONNULL;
+extern jnode *jnode_by_page(struct page *pg) NONNULL;
+extern jnode *jnode_of_page(struct page *pg) NONNULL;
+void jnode_attach_page(jnode * node, struct page *pg);
+
+void unhash_unformatted_jnode(jnode *);
+extern jnode *page_next_jnode(jnode * node) NONNULL;
+extern void jnode_init(jnode *node,
+ struct reiser4_subvol *sub, jnode_type);
+extern void jnode_init_tail(jnode *node) NONNULL;
+extern void jnode_make_dirty(jnode * node) NONNULL;
+extern void jnode_make_clean(jnode * node) NONNULL;
+extern void jnode_make_wander_nolock(jnode * node) NONNULL;
+extern void jnode_make_wander(jnode *) NONNULL;
+extern void znode_make_reloc(znode * , flush_queue_t *) NONNULL;
+extern void unformatted_make_reloc(jnode *, flush_queue_t *) NONNULL;
+extern struct address_space *jnode_get_mapping(const jnode * node) NONNULL;
+
+static inline reiser4_subvol *jnode_get_subvol(const jnode *node)
+{
+ assert("edward-1871", node != NULL);
+ assert("edward-1872", node->subvol != NULL);
+
+ return node->subvol;
+}
+
+static inline void jnode_set_subvol(jnode *node, reiser4_subvol *subv)
+{
+ assert("edward-2222", node != NULL);
+ assert("edward-2224", subv != NULL);
+ assert("edward-2223", ergo(node->subvol != NULL, node->subvol == subv));
+
+ node->subvol = subv;
+}
+
+#define jnode_get_super(node) ((node)->super)
+
+static inline const reiser4_block_nr *jnode_get_block(const jnode *node)
+{
+ assert("nikita-528", node != NULL);
+
+ return &node->blocknr;
+}
+
+static inline void jnode_set_block(jnode *node, const reiser4_block_nr *blocknr)
+{
+ assert("nikita-2020", node != NULL);
+ assert("umka-055", blocknr != NULL);
+ node->blocknr = *blocknr;
+}
+
+/**
+ * block number for IO. Usually this is the same as jnode_get_block(),
+ * unless jnode was emergency flushed - then block number chosen by
+ * eflush is used
+ */
+static inline const reiser4_block_nr *jnode_get_io_block(jnode * node)
+{
+ assert("nikita-2768", node != NULL);
+ assert_spin_locked(&(node->guard));
+
+ return jnode_get_block(node);
+}
+
+/* Jnode flush interface. */
+extern flush_queue_t *reiser4_pos_fq(flush_pos_t *pos);
+
+/* FIXME-VS: these are used in plugin/item/extent.c */
+
+/* does extent_get_block have to be called */
+#define jnode_mapped(node) JF_ISSET (node, JNODE_MAPPED)
+#define jnode_set_mapped(node) JF_SET (node, JNODE_MAPPED)
+
+/* the node should be converted during flush squalloc phase */
+#define jnode_convertible(node) JF_ISSET (node, JNODE_CONVERTIBLE)
+#define jnode_set_convertible(node) JF_SET (node, JNODE_CONVERTIBLE)
+
+/* Macros to convert from jnode to znode, znode to jnode. These are macros
+ because C doesn't allow overloading of const prototypes. */
+#define ZJNODE(x) (&(x)->zjnode)
+#define JZNODE(x) \
+({ \
+ typeof(x) __tmp_x; \
+ \
+ __tmp_x = (x); \
+ assert("jmacd-1300", jnode_is_znode(__tmp_x)); \
+ (znode*) __tmp_x; \
+})
+
+extern int reiser4_jnodes_init(void);
+extern int reiser4_jnodes_done(void);
+
+#if REISER4_DEBUG
+
+extern int znode_is_any_locked(const znode * node);
+extern void jnode_list_remove(jnode * node);
+
+#else
+
+#define jnode_list_remove(node) noop
+
+#endif
+
+int znode_is_root(const znode * node) NONNULL;
+
+/* bump reference counter on @node */
+static inline void add_x_ref(jnode * node/* node to increase x_count of */)
+{
+ assert("nikita-1911", node != NULL);
+
+ atomic_inc(&node->x_count);
+ LOCK_CNT_INC(x_refs);
+}
+
+static inline void dec_x_ref(jnode * node)
+{
+ assert("nikita-3215", node != NULL);
+ assert("nikita-3216", atomic_read(&node->x_count) > 0);
+
+ atomic_dec(&node->x_count);
+ assert("nikita-3217", LOCK_CNT_GTZ(x_refs));
+ LOCK_CNT_DEC(x_refs);
+}
+
+/* jref() - increase counter of references to jnode/znode (x_count) */
+static inline jnode *jref(jnode * node)
+{
+ assert("jmacd-508", (node != NULL) && !IS_ERR(node));
+ add_x_ref(node);
+ return node;
+}
+
+/* get the page of jnode */
+static inline struct page *jnode_page(const jnode * node)
+{
+ return node->pg;
+}
+
+/* return pointer to jnode data */
+static inline char *jdata(const jnode * node)
+{
+ assert("nikita-1415", node != NULL);
+ assert("nikita-3198", jnode_page(node) != NULL);
+ return node->data;
+}
+
+static inline int jnode_is_loaded(const jnode * node)
+{
+ assert("zam-506", node != NULL);
+ return atomic_read(&node->d_count) > 0;
+}
+
+extern void page_clear_jnode(struct page *page, jnode * node) NONNULL;
+
+static inline void jnode_set_reloc(jnode * node)
+{
+ assert("nikita-2431", node != NULL);
+ assert("nikita-2432", !JF_ISSET(node, JNODE_OVRWR));
+ JF_SET(node, JNODE_RELOC);
+}
+
+/* jload/jwrite/junload give a bread/bwrite/brelse functionality for jnodes */
+
+extern int jload_gfp(jnode *, gfp_t, int do_kmap) NONNULL;
+
+static inline int jload(jnode *node)
+{
+ return jload_gfp(node, reiser4_ctx_gfp_mask_get(), 1);
+}
+
+extern int jinit_new(jnode *, gfp_t) NONNULL;
+extern int jstartio(jnode *) NONNULL;
+
+extern int jdrop(jnode *) NONNULL;
+extern int jwait_io(jnode *, int rw) NONNULL;
+
+void jload_prefetch(jnode *);
+
+extern jnode *reiser4_alloc_io_head(const reiser4_block_nr *block,
+ reiser4_subvol *subv) NONNULL;
+extern jnode *reiser4_alloc_volinfo_head(const reiser4_block_nr *block,
+ reiser4_subvol *subv) NONNULL;
+extern void reiser4_drop_io_head(jnode * node) NONNULL;
+extern void reiser4_drop_volinfo_head(jnode * node) NONNULL;
+extern void pin_jnode_data(jnode *);
+extern void unpin_jnode_data(jnode *);
+
+static inline jnode_type jnode_get_type(const jnode * node)
+{
+ static const unsigned long state_mask =
+ (1 << JNODE_TYPE_1) | (1 << JNODE_TYPE_2) | (1 << JNODE_TYPE_3);
+
+ static jnode_type mask_to_type[] = {
+ /* JNODE_TYPE_3 : JNODE_TYPE_2 : JNODE_TYPE_1 */
+
+ /* 000 */
+ [0] = JNODE_FORMATTED_BLOCK,
+ /* 001 */
+ [1] = JNODE_UNFORMATTED_BLOCK,
+ /* 010 */
+ [2] = JNODE_BITMAP,
+ /* 011 */
+ [3] = LAST_JNODE_TYPE, /*invalid */
+ /* 100 */
+ [4] = JNODE_VOLINFO_HEAD,
+ /* 101 */
+ [5] = LAST_JNODE_TYPE,
+ /* 110 */
+ [6] = JNODE_IO_HEAD,
+ /* 111 */
+ [7] = LAST_JNODE_TYPE, /* invalid */
+ };
+
+ return mask_to_type[(node->state & state_mask) >> JNODE_TYPE_1];
+}
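+
+/*
+ * Illustration only, derived from the table above: an unformatted jnode has
+ * only the JNODE_TYPE_1 bit set (001), a bitmap jnode only JNODE_TYPE_2
+ * (010), a volinfo head only JNODE_TYPE_3 (100), an io head JNODE_TYPE_3
+ * plus JNODE_TYPE_2 (110), and a formatted jnode (znode) has no type bits
+ * set at all.
+ */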
+
+/* returns true if node is a znode */
+static inline int jnode_is_znode(const jnode * node)
+{
+ return jnode_get_type(node) == JNODE_FORMATTED_BLOCK;
+}
+
+static inline int jnode_is_flushprepped(jnode * node)
+{
+ assert("jmacd-78212", node != NULL);
+ assert_spin_locked(&(node->guard));
+ return !JF_ISSET(node, JNODE_DIRTY) || JF_ISSET(node, JNODE_RELOC) ||
+ JF_ISSET(node, JNODE_OVRWR);
+}
+
+/* Return true if @node has already been processed by the squeeze and allocate
+ process. This implies the block address has been finalized for the
+ duration of this atom (or it is clean and will remain in place). If this
+ returns true you may use the block number as a hint. */
+static inline int jnode_check_flushprepped(jnode * node)
+{
+ int result;
+
+ /* It must be clean or relocated or wandered. New allocations are set
+ * to relocate. */
+ spin_lock_jnode(node);
+ result = jnode_is_flushprepped(node);
+ spin_unlock_jnode(node);
+ return result;
+}
+
+/* returns true if node is unformatted */
+static inline int jnode_is_unformatted(const jnode * node)
+{
+ assert("jmacd-0123", node != NULL);
+ return jnode_get_type(node) == JNODE_UNFORMATTED_BLOCK;
+}
+
+/* returns true if node represents a cluster cache page */
+static inline int jnode_is_cluster_page(const jnode * node)
+{
+ assert("edward-50", node != NULL);
+ return (JF_ISSET(node, JNODE_CLUSTER_PAGE));
+}
+
+/* returns true if node is a volinfo head jnode */
+static inline int jnode_is_volinfo_head(const jnode * node)
+{
+ assert("vs-1240", node != NULL);
+ return jnode_get_type(node) == JNODE_VOLINFO_HEAD;
+}
+
+static inline jnode_plugin *jnode_ops_of(const jnode_type type)
+{
+ assert("nikita-2367", type < LAST_JNODE_TYPE);
+ return jnode_plugin_by_id((reiser4_plugin_id) type);
+}
+
+static inline jnode_plugin *jnode_ops(const jnode * node)
+{
+ assert("nikita-2366", node != NULL);
+
+ return jnode_ops_of(jnode_get_type(node));
+}
+
+/* Get the index of a block. */
+static inline unsigned long jnode_get_index(jnode * node)
+{
+ return jnode_ops(node)->index(node);
+}
+
+/* return true if "node" is the root */
+static inline int jnode_is_root(const jnode * node)
+{
+ return jnode_is_znode(node) && znode_is_root(JZNODE(node));
+}
+
+extern struct address_space *mapping_jnode(const jnode * node);
+extern unsigned long index_jnode(const jnode * node);
+
+static inline void jput(jnode * node);
+extern void jput_final(jnode *node);
+
+/* bump data counter on @node */
+static inline void add_d_ref(jnode * node/* node to increase d_count of */)
+{
+ assert("nikita-1962", node != NULL);
+
+ atomic_inc(&node->d_count);
+ if (jnode_is_unformatted(node) || jnode_is_znode(node))
+ LOCK_CNT_INC(d_refs);
+}
+
+/* jput() - decrement x_count reference counter on jnode/znode.
+
+ Count may drop to 0, jnode stays in cache until memory pressure causes the
+ eviction of its page. The c_count variable also ensures that children are
+ pressured out of memory before the parent. The jnode remains hashed as
+ long as the VM allows its page to stay in memory.
+*/
+static inline void jput(jnode * node)
+{
+ assert("jmacd-509", node != NULL);
+ assert("jmacd-510", atomic_read(&node->x_count) > 0);
+ assert("zam-926", reiser4_schedulable());
+ LOCK_CNT_DEC(x_refs);
+
+ rcu_read_lock();
+ /*
+ * we don't need any kind of lock here--jput_final() uses RCU.
+ */
+ if (unlikely(atomic_dec_and_test(&node->x_count)))
+ jput_final(node);
+ else
+ rcu_read_unlock();
+ assert("nikita-3473", reiser4_schedulable());
+}
+
+extern void jrelse(jnode * node);
+extern void jrelse_tail(jnode * node);
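+
+/*
+ * Illustration only (a minimal sketch, not reiser4 code): the typical
+ * bread/brelse-like life cycle of the interfaces declared above, assuming
+ * the caller already holds an x_count reference on @node (taken via jref()
+ * or returned by a lookup helper):
+ *
+ *	int ret = jload(node);		// pin and load data, bumps d_count
+ *	if (ret == 0) {
+ *		use(jdata(node));	// "use" stands for any read of the data
+ *		jrelse(node);		// unpin data, drops d_count
+ *	}
+ *	jput(node);			// drop the caller's x_count reference
+ */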
+
+extern jnode *jnode_rip_sync(jnode *node);
+
+/* resolve race with jput */
+static inline jnode *jnode_rip_check(jnode *node)
+{
+ if (unlikely(JF_ISSET(node, JNODE_RIP)))
+ node = jnode_rip_sync(node);
+ return node;
+}
+
+static inline jnode *jnode_by_link(struct list_head *link)
+{
+ return list_entry(link, jnode, capture_link);
+}
+
+extern reiser4_key *jnode_build_key(const jnode *node, reiser4_key * key);
+
+#if REISER4_DEBUG
+extern int jnode_invariant_f(const jnode *node, char const **msg);
+#endif
+
+extern jnode_plugin jnode_plugins[LAST_JNODE_TYPE];
+
+/* __JNODE_H__ */
+#endif
+
+/* Make Linus happy.
+ Local variables:
+ c-indentation-style: "K&R"
+ mode-name: "LC"
+ c-basic-offset: 8
+ tab-width: 8
+ fill-column: 120
+ End:
+*/
diff -urN --no-dereference linux-5.10.2.orig/fs/reiser4/kassign.c linux-5.10.2/fs/reiser4/kassign.c
--- linux-5.10.2.orig/fs/reiser4/kassign.c 1970-01-01 01:00:00.000000000 +0100
+++ linux-5.10.2/fs/reiser4/kassign.c 2020-12-23 16:07:46.119813143 +0100
@@ -0,0 +1,670 @@
+/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by
+ * reiser4/README */
+
+/* Key assignment policy implementation */
+
+/*
+ * In reiser4 every piece of file system data and meta-data has a key. Keys
+ * are used to store information in and retrieve it from reiser4 internal
+ * tree. In addition to this, keys define _ordering_ of all file system
+ * information: things having close keys are placed into the same or
+ * neighboring (in the tree order) nodes of the tree. As our block allocator
+ * tries to respect tree order (see flush.c), keys also define order in which
+ * things are laid out on the disk, and hence, affect performance directly.
+ *
+ * Obviously, assignment of keys to data and meta-data should be consistent
+ * across the whole file system. The algorithm that calculates a key for a
+ * given piece of data or meta-data is referred to as "key assignment".
+ *
+ * Key assignment is too expensive to be implemented as a plugin (that is,
+ * with an ability to support different key assignment schemas in the same
+ * compiled kernel image). As a compromise, all key-assignment functions and
+ * data-structures are collected in this single file, so that modifications to
+ * key assignment algorithm can be localized. Additional changes may be
+ * required in key.[ch].
+ *
+ * Current default reiser4 key assignment algorithm is dubbed "Plan A". As one
+ * may guess, there is "Plan B" too.
+ *
+ */
+
+/*
+ * An additional complication in the key assignment implementation is the
+ * requirement to support different key lengths.
+ */
+
+/*
+ * KEY ASSIGNMENT: PLAN A, LONG KEYS.
+ *
+ * DIRECTORY ITEMS
+ *
+ * | 60 | 4 | 7 |1| 56 | 64 | 64 |
+ * +--------------+---+---+-+-------------+------------------+-----------------+
+ * | dirid | 0 | F |H| prefix-1 | prefix-2 | prefix-3/hash |
+ * +--------------+---+---+-+-------------+------------------+-----------------+
+ * | | | | |
+ * | 8 bytes | 8 bytes | 8 bytes | 8 bytes |
+ *
+ * dirid objectid of directory this item is for
+ *
+ * F fibration, see fs/reiser4/plugin/fibration.[ch]
+ *
+ * H 1 if last 8 bytes of the key contain hash,
+ * 0 if last 8 bytes of the key contain prefix-3
+ *
+ * prefix-1 first 7 characters of file name.
+ * Padded by zeroes if name is not long enough.
+ *
+ * prefix-2 next 8 characters of the file name.
+ *
+ * prefix-3 next 8 characters of the file name.
+ *
+ * hash hash of the rest of file name (i.e., portion of file
+ * name not included into prefix-1 and prefix-2).
+ *
+ * File names not longer than 23 (== 7 + 8 + 8) characters are completely
+ * encoded in the key. Such file names are called "short". They are
+ * distinguished by the H bit being 0 in the key.
+ *
+ * Other file names are "long". For a long name, the H bit is 1, and the first
+ * 15 (== 7 + 8) characters are encoded in the prefix-1 and prefix-2 portions
+ * of the key. The last 8 bytes of the key are occupied by a hash of the
+ * remaining characters of the name.
+ *
+ * This key assignment achieves the following important goals:
+ *
+ * (1) directory entries are sorted in approximately lexicographical
+ * order.
+ *
+ * (2) collisions (when multiple directory items have the same key), while
+ * principally unavoidable in a tree with fixed length keys, are rare.
+ *
+ * STAT DATA
+ *
+ * | 60 | 4 | 64 | 4 | 60 | 64 |
+ * +--------------+---+-----------------+---+--------------+-----------------+
+ * | locality id | 1 | ordering | 0 | objectid | 0 |
+ * +--------------+---+-----------------+---+--------------+-----------------+
+ * | | | | |
+ * | 8 bytes | 8 bytes | 8 bytes | 8 bytes |
+ *
+ * locality id object id of a directory where first name was created for
+ * the object
+ *
+ * ordering copy of second 8-byte portion of the key of directory
+ * entry for the first name of this object. Ordering has a form
+ * {
+ * fibration :7;
+ * h :1;
+ * prefix1 :56;
+ * }
+ * see description of key for directory entry above.
+ *
+ * objectid object id for this object
+ *
+ * This key assignment policy is designed to keep stat-data in the same order
+ * as corresponding directory items, thus speeding up readdir/stat types of
+ * workload.
+ *
+ * FILE BODY
+ *
+ * | 60 | 4 | 64 | 4 | 60 | 64 |
+ * +--------------+---+-----------------+---+--------------+-----------------+
+ * | locality id | 4 | ordering | 0 | objectid | offset |
+ * +--------------+---+-----------------+---+--------------+-----------------+
+ * | | | | |
+ * | 8 bytes | 8 bytes | 8 bytes | 8 bytes |
+ *
+ * locality id object id of a directory where first name was created for
+ * the object
+ *
+ * ordering the same as in the key of stat-data for this object
+ *
+ * objectid object id for this object
+ *
+ * offset logical offset from the beginning of this file.
+ * Measured in bytes.
+ *
+ *
+ * KEY ASSIGNMENT: PLAN A, SHORT KEYS.
+ *
+ * DIRECTORY ITEMS
+ *
+ * | 60 | 4 | 7 |1| 56 | 64 |
+ * +--------------+---+---+-+-------------+-----------------+
+ * | dirid | 0 | F |H| prefix-1 | prefix-2/hash |
+ * +--------------+---+---+-+-------------+-----------------+
+ * | | | |
+ * | 8 bytes | 8 bytes | 8 bytes |
+ *
+ * dirid objectid of directory this item is for
+ *
+ * F fibration, see fs/reiser4/plugin/fibration.[ch]
+ *
+ * H 1 if last 8 bytes of the key contain hash,
+ * 0 if last 8 bytes of the key contain prefix-2
+ *
+ * prefix-1 first 7 characters of file name.
+ * Padded by zeroes if name is not long enough.
+ *
+ * prefix-2 next 8 characters of the file name.
+ *
+ * hash hash of the rest of file name (i.e., portion of file
+ * name not included into prefix-1).
+ *
+ * File names not longer than 15 (== 7 + 8) characters are completely encoded
+ * in the key. Such file names are called "short". They are distinguished by
+ * the H bit being 0 in the key.
+ *
+ * Other file names are "long". For a long name, the H bit is 1, and the first
+ * 7 characters are encoded in the prefix-1 portion of the key. The last 8
+ * bytes of the key are occupied by a hash of the remaining characters of the
+ * name.
+ *
+ * STAT DATA
+ *
+ * | 60 | 4 | 4 | 60 | 64 |
+ * +--------------+---+---+--------------+-----------------+
+ * | locality id | 1 | 0 | objectid | 0 |
+ * +--------------+---+---+--------------+-----------------+
+ * | | | |
+ * | 8 bytes | 8 bytes | 8 bytes |
+ *
+ * locality id object id of a directory where first name was created for
+ * the object
+ *
+ * objectid object id for this object
+ *
+ * FILE BODY
+ *
+ * | 60 | 4 | 4 | 60 | 64 |
+ * +--------------+---+---+--------------+-----------------+
+ * | locality id | 4 | 0 | objectid | offset |
+ * +--------------+---+---+--------------+-----------------+
+ * | | | |
+ * | 8 bytes | 8 bytes | 8 bytes |
+ *
+ * locality id object id of a directory where first name was created for
+ * the object
+ *
+ * objectid object id for this object
+ *
+ * offset logical offset from the beginning of this file.
+ * Measured in bytes.
+ *
+ *
+ */
+
+#include "debug.h"
+#include "key.h"
+#include "kassign.h"
+#include "vfs_ops.h"
+#include "inode.h"
+#include "super.h"
+#include "dscale.h"
+
+#include <linux/types.h> /* for __u?? */
+#include <linux/fs.h> /* for struct super_block, etc */
+
+/* bitmask for H bit (see comment at the beginning of this file) */
+static const __u64 longname_mark = 0x0100000000000000ull;
+/* bitmask for F and H portions of the key. */
+static const __u64 fibration_mask = 0xff00000000000000ull;
+
+/* return true if name is not completely encoded in @key */
+int is_longname_key(const reiser4_key * key)
+{
+ __u64 highpart;
+
+ assert("nikita-2863", key != NULL);
+ if (get_key_type(key) != KEY_FILE_NAME_MINOR)
+ reiser4_print_key("oops", key);
+ assert("nikita-2864", get_key_type(key) == KEY_FILE_NAME_MINOR);
+
+ if (REISER4_LARGE_KEY)
+ highpart = get_key_ordering(key);
+ else
+ highpart = get_key_objectid(key);
+
+ return (highpart & longname_mark) ? 1 : 0;
+}
+
+/* return true if @name is too long to be completely encoded in the key */
+int is_longname(const char *name UNUSED_ARG, int len)
+{
+ if (REISER4_LARGE_KEY)
+ return len > 23;
+ else
+ return len > 15;
+}
+
+/* encode ascii string into __u64.
+
+ Put characters of @name into result (@str) one after another starting
+ from @start_idx-th highest (arithmetically) byte. This produces
+ endian-safe encoding. memcpy(2) will not do.
+
+*/
+static __u64 pack_string(const char *name /* string to encode */ ,
+ int start_idx /* highest byte in result from
+ * which to start encoding */ )
+{
+ unsigned i;
+ __u64 str;
+
+ str = 0;
+ for (i = 0; (i < sizeof str - start_idx) && name[i]; ++i) {
+ str <<= 8;
+ str |= (unsigned char)name[i];
+ }
+ str <<= (sizeof str - i - start_idx) << 3;
+ return str;
+}
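+
+/*
+ * Worked example (illustration only): pack_string("foo", 1) keeps the
+ * highest byte zero (it is reserved for the F/H bits) and returns
+ * 0x00666f6f00000000, i.e. 'f' 'o' 'o' followed by zero padding, while
+ * pack_string("abcdefgh", 0) fills all eight bytes: 0x6162636465666768.
+ * reiser4_unpack_string() below restores "foo" and "abcdefgh" from these
+ * values.
+ */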
+
+/* opposite to pack_string(). Takes value produced by pack_string(), restores
+ * string encoded in it and stores result in @buf */
+char *reiser4_unpack_string(__u64 value, char *buf)
+{
+ do {
+ *buf = value >> (64 - 8);
+ if (*buf)
+ ++buf;
+ value <<= 8;
+ } while (value != 0);
+ *buf = 0;
+ return buf;
+}
+
+/* obtain name encoded in @key and store it in @buf */
+char *extract_name_from_key(const reiser4_key * key, char *buf)
+{
+ char *c;
+
+ assert("nikita-2868", !is_longname_key(key));
+
+ c = buf;
+ if (REISER4_LARGE_KEY) {
+ c = reiser4_unpack_string(get_key_ordering(key) &
+ ~fibration_mask, c);
+ c = reiser4_unpack_string(get_key_fulloid(key), c);
+ } else
+ c = reiser4_unpack_string(get_key_fulloid(key) &
+ ~fibration_mask, c);
+ reiser4_unpack_string(get_key_offset(key), c);
+ return buf;
+}
+
+/**
+ * complete_entry_key - calculate entry key by name
+ * @dir: directory where entry is (or will be) in
+ * @name: name to calculate key of
+ * @len: length of name
+ * @result: place to store result in
+ *
+ * Sets fields of entry key @result which depend on file name.
+ * When REISER4_LARGE_KEY is defined three fields of @result are set: ordering,
+ * objectid and offset. Otherwise, objectid and offset are set.
+ */
+void complete_entry_key(const struct inode *dir, const char *name,
+ int len, reiser4_key *result)
+{
+#if REISER4_LARGE_KEY
+ __u64 ordering;
+ __u64 objectid;
+ __u64 offset;
+
+ assert("nikita-1139", dir != NULL);
+ assert("nikita-1142", result != NULL);
+ assert("nikita-2867", strlen(name) == len);
+
+ /*
+ * key allocation algorithm for directory entries in case of large
+ * keys:
+ *
+	 * If the name is not longer than 7 + 8 + 8 = 23 characters, put the
+	 * first 7 characters into the ordering field of the key, the next 8
+	 * characters (if any) into the objectid field, and the next 8 (if any)
+	 * into the offset field.
+	 *
+	 * If the file name is longer than 23 characters, put the first 7
+	 * characters into the key's ordering, the next 8 into objectid, and a
+	 * hash of the remaining characters into the offset field.
+	 *
+	 * To distinguish the above cases, in the latter case set the unused
+	 * high bit in the ordering field.
+ */
+
+ /* [0-6] characters to ordering */
+ ordering = pack_string(name, 1);
+ if (len > 7) {
+ /* [7-14] characters to objectid */
+ objectid = pack_string(name + 7, 0);
+ if (len > 15) {
+ if (len <= 23) {
+ /* [15-23] characters to offset */
+ offset = pack_string(name + 15, 0);
+ } else {
+ /* note in a key the fact that offset contains
+ * hash */
+ ordering |= longname_mark;
+
+ /* offset is the hash of the file name's tail */
+ offset = inode_hash_plugin(dir)->hash(name + 15,
+ len - 15);
+ }
+ } else {
+ offset = 0ull;
+ }
+ } else {
+ objectid = 0ull;
+ offset = 0ull;
+ }
+
+ assert("nikita-3480", inode_fibration_plugin(dir) != NULL);
+ ordering |= inode_fibration_plugin(dir)->fibre(dir, name, len);
+
+ set_key_ordering(result, ordering);
+ set_key_fulloid(result, objectid);
+ set_key_offset(result, offset);
+ return;
+
+#else
+ __u64 objectid;
+ __u64 offset;
+
+ assert("nikita-1139", dir != NULL);
+ assert("nikita-1142", result != NULL);
+ assert("nikita-2867", strlen(name) == len);
+
+ /*
+ * key allocation algorithm for directory entries in case of not large
+ * keys:
+ *
+	 * If the name is not longer than 7 + 8 = 15 characters, put the first
+	 * 7 characters into the objectid field of the key and the next 8
+	 * characters (if any) into the offset field.
+	 *
+	 * If the file name is longer than 15 characters, put the first 7
+	 * characters into the key's objectid, and a hash of the remaining
+	 * characters into the offset field.
+	 *
+	 * To distinguish the above cases, in the latter case set the unused
+	 * high bit in the objectid field.
+ */
+
+ /* [0-6] characters to objectid */
+ objectid = pack_string(name, 1);
+ if (len > 7) {
+ if (len <= 15) {
+ /* [7-14] characters to offset */
+ offset = pack_string(name + 7, 0);
+ } else {
+ /* note in a key the fact that offset contains hash. */
+ objectid |= longname_mark;
+
+ /* offset is the hash of the file name. */
+ offset = inode_hash_plugin(dir)->hash(name + 7,
+ len - 7);
+ }
+ } else
+ offset = 0ull;
+
+ assert("nikita-3480", inode_fibration_plugin(dir) != NULL);
+ objectid |= inode_fibration_plugin(dir)->fibre(dir, name, len);
+
+ set_key_fulloid(result, objectid);
+ set_key_offset(result, offset);
+ return;
+#endif /* ! REISER4_LARGE_KEY */
+}
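+
+/*
+ * Worked example (illustration only, REISER4_LARGE_KEY case): for the name
+ * "README" (6 characters) the whole name ends up in the ordering field
+ * (together with the directory's fibration bits), while objectid and offset
+ * are zero. For a 30-character name, ordering receives the first 7
+ * characters plus the longname_mark bit, objectid receives characters
+ * 7..14, and offset holds the hash of the remaining 15 characters.
+ */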
+
+/* true, if @key is the key of "." */
+int is_dot_key(const reiser4_key * key/* key to check */)
+{
+ assert("nikita-1717", key != NULL);
+ assert("nikita-1718", get_key_type(key) == KEY_FILE_NAME_MINOR);
+ return
+ (get_key_ordering(key) == 0ull) &&
+ (get_key_objectid(key) == 0ull) && (get_key_offset(key) == 0ull);
+}
+
+/* build key for stat-data.
+
+   return key of stat-data of this object. This should become an sd plugin
+ method in the future. For now, let it be here.
+
+*/
+reiser4_key *build_sd_key(const struct inode *target /* inode of an object */ ,
+ reiser4_key * result /* resulting key of @target
+ stat-data */ )
+{
+ assert("nikita-261", result != NULL);
+
+ reiser4_key_init(result);
+ set_key_locality(result, reiser4_inode_data(target)->locality_id);
+ set_key_ordering(result, get_inode_ordering(target));
+ set_key_objectid(result, get_inode_oid(target));
+ set_key_type(result, KEY_SD_MINOR);
+ set_key_offset(result, (__u64) 0);
+ return result;
+}
+
+/* encode part of key into &obj_key_id
+
+ This encodes into @id part of @key sufficient to restore @key later,
+ given that latter is key of object (key of stat-data).
+
+ See &obj_key_id
+*/
+int build_obj_key_id(const reiser4_key * key /* key to encode */ ,
+ obj_key_id * id/* id where key is encoded in */)
+{
+ assert("nikita-1151", key != NULL);
+ assert("nikita-1152", id != NULL);
+
+ memcpy(id, key, sizeof *id);
+ return 0;
+}
+
+/* encode reference to @obj in @id.
+
+ This is like build_obj_key_id() above, but takes inode as parameter. */
+int build_inode_key_id(const struct inode *obj /* object to build key of */ ,
+ obj_key_id * id/* result */)
+{
+ reiser4_key sdkey;
+
+ assert("nikita-1166", obj != NULL);
+ assert("nikita-1167", id != NULL);
+
+ build_sd_key(obj, &sdkey);
+ build_obj_key_id(&sdkey, id);
+ return 0;
+}
+
+/* decode @id back into @key
+
+ Restore key of object stat-data from @id. This is dual to
+ build_obj_key_id() above.
+*/
+int extract_key_from_id(const obj_key_id * id /* object key id to extract key
+ * from */ ,
+ reiser4_key * key/* result */)
+{
+ assert("nikita-1153", id != NULL);
+ assert("nikita-1154", key != NULL);
+
+ reiser4_key_init(key);
+ memcpy(key, id, sizeof *id);
+ return 0;
+}
+
+/* extract objectid of directory from key of directory entry within said
+ directory.
+ */
+oid_t extract_dir_id_from_key(const reiser4_key * de_key /* key of
+ * directory
+ * entry */ )
+{
+ assert("nikita-1314", de_key != NULL);
+ return get_key_locality(de_key);
+}
+
+/* encode into @id key of directory entry.
+
+ Encode into @id information sufficient to later distinguish directory
+ entries within the same directory. This is not whole key, because all
+ directory entries within directory item share locality which is equal
+ to objectid of their directory.
+
+*/
+int build_de_id(const struct inode *dir /* inode of directory */ ,
+ const struct qstr *name /* name to be given to @obj by
+ * directory entry being
+ * constructed */ ,
+ de_id * id/* short key of directory entry */)
+{
+ reiser4_key key;
+
+ assert("nikita-1290", dir != NULL);
+ assert("nikita-1292", id != NULL);
+
+ /* NOTE-NIKITA this is suboptimal. */
+ inode_dir_plugin(dir)->build_entry_key(dir, name, &key);
+ return build_de_id_by_key(&key, id);
+}
+
+/* encode into @id key of directory entry.
+
+ Encode into @id information sufficient to later distinguish directory
+ entries within the same directory. This is not whole key, because all
+ directory entries within directory item share locality which is equal
+ to objectid of their directory.
+
+*/
+int build_de_id_by_key(const reiser4_key * entry_key /* full key of directory
+ * entry */ ,
+ de_id * id/* short key of directory entry */)
+{
+ memcpy(id, ((__u64 *) entry_key) + 1, sizeof *id);
+ return 0;
+}
+
+/* restore from @id key of directory entry.
+
+ Function dual to build_de_id(): given @id and locality, build full
+ key of directory entry within directory item.
+
+*/
+int extract_key_from_de_id(const oid_t locality /* locality of directory
+ * entry */ ,
+ const de_id * id /* directory entry id */ ,
+ reiser4_key * key/* result */)
+{
+ /* no need to initialise key here: all fields are overwritten */
+ memcpy(((__u64 *) key) + 1, id, sizeof *id);
+ set_key_locality(key, locality);
+ set_key_type(key, KEY_FILE_NAME_MINOR);
+ return 0;
+}
+
+/* compare two &de_id's */
+cmp_t de_id_cmp(const de_id * id1 /* first &de_id to compare */ ,
+ const de_id * id2/* second &de_id to compare */)
+{
+ /* NOTE-NIKITA ugly implementation */
+ reiser4_key k1;
+ reiser4_key k2;
+
+ extract_key_from_de_id((oid_t) 0, id1, &k1);
+ extract_key_from_de_id((oid_t) 0, id2, &k2);
+ return keycmp(&k1, &k2);
+}
+
+/* compare &de_id with key */
+cmp_t de_id_key_cmp(const de_id * id /* directory entry id to compare */ ,
+ const reiser4_key * key/* key to compare */)
+{
+ reiser4_key *k1;
+
+ k1 = (reiser4_key *) (((unsigned long)id) - sizeof key->el[0]);
+ return short_keycmp(k1, key);
+}
+
+/*
+ * return number of bytes necessary to encode @inode identity.
+ */
+int inode_onwire_size(const struct inode *inode)
+{
+ int result;
+
+ result = dscale_bytes_to_write(get_inode_oid(inode));
+ result += dscale_bytes_to_write(get_inode_locality(inode));
+
+ /*
+ * ordering is large (it usually has highest bits set), so it makes
+ * little sense to dscale it.
+ */
+ if (REISER4_LARGE_KEY)
+ result += sizeof(get_inode_ordering(inode));
+ return result;
+}
+
+/*
+ * encode @inode identity at @start
+ */
+char *build_inode_onwire(const struct inode *inode, char *start)
+{
+ start += dscale_write(start, get_inode_locality(inode));
+ start += dscale_write(start, get_inode_oid(inode));
+
+ if (REISER4_LARGE_KEY) {
+ put_unaligned(cpu_to_le64(get_inode_ordering(inode)), (__le64 *)start);
+ start += sizeof(get_inode_ordering(inode));
+ }
+ return start;
+}
+
+/*
+ * extract key that was previously encoded by build_inode_onwire() at @addr
+ */
+char *extract_obj_key_id_from_onwire(char *addr, obj_key_id * key_id)
+{
+ __u64 val;
+
+ addr += dscale_read(addr, &val);
+ val = (val << KEY_LOCALITY_SHIFT) | KEY_SD_MINOR;
+ put_unaligned(cpu_to_le64(val), (__le64 *)key_id->locality);
+ addr += dscale_read(addr, &val);
+ put_unaligned(cpu_to_le64(val), (__le64 *)key_id->objectid);
+#if REISER4_LARGE_KEY
+ memcpy(&key_id->ordering, addr, sizeof key_id->ordering);
+ addr += sizeof key_id->ordering;
+#endif
+ return addr;
+}
+
+/*
+ * skip a key that was previously encoded by build_inode_onwire() at @addr
+ * FIXME: handle IO errors.
+ */
+char * locate_obj_key_id_onwire(char * addr)
+{
+ /* locality */
+ addr += dscale_bytes_to_read(addr);
+ /* objectid */
+ addr += dscale_bytes_to_read(addr);
+#if REISER4_LARGE_KEY
+ addr += sizeof ((obj_key_id *)0)->ordering;
+#endif
+ return addr;
+}
+
+/* Make Linus happy.
+ Local variables:
+ c-indentation-style: "K&R"
+ mode-name: "LC"
+ c-basic-offset: 8
+ tab-width: 8
+ fill-column: 120
+ End:
+*/
diff -urN --no-dereference linux-5.10.2.orig/fs/reiser4/kassign.h linux-5.10.2/fs/reiser4/kassign.h
--- linux-5.10.2.orig/fs/reiser4/kassign.h 1970-01-01 01:00:00.000000000 +0100
+++ linux-5.10.2/fs/reiser4/kassign.h 2020-12-23 16:07:46.119813143 +0100
@@ -0,0 +1,111 @@
+/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by
+ * reiser4/README */
+
+/* Key assignment policy interface. See kassign.c for details. */
+
+#if !defined(__KASSIGN_H__)
+#define __KASSIGN_H__
+
+#include "forward.h"
+#include "key.h"
+#include "dformat.h"
+
+#include <linux/types.h> /* for __u?? */
+#include <linux/fs.h> /* for struct super_block, etc */
+#include <linux/dcache.h> /* for struct qstr */
+
+/* key assignment functions */
+
+/* Information from which key of file stat-data can be uniquely
+ restored. This depends on key assignment policy for
+ stat-data. Currently it's enough to store object id and locality id
+ (60+60==120) bits, because minor packing locality and offset of
+ stat-data key are always known constants: KEY_SD_MINOR and 0
+ respectively. For simplicity 4 bits are wasted in each id, and just
+ two 64 bit integers are stored.
+
+ This field has to be byte-aligned, because we don't want to waste
+ space in directory entries. There is another side of the coin, of
+ course: we waste CPU and bus bandwidth instead, by copying data back
+ and forth.
+
+ Next optimization: &obj_key_id is mainly used to address stat data from
+ directory entries. Under the assumption that the majority of files have
+ only one name (one hard link) in *the* parent directory, it seems
+ reasonable to store only the objectid of the stat data and take its
+ locality from the key of the directory item.
+
+ This requires some flag to be added to &obj_key_id to distinguish
+ between these two cases. The remaining bits in the flag byte could then
+ be used to store the file type.
+
+ This optimization requires changes in directory item handling code.
+
+*/
+typedef struct obj_key_id {
+ d8 locality[sizeof(__u64)];
+ ON_LARGE_KEY(d8 ordering[sizeof(__u64)];
+ )
+ d8 objectid[sizeof(__u64)];
+}
+obj_key_id;
+
+/* Information sufficient to uniquely identify directory entry within
+ compressed directory item.
+
+ For alignment issues see &obj_key_id above.
+*/
+typedef struct de_id {
+ ON_LARGE_KEY(d8 ordering[sizeof(__u64)];)
+ d8 objectid[sizeof(__u64)];
+ d8 offset[sizeof(__u64)];
+}
+de_id;
+
+extern int inode_onwire_size(const struct inode *obj);
+extern char *build_inode_onwire(const struct inode *obj, char *area);
+extern char *locate_obj_key_id_onwire(char *area);
+extern char *extract_obj_key_id_from_onwire(char *area, obj_key_id * key_id);
+
+extern int build_inode_key_id(const struct inode *obj, obj_key_id * id);
+extern int extract_key_from_id(const obj_key_id * id, reiser4_key * key);
+extern int build_obj_key_id(const reiser4_key * key, obj_key_id * id);
+extern oid_t extract_dir_id_from_key(const reiser4_key * de_key);
+extern int build_de_id(const struct inode *dir, const struct qstr *name,
+ de_id * id);
+extern int build_de_id_by_key(const reiser4_key * entry_key, de_id * id);
+extern int extract_key_from_de_id(const oid_t locality, const de_id * id,
+ reiser4_key * key);
+extern cmp_t de_id_cmp(const de_id * id1, const de_id * id2);
+extern cmp_t de_id_key_cmp(const de_id * id, const reiser4_key * key);
+
+extern int build_readdir_key_common(struct file *dir, reiser4_key * result);
+extern void build_entry_key_common(const struct inode *dir,
+ const struct qstr *name,
+ reiser4_key * result);
+extern void build_entry_key_stable_entry(const struct inode *dir,
+ const struct qstr *name,
+ reiser4_key * result);
+extern int is_dot_key(const reiser4_key * key);
+extern reiser4_key *build_sd_key(const struct inode *target,
+ reiser4_key * result);
+
+extern int is_longname_key(const reiser4_key * key);
+extern int is_longname(const char *name, int len);
+extern char *extract_name_from_key(const reiser4_key * key, char *buf);
+extern char *reiser4_unpack_string(__u64 value, char *buf);
+extern void complete_entry_key(const struct inode *dir, const char *name,
+ int len, reiser4_key *result);
+
+/* __KASSIGN_H__ */
+#endif
+
+/* Make Linus happy.
+ Local variables:
+ c-indentation-style: "K&R"
+ mode-name: "LC"
+ c-basic-offset: 8
+ tab-width: 8
+ fill-column: 120
+ End:
+*/
diff -urN --no-dereference linux-5.10.2.orig/fs/reiser4/Kconfig linux-5.10.2/fs/reiser4/Kconfig
--- linux-5.10.2.orig/fs/reiser4/Kconfig 1970-01-01 01:00:00.000000000 +0100
+++ linux-5.10.2/fs/reiser4/Kconfig 2020-12-23 16:07:46.119813143 +0100
@@ -0,0 +1,45 @@
+config REISER4_FS
+ tristate "Reiser4 (EXPERIMENTAL)"
+ select ZLIB_INFLATE
+ select ZLIB_DEFLATE
+ select LZO_COMPRESS
+ select LZO_DECOMPRESS
+ select ZSTD_COMPRESS
+ select ZSTD_DECOMPRESS
+ select CRYPTO
+ select CRYPTO_CRC32C
+ help
+ Reiser4 is a filesystem that performs all filesystem operations
+ as atomic transactions, which means that it either performs a
+ write, or it does not, and in the event of a crash it does not
+ partially perform it or corrupt it.
+
+ It stores files in dancing trees, which are like balanced trees but
+ faster. It packs small files together so that they share blocks
+ without wasting space. This means you can use it to store really
+ small files. It also means that it saves you disk space. It avoids
+ hassling you with anachronisms like having a maximum number of
+ inodes, and wasting space if you use less than that number.
+
+ Reiser4 is a distinct filesystem type from reiserfs (V3).
+ It's therefore not possible to use reiserfs file systems
+ with reiser4.
+
+ To learn more about reiser4, go to http://reiser4.wiki.kernel.org/
+
+config REISER4_OLD
+ bool "Enable Plan-A key allocation scheme"
+ depends on REISER4_FS
+ help
+ Say Y if you intend to mount old reiser4 partitions.
+	  Note that this will disable some new features, such as logical volumes.
+
+ If unsure, say N.
+
+config REISER4_DEBUG
+ bool "Enable reiser4 debug mode"
+ depends on REISER4_FS
+ help
+ Don't use this unless you are debugging reiser4.
+
+ If unsure, say N.
diff -urN --no-dereference linux-5.10.2.orig/fs/reiser4/key.c linux-5.10.2/fs/reiser4/key.c
--- linux-5.10.2.orig/fs/reiser4/key.c 1970-01-01 01:00:00.000000000 +0100
+++ linux-5.10.2/fs/reiser4/key.c 2020-12-23 16:07:46.119813143 +0100
@@ -0,0 +1,138 @@
+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
+ * reiser4/README */
+
+/* Key manipulations. */
+
+#include "debug.h"
+#include "key.h"
+#include "super.h"
+#include "reiser4.h"
+
+#include <linux/types.h> /* for __u?? */
+
+/* Minimal possible key: all components are zero. It is presumed that this is
+ independent of key scheme. */
+static const reiser4_key MINIMAL_KEY = {
+ .el = {
+ 0ull,
+ ON_LARGE_KEY(0ull,)
+ 0ull,
+ 0ull
+ }
+};
+
+/* Maximal possible key: all components are ~0. It is presumed that this is
+ independent of key scheme. */
+static const reiser4_key MAXIMAL_KEY = {
+ .el = {
+ __constant_cpu_to_le64(~0ull),
+ ON_LARGE_KEY(__constant_cpu_to_le64(~0ull),)
+ __constant_cpu_to_le64(~0ull),
+ __constant_cpu_to_le64(~0ull)
+ }
+};
+
+/* Initialize key. */
+void reiser4_key_init(reiser4_key * key/* key to init */)
+{
+ assert("nikita-1169", key != NULL);
+ memset(key, 0, sizeof *key);
+}
+
+/* minimal possible key in the tree. Return pointer to the static storage. */
+const reiser4_key * reiser4_min_key(void)
+{
+ return &MINIMAL_KEY;
+}
+
+/* maximum possible key in the tree. Return pointer to the static storage. */
+const reiser4_key * reiser4_max_key(void)
+{
+ return &MAXIMAL_KEY;
+}
+
+#if REISER4_DEBUG
+/* debugging aid: print symbolic name of key type */
+static const char *type_name(unsigned int key_type/* key type */)
+{
+ switch (key_type) {
+ case KEY_FILE_NAME_MINOR:
+ return "file name";
+ case KEY_SD_MINOR:
+ return "stat data";
+ case KEY_ATTR_NAME_MINOR:
+ return "attr name";
+ case KEY_ATTR_BODY_MINOR:
+ return "attr body";
+ case KEY_BODY_MINOR:
+ return "file body";
+ default:
+ return "unknown";
+ }
+}
+
+/* debugging aid: print human readable information about key */
+void reiser4_print_key(const char *prefix /* prefix to print */ ,
+ const reiser4_key * key/* key to print */)
+{
+ /* turn bold on */
+ /* printf ("\033[1m"); */
+ if (key == NULL)
+ printk("%s: null key\n", prefix);
+ else {
+ if (REISER4_LARGE_KEY)
+ printk("%s: (%Lx:%x:%Lx:%Lx:%Lx:%Lx)", prefix,
+ get_key_locality(key),
+ get_key_type(key),
+ get_key_ordering(key),
+ get_key_band(key),
+ get_key_objectid(key), get_key_offset(key));
+ else
+ printk("%s: (%Lx:%x:%Lx:%Lx:%Lx)", prefix,
+ get_key_locality(key),
+ get_key_type(key),
+ get_key_band(key),
+ get_key_objectid(key), get_key_offset(key));
+ /*
+ * if this is a key of directory entry, try to decode part of
+ * a name stored in the key, and output it.
+ */
+ if (get_key_type(key) == KEY_FILE_NAME_MINOR) {
+ char buf[DE_NAME_BUF_LEN];
+ char *c;
+
+ c = buf;
+ c = reiser4_unpack_string(get_key_ordering(key), c);
+ reiser4_unpack_string(get_key_fulloid(key), c);
+ printk("[%s", buf);
+ if (is_longname_key(key))
+ /*
+ * only part of the name is stored in the key.
+ */
+ printk("...]\n");
+ else {
+ /*
+ * whole name is stored in the key.
+ */
+ reiser4_unpack_string(get_key_offset(key), buf);
+ printk("%s]\n", buf);
+ }
+ } else {
+ printk("[%s]\n", type_name(get_key_type(key)));
+ }
+ }
+ /* turn bold off */
+ /* printf ("\033[m\017"); */
+}
+
+#endif
+
+/* Make Linus happy.
+ Local variables:
+ c-indentation-style: "K&R"
+ mode-name: "LC"
+ c-basic-offset: 8
+ tab-width: 8
+ fill-column: 120
+ End:
+*/
diff -urN --no-dereference linux-5.10.2.orig/fs/reiser4/key.h linux-5.10.2/fs/reiser4/key.h
--- linux-5.10.2.orig/fs/reiser4/key.h 1970-01-01 01:00:00.000000000 +0100
+++ linux-5.10.2/fs/reiser4/key.h 2020-12-23 16:07:46.119813143 +0100
@@ -0,0 +1,445 @@
+/* Copyright 2000, 2001, 2002, 2003 by Hans Reiser, licensing governed by
+ * reiser4/README */
+
+/* Declarations of key-related data-structures and operations on keys. */
+
+#if !defined(__REISER4_KEY_H__)
+#define __REISER4_KEY_H__
+
+#include "dformat.h"
+#include "forward.h"
+#include "debug.h"
+
+#include <linux/prefetch.h>
+#include <linux/types.h> /* for __u?? */
+
+/* Operations on keys in reiser4 tree */
+
+/* No access to any of these fields shall be done except via a
+ wrapping macro/function, and that wrapping macro/function shall
+   convert to little endian order. Key comparisons are done in cpu byte order. */
+
+/* A storage layer implementation difference between a regular unix file body
+ and its attributes is in the typedef below which causes all of the attributes
+ of a file to be near in key to all of the other attributes for all of the
+ files within that directory, and not near to the file itself. It is
+ interesting to consider whether this is the wrong approach, and whether there
+ should be no difference at all. For current usage patterns this choice is
+ probably the right one. */
+
+/* possible values for minor packing locality (4 bits required) */
+typedef enum {
+ /* file name */
+ KEY_FILE_NAME_MINOR = 0,
+ /* stat-data */
+ KEY_SD_MINOR = 1,
+ /* file attribute name */
+ KEY_ATTR_NAME_MINOR = 2,
+ /* file attribute value */
+ KEY_ATTR_BODY_MINOR = 3,
+ /* file body (tail or extent) */
+ KEY_BODY_MINOR = 4,
+} key_minor_locality;
+
+/* Everything stored in the tree has a unique key, which means that the tree is
+ (logically) fully ordered by key. Physical order is determined by dynamic
+ heuristics that attempt to reflect key order when allocating available space,
+ and by the repacker. It is stylistically better to put aggregation
+ information into the key. Thus, if you want to segregate extents from tails,
+ it is better to give them distinct minor packing localities rather than
+ changing block_alloc.c to check the node type when deciding where to allocate
+ the node.
+
+ The need to randomly displace new directories and large files disturbs this
+ symmetry unfortunately. However, it should be noted that this is a need that
+ is not clearly established given the existence of a repacker. Also, in our
+ current implementation tails have a different minor packing locality from
+ extents, and no files have both extents and tails, so maybe symmetry can be
+ had without performance cost after all. Symmetry is what we ship for now....
+*/
+
+/* Arbitrary major packing localities can be assigned to objects using
+ the reiser4(filenameA/..packing<=some_number) system call.
+
+ In reiser4, the creat() syscall creates a directory
+
+ whose default flow (that which is referred to if the directory is
+ read as a file) is the traditional unix file body.
+
+ whose directory plugin is the 'filedir'
+
+ whose major packing locality is that of the parent of the object created.
+
+ The static_stat item is a particular commonly used directory
+ compression (the one for normal unix files).
+
+ The filedir plugin checks to see if the static_stat item exists.
+ There is a unique key for static_stat. If yes, then it uses the
+ static_stat item for all of the values that it contains. The
+ static_stat item contains a flag for each stat it contains which
+ indicates whether one should look outside the static_stat item for its
+ contents.
+*/
+
+/* offset of fields in reiser4_key. The value of each element of this enum
+   is the index within the key (thought of as an array of __u64's) where the
+   field lives. */
+typedef enum {
+ /* major "locale", aka dirid. Sits in 1st element */
+ KEY_LOCALITY_INDEX = 0,
+ /* minor "locale", aka item type. Sits in 1st element */
+ KEY_TYPE_INDEX = 0,
+ ON_LARGE_KEY(KEY_ORDERING_INDEX,)
+ /* "object band". Sits in 2nd element */
+ KEY_BAND_INDEX,
+ /* objectid. Sits in 2nd element */
+ KEY_OBJECTID_INDEX = KEY_BAND_INDEX,
+ /* full objectid. Sits in 2nd element */
+ KEY_FULLOID_INDEX = KEY_BAND_INDEX,
+ /* Offset. Sits in 3rd element */
+ KEY_OFFSET_INDEX,
+ /* Name hash. Sits in 3rd element */
+ KEY_HASH_INDEX = KEY_OFFSET_INDEX,
+ KEY_CACHELINE_END = KEY_OFFSET_INDEX,
+ KEY_LAST_INDEX
+} reiser4_key_field_index;
+
+/* key in reiser4 internal "balanced" tree. It is just an array of three
+   (four, when REISER4_LARGE_KEY is enabled) 64bit integers in disk byte
+   order (little-endian by default). This array is actually indexed by
+   reiser4_key_field_index. Each __u64 within this array is called an
+   "element". Logical key components encoded within elements are called
+   "fields".
+
+   We declare this as a union with a second dummy component to suppress
+   inconvenient array<->pointer casts implied in C. */
+union reiser4_key {
+ __le64 el[KEY_LAST_INDEX];
+ int pad;
+};
+
+/* bitmasks showing where within reiser4_key a particular field is stored. */
+/* major locality occupies higher 60 bits of the first element */
+#define KEY_LOCALITY_MASK 0xfffffffffffffff0ull
+
+/* minor locality occupies lower 4 bits of the first element */
+#define KEY_TYPE_MASK 0xfull
+
+/* controversial band occupies higher 4 bits of the 2nd element */
+#define KEY_BAND_MASK 0xf000000000000000ull
+
+/* objectid occupies lower 60 bits of the 2nd element */
+#define KEY_OBJECTID_MASK 0x0fffffffffffffffull
+
+/* full 64bit objectid*/
+#define KEY_FULLOID_MASK 0xffffffffffffffffull
+
+/* offset is just the whole 3rd element itself */
+#define KEY_OFFSET_MASK 0xffffffffffffffffull
+
+/* ordering is whole second element */
+#define KEY_ORDERING_MASK 0xffffffffffffffffull
+
+/* how many bits a field is shifted to the left within its key element; the
+   getters mask the element and shift right by the same amount to extract it
+ */
+typedef enum {
+ KEY_LOCALITY_SHIFT = 4,
+ KEY_TYPE_SHIFT = 0,
+ KEY_BAND_SHIFT = 60,
+ KEY_OBJECTID_SHIFT = 0,
+ KEY_FULLOID_SHIFT = 0,
+ KEY_OFFSET_SHIFT = 0,
+ KEY_ORDERING_SHIFT = 0,
+} reiser4_key_field_shift;
+
+static inline __u64
+get_key_el(const reiser4_key * key, reiser4_key_field_index off)
+{
+ assert("nikita-753", key != NULL);
+ assert("nikita-754", off < KEY_LAST_INDEX);
+ return le64_to_cpu(get_unaligned(&key->el[off]));
+}
+
+static inline void
+set_key_el(reiser4_key * key, reiser4_key_field_index off, __u64 value)
+{
+ assert("nikita-755", key != NULL);
+ assert("nikita-756", off < KEY_LAST_INDEX);
+ put_unaligned(cpu_to_le64(value), &key->el[off]);
+}
+
+/* macro to define getter and setter functions for the field named L (upper-case name U) of type T */
+#define DEFINE_KEY_FIELD(L, U, T) \
+static inline T get_key_ ## L(const reiser4_key *key) \
+{ \
+ assert("nikita-750", key != NULL); \
+ return (T) (get_key_el(key, KEY_ ## U ## _INDEX) & \
+ KEY_ ## U ## _MASK) >> KEY_ ## U ## _SHIFT; \
+} \
+ \
+static inline void set_key_ ## L(reiser4_key * key, T loc) \
+{ \
+ __u64 el; \
+ \
+ assert("nikita-752", key != NULL); \
+ \
+ el = get_key_el(key, KEY_ ## U ## _INDEX); \
+ /* clear field bits in the key */ \
+ el &= ~KEY_ ## U ## _MASK; \
+ /* actually it should be \
+ \
+ el |= ( loc << KEY_ ## U ## _SHIFT ) & KEY_ ## U ## _MASK; \
+ \
+ but we trust user to never pass values that wouldn't fit \
+ into field. Clearing extra bits is one operation, but this \
+ function is time-critical. \
+ But check this in assertion. */ \
+ assert("nikita-759", ((loc << KEY_ ## U ## _SHIFT) & \
+ ~KEY_ ## U ## _MASK) == 0); \
+ el |= (loc << KEY_ ## U ## _SHIFT); \
+ set_key_el(key, KEY_ ## U ## _INDEX, el); \
+}
+
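+/* Example (an illustrative sketch, not extra machinery): the invocation
+   DEFINE_KEY_FIELD(band, BAND, __u64) below expands, with assertions
+   omitted, to roughly
+
+	static inline __u64 get_key_band(const reiser4_key *key)
+	{
+		return (__u64) (get_key_el(key, KEY_BAND_INDEX) &
+				KEY_BAND_MASK) >> KEY_BAND_SHIFT;
+	}
+
+	static inline void set_key_band(reiser4_key *key, __u64 loc)
+	{
+		__u64 el;
+
+		el = get_key_el(key, KEY_BAND_INDEX);
+		el &= ~KEY_BAND_MASK;
+		el |= (loc << KEY_BAND_SHIFT);
+		set_key_el(key, KEY_BAND_INDEX, el);
+	}
+
+   i.e. a field is extracted by masking it out of its element and shifting it
+   down, and stored by the inverse operation. */
+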
+typedef __u64 oid_t;
+
+/* define get_key_locality(), set_key_locality() */
+DEFINE_KEY_FIELD(locality, LOCALITY, oid_t);
+/* define get_key_type(), set_key_type() */
+DEFINE_KEY_FIELD(type, TYPE, key_minor_locality);
+/* define get_key_band(), set_key_band() */
+DEFINE_KEY_FIELD(band, BAND, __u64);
+/* define get_key_objectid(), set_key_objectid() */
+DEFINE_KEY_FIELD(objectid, OBJECTID, oid_t);
+/* define get_key_fulloid(), set_key_fulloid() */
+DEFINE_KEY_FIELD(fulloid, FULLOID, oid_t);
+/* define get_key_offset(), set_key_offset() */
+DEFINE_KEY_FIELD(offset, OFFSET, __u64);
+#if (REISER4_LARGE_KEY)
+/* define get_key_ordering(), set_key_ordering() */
+DEFINE_KEY_FIELD(ordering, ORDERING, __u64);
+#else
+static inline __u64 get_key_ordering(const reiser4_key * key)
+{
+ return 0;
+}
+
+static inline void set_key_ordering(reiser4_key * key, __u64 val)
+{
+}
+#endif /* REISER4_LARGE_KEY */
+
+/* key comparison result */
+typedef enum {
+ LESS_THAN = -1, /* if first key is less than second */
+ EQUAL_TO = 0, /* if keys are equal */
+ GREATER_THAN = +1 /* if first key is greater than second */
+} cmp_t;
+
+void reiser4_key_init(reiser4_key * key);
+
+/* minimal and maximal possible keys in the tree. Each returns a pointer to
+   static storage. */
+extern const reiser4_key *reiser4_min_key(void);
+extern const reiser4_key *reiser4_max_key(void);
+
+/* helper macro for keycmp() */
+#define KEY_DIFF(k1, k2, field) \
+({ \
+ typeof(get_key_ ## field(k1)) f1; \
+ typeof(get_key_ ## field(k2)) f2; \
+ \
+ f1 = get_key_ ## field(k1); \
+ f2 = get_key_ ## field(k2); \
+ \
+ (f1 < f2) ? LESS_THAN : ((f1 == f2) ? EQUAL_TO : GREATER_THAN); \
+})
+
+/* helper macro for keycmp() */
+#define KEY_DIFF_EL(k1, k2, off) \
+({ \
+ __u64 e1; \
+ __u64 e2; \
+ \
+ e1 = get_key_el(k1, off); \
+ e2 = get_key_el(k2, off); \
+ \
+ (e1 < e2) ? LESS_THAN : ((e1 == e2) ? EQUAL_TO : GREATER_THAN); \
+})
+
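+/* A note on the KEY_DIFF_EL() shortcut used by keycmp() below: within
+   element 0 the major locality occupies the high 60 bits and the minor
+   locality (type) the low 4 bits, so an unsigned compare of the whole
+   element orders keys first by locality and only then by type. For example
+   (with made-up numbers), the element 0x291 (locality 0x29, type 1) compares
+   less than 0x2a0 (locality 0x2a, type 0) regardless of the types. */
+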
+/**
+ * compare `k1' and `k2'. The following pair of functions is the heart of the
+ * "key allocation policy". All you need to do to implement a new policy is
+ * to add yet another clause here.
+ */
+static inline cmp_t keycmp(const reiser4_key * k1 /* first key to compare */,
+ const reiser4_key * k2/* second key to compare */)
+{
+ cmp_t result;
+
+ /*
+ * This function is the heart of reiser4 tree-routines. Key comparison
+ * is among most heavily used operations in the file system.
+ */
+
+ assert("nikita-439", k1 != NULL);
+ assert("nikita-440", k2 != NULL);
+
+ /* there is no actual branch here: condition is compile time constant
+ * and constant folding and propagation ensures that only one branch
+ * is actually compiled in. */
+
+	/* In the Plan-A and Plan-B schemes we compare type and locality at
+	   once, because their physical order within the key element is
+	   identical to the logical one. */
+ if (REISER4_PLANA_KEY_ALLOCATION) {
+ /* logical order of fields in plan-A:
+ locality->type->(ordering)->objectid->offset */
+
+ result = KEY_DIFF_EL(k1, k2, 0);
+ if (result == EQUAL_TO) {
+ result = KEY_DIFF_EL(k1, k2, 1);
+ if (result == EQUAL_TO) {
+ result = KEY_DIFF_EL(k1, k2, 2);
+ if (REISER4_LARGE_KEY && result == EQUAL_TO)
+ result = KEY_DIFF_EL(k1, k2, 3);
+ }
+ }
+ } else if (REISER4_PLANB_KEY_ALLOCATION) {
+ /* logical order of fields in plan-B:
+ locality->type->objectid->offset->(ordering) */
+
+ result = KEY_DIFF_EL(k1, k2, 0);
+ if (result == EQUAL_TO) {
+ result = KEY_DIFF_EL(k1, k2, 2);
+ if (result == EQUAL_TO) {
+ result = KEY_DIFF_EL(k1, k2, 3);
+ if (REISER4_LARGE_KEY && result == EQUAL_TO)
+ result = KEY_DIFF_EL(k1, k2, 1);
+ }
+ }
+ } else if (REISER4_3_5_KEY_ALLOCATION) {
+ /* Old good key allocation scheme from ReiserFS(v3)
+ FIXME: support it in Reiser4progs */
+
+ result = KEY_DIFF(k1, k2, locality);
+ if (result == EQUAL_TO) {
+ result = KEY_DIFF(k1, k2, objectid);
+ if (result == EQUAL_TO) {
+ result = KEY_DIFF(k1, k2, type);
+ if (result == EQUAL_TO)
+ result = KEY_DIFF(k1, k2, offset);
+ }
+ }
+ } else
+ impossible("nikita-441", "Unknown key allocation scheme!");
+ return result;
+}
+
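+/* A worked consequence of the Plan-A ordering (illustrative only): since an
+   object's major packing locality is normally that of its parent directory,
+   the stat-data items of two files created in the same directory share a
+   locality and differ mainly in objectid, so keycmp() places them next to
+   each other, which is exactly the clustering described in the packing
+   locality comments above. */
+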
+/**
+ * compare "sub-keys" of @k1 and @k2 (i.e. keys without the first component).
+ */
+static inline cmp_t short_keycmp(const reiser4_key *k1, const reiser4_key *k2)
+{
+ cmp_t result;
+
+ if (REISER4_PLANA_KEY_ALLOCATION) {
+ result = KEY_DIFF_EL(k1, k2, 1);
+ if (result == EQUAL_TO) {
+ result = KEY_DIFF_EL(k1, k2, 2);
+ if (REISER4_LARGE_KEY && result == EQUAL_TO)
+ result = KEY_DIFF_EL(k1, k2, 3);
+ }
+ } else if (REISER4_PLANB_KEY_ALLOCATION) {
+ result = KEY_DIFF_EL(k1, k2, 2);
+ if (result == EQUAL_TO) {
+ result = KEY_DIFF_EL(k1, k2, 3);
+ if (REISER4_LARGE_KEY && result == EQUAL_TO)
+ result = KEY_DIFF_EL(k1, k2, 1);
+ }
+ } else
+ impossible("edward-2142", "Unsupported key allocation scheme!");
+ return result;
+}
+
+/* true if @k1 equals @k2 */
+static inline int keyeq(const reiser4_key * k1 /* first key to compare */ ,
+ const reiser4_key * k2/* second key to compare */)
+{
+ assert("nikita-1879", k1 != NULL);
+ assert("nikita-1880", k2 != NULL);
+ return !memcmp(k1, k2, sizeof *k1);
+}
+
+/* true if @k1 is less than @k2 */
+static inline int keylt(const reiser4_key * k1 /* first key to compare */ ,
+ const reiser4_key * k2/* second key to compare */)
+{
+ assert("nikita-1952", k1 != NULL);
+ assert("nikita-1953", k2 != NULL);
+ return keycmp(k1, k2) == LESS_THAN;
+}
+
+/* true if @k1 is less than or equal to @k2 */
+static inline int keyle(const reiser4_key * k1 /* first key to compare */ ,
+ const reiser4_key * k2/* second key to compare */)
+{
+ assert("nikita-1954", k1 != NULL);
+ assert("nikita-1955", k2 != NULL);
+ return keycmp(k1, k2) != GREATER_THAN;
+}
+
+/* true if @k1 is greater than @k2 */
+static inline int keygt(const reiser4_key * k1 /* first key to compare */ ,
+ const reiser4_key * k2/* second key to compare */)
+{
+ assert("nikita-1959", k1 != NULL);
+ assert("nikita-1960", k2 != NULL);
+ return keycmp(k1, k2) == GREATER_THAN;
+}
+
+/* true if @k1 is greater than or equal to @k2 */
+static inline int keyge(const reiser4_key * k1 /* first key to compare */ ,
+ const reiser4_key * k2/* second key to compare */)
+{
+ assert("nikita-1956", k1 != NULL);
+ assert("nikita-1957", k2 != NULL); /* October 4: sputnik launched
+ * November 3: Laika */
+ return keycmp(k1, k2) != LESS_THAN;
+}
+
+static inline int all_but_ordering_keyeq(const reiser4_key * k1,
+ const reiser4_key * k2)
+{
+ return (get_key_locality(k1) == get_key_locality(k2) &&
+ get_key_type(k1) == get_key_type(k2) &&
+ get_key_fulloid(k1) == get_key_fulloid(k2) &&
+ get_key_offset(k1) == get_key_offset(k2));
+}
+
+static inline void prefetchkey(reiser4_key * key)
+{
+ prefetch(key);
+ prefetch(&key->el[KEY_CACHELINE_END]);
+}
+
+/* (%Lx:%x:%Lx:%Lx:%Lx:%Lx) =
+ 1 + 16 + 1 + 1 + 1 + 1 + 1 + 16 + 1 + 16 + 1 + 16 + 1 */
+/* size of a buffer suitable to hold human readable key representation */
+#define KEY_BUF_LEN (80)
+
+#if REISER4_DEBUG
+extern void reiser4_print_key(const char *prefix, const reiser4_key * key);
+#else
+#define reiser4_print_key(p, k) noop
+#endif
+
+/* __REISER4_KEY_H__ */
+#endif
+
+/* Make Linus happy.
+ Local variables:
+ c-indentation-style: "K&R"
+ mode-name: "LC"
+ c-basic-offset: 8
+ tab-width: 8
+ fill-column: 120
+ End:
+*/
diff -urN --no-dereference linux-5.10.2.orig/fs/reiser4/ktxnmgrd.c linux-5.10.2/fs/reiser4/ktxnmgrd.c
--- linux-5.10.2.orig/fs/reiser4/ktxnmgrd.c 1970-01-01 01:00:00.000000000 +0100
+++ linux-5.10.2/fs/reiser4/ktxnmgrd.c 2020-12-23 16:07:46.119813143 +0100
@@ -0,0 +1,213 @@
+/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
+/* Transaction manager daemon. */
+
+/*
+ * ktxnmgrd is a kernel daemon responsible for committing transactions. It is
+ * needed/important for the following reasons:
+ *
+ * 1. in reiser4 an atom is not committed immediately when the last
+ * transaction handle closes, unless the atom is either too old or too large
+ * (see atom_should_commit()). This is done to avoid committing too
+ * frequently.
+ *
+ * 2. sometimes we don't want to commit an atom when closing the last
+ * transaction handle even if it is old and large enough. For example,
+ * because we are at this point under a directory semaphore, and committing
+ * would stall all accesses to this directory.
+ *
+ * ktxnmgrd bides its time sleeping on a wait queue. When it awakes, either
+ * due to a (tunable) timeout or because it was explicitly woken up by a call
+ * to ktxnmgrd_kick(), it scans the list of all atoms and commits the
+ * eligible ones.
+ *
+ */
+
+#include "debug.h"
+#include "txnmgr.h"
+#include "tree.h"
+#include "ktxnmgrd.h"
+#include "super.h"
+#include "reiser4.h"
+
+#include <linux/sched.h> /* for struct task_struct */
+#include <linux/wait.h>
+#include <linux/suspend.h>
+#include <linux/kernel.h>
+#include <linux/writeback.h>
+#include <linux/kthread.h>
+#include <linux/freezer.h>
+
+static int scan_mgr(struct super_block *);
+
+/*
+ * change current->comm so that ps, top, and friends will see the changed
+ * state. This serves no useful purpose whatsoever, but also costs nothing.
+ * Maybe it will make a lonely system administrator feel less alone at 3 A.M.
+ */
+#define set_comm(state) \
+ snprintf(current->comm, sizeof(current->comm), \
+ "%s:%s:%s", __FUNCTION__, (super)->s_id, (state))
+
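+/*
+ * For example, assuming a filesystem whose s_id is "sda1", inside ktxnmgrd()
+ * the macro above produces a comm string like "ktxnmgrd:sda1:wait";
+ * snprintf() silently truncates it to sizeof(current->comm).
+ */
+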
+/**
+ * ktxnmgrd - kernel txnmgr daemon
+ * @arg: pointer to super block
+ *
+ * The background transaction manager daemon, started as a kernel thread during
+ * reiser4 initialization.
+ */
+static int ktxnmgrd(void *arg)
+{
+ struct super_block *super;
+ ktxnmgrd_context *ctx;
+ txn_mgr *mgr;
+ int done = 0;
+
+ super = arg;
+ mgr = &get_super_private(super)->tmgr;
+
+ /*
+ * do_fork() just copies task_struct into the new thread. ->fs_context
+ * shouldn't be copied of course. This shouldn't be a problem for the
+ * rest of the code though.
+ */
+ current->journal_info = NULL;
+ ctx = mgr->daemon;
+ while (1) {
+ try_to_freeze();
+ set_comm("wait");
+ {
+ DEFINE_WAIT(__wait);
+
+ prepare_to_wait(&ctx->wait, &__wait,
+ TASK_INTERRUPTIBLE);
+ if (kthread_should_stop())
+ done = 1;
+ else
+ schedule_timeout(ctx->timeout);
+ finish_wait(&ctx->wait, &__wait);
+ }
+ if (done)
+ break;
+ set_comm("run");
+ spin_lock(&ctx->guard);
+ /*
+ * wait timed out or ktxnmgrd was woken up by explicit request
+ * to commit something. Scan list of atoms in txnmgr and look
+ * for too old atoms.
+ */
+		do {
+			ctx->rescan = 0;
+			spin_unlock(&ctx->guard);
+			scan_mgr(super);
+			spin_lock(&ctx->guard);
+ if (ctx->rescan) {
+ /*
+ * the list could be modified while ctx
+ * spinlock was released, we have to repeat
+ * scanning from the beginning
+ */
+ break;
+ }
+ } while (ctx->rescan);
+ spin_unlock(&ctx->guard);
+ }
+ return 0;
+}
+
+#undef set_comm
+
+/**
+ * reiser4_init_ktxnmgrd - initialize ktxnmgrd context and start kernel daemon
+ * @super: pointer to super block
+ *
+ * Allocates and initializes ktxnmgrd_context, attaches it to transaction
+ * manager. Starts kernel txnmgr daemon. This is called on mount.
+ */
+int reiser4_init_ktxnmgrd(struct super_block *super)
+{
+ txn_mgr *mgr;
+ ktxnmgrd_context *ctx;
+
+ mgr = &get_super_private(super)->tmgr;
+
+ assert("zam-1014", mgr->daemon == NULL);
+
+ ctx = kzalloc(sizeof(ktxnmgrd_context), reiser4_ctx_gfp_mask_get());
+ if (!ctx)
+ return RETERR(-ENOMEM);
+
+ assert("nikita-2442", ctx != NULL);
+
+ init_waitqueue_head(&ctx->wait);
+
+ /*kcond_init(&ctx->startup);*/
+ spin_lock_init(&ctx->guard);
+ ctx->timeout = REISER4_TXNMGR_TIMEOUT;
+ ctx->rescan = 1;
+ mgr->daemon = ctx;
+
+ ctx->tsk = kthread_run(ktxnmgrd, super, "ktxnmgrd");
+ if (IS_ERR(ctx->tsk)) {
+ int ret = PTR_ERR(ctx->tsk);
+ mgr->daemon = NULL;
+ kfree(ctx);
+ return RETERR(ret);
+ }
+ return 0;
+}
+
+void ktxnmgrd_kick(txn_mgr *mgr)
+{
+ assert("nikita-3234", mgr != NULL);
+ assert("nikita-3235", mgr->daemon != NULL);
+ wake_up(&mgr->daemon->wait);
+}
+
+int is_current_ktxnmgrd(void)
+{
+ return (get_current_super_private()->tmgr.daemon->tsk == current);
+}
+
+/**
+ * scan_mgr - commit atoms which are to be committed
+ * @super: super block to commit atoms of
+ *
+ * Commits old atoms.
+ */
+static int scan_mgr(struct super_block *super)
+{
+ int ret;
+ reiser4_context ctx;
+
+ init_stack_context(&ctx, super);
+ ret = commit_some_atoms(&get_super_private(super)->tmgr);
+ reiser4_exit_context(&ctx);
+ return ret;
+}
+
+/**
+ * reiser4_done_ktxnmgrd - stop kernel thread and free ktxnmgrd context
+ * @super: super block being unmounted
+ *
+ * This is called on umount. Stops ktxnmgrd and frees the ktxnmgrd context.
+ */
+void reiser4_done_ktxnmgrd(struct super_block *super)
+{
+ txn_mgr *mgr;
+
+ mgr = &get_super_private(super)->tmgr;
+ assert("zam-1012", mgr->daemon != NULL);
+
+ kthread_stop(mgr->daemon->tsk);
+ kfree(mgr->daemon);
+ mgr->daemon = NULL;
+}
+
+/*
+ * Local variables:
+ * c-indentation-style: "K&R"
+ * mode-name: "LC"
+ * c-basic-offset: 8
+ * tab-width: 8
+ * fill-column: 120
+ * End:
+ */
diff -urN --no-dereference linux-5.10.2.orig/fs/reiser4/ktxnmgrd.h linux-5.10.2/fs/reiser4/ktxnmgrd.h
--- linux-5.10.2.orig/fs/reiser4/ktxnmgrd.h 1970-01-01 01:00:00.000000000 +0100
+++ linux-5.10.2/fs/reiser4/ktxnmgrd.h 2020-12-23 16:07:46.119813143 +0100
@@ -0,0 +1,52 @@
+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
+ * reiser4/README */
+
+/* Transaction manager daemon. See ktxnmgrd.c for comments. */
+
+#ifndef __KTXNMGRD_H__
+#define __KTXNMGRD_H__
+
+#include "txnmgr.h"
+
+#include <linux/fs.h>
+#include <linux/wait.h>
+#include <linux/completion.h>
+#include <linux/spinlock.h>
+#include <asm/atomic.h>
+#include <linux/sched.h> /* for struct task_struct */
+
+/* this structure keeps all data necessary to start up, shut down and
+ * communicate with ktxnmgrd. */
+struct ktxnmgrd_context {
+ /* wait queue head on which ktxnmgrd sleeps */
+ wait_queue_head_t wait;
+ /* spin lock protecting all fields of this structure */
+ spinlock_t guard;
+ /* timeout of sleeping on ->wait */
+ signed long timeout;
+ /* kernel thread running ktxnmgrd */
+ struct task_struct *tsk;
+ /* list of all file systems served by this ktxnmgrd */
+ struct list_head queue;
+ /* should ktxnmgrd repeat scanning of atoms? */
+ unsigned int rescan:1;
+};
+
+extern int reiser4_init_ktxnmgrd(struct super_block *);
+extern void reiser4_done_ktxnmgrd(struct super_block *);
+
+extern void ktxnmgrd_kick(txn_mgr * mgr);
+extern int is_current_ktxnmgrd(void);
+
+/* __KTXNMGRD_H__ */
+#endif
+
+/* Make Linus happy.
+ Local variables:
+ c-indentation-style: "K&R"
+ mode-name: "LC"
+ c-basic-offset: 8
+ tab-width: 8
+ fill-column: 120
+ End:
+*/
diff -urN --no-dereference linux-5.10.2.orig/fs/reiser4/lock.c linux-5.10.2/fs/reiser4/lock.c
--- linux-5.10.2.orig/fs/reiser4/lock.c 1970-01-01 01:00:00.000000000 +0100
+++ linux-5.10.2/fs/reiser4/lock.c 2020-12-23 16:07:46.119813143 +0100
@@ -0,0 +1,1237 @@
+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
+ * reiser4/README */
+
+/* Traditional deadlock avoidance is achieved by acquiring all locks in a single
+ order. V4 balances the tree from the bottom up, and searches the tree from
+ the top down, and that is really the way we want it, so tradition won't work
+ for us.
+
+ Instead we have two lock orderings, a high priority lock ordering, and a low
+ priority lock ordering. Each node in the tree has a lock in its znode.
+
+ Suppose we have a set of processes which lock (R/W) tree nodes. Each process
+ has a set (maybe empty) of already locked nodes ("process locked set"). Each
+ process may have a pending lock request to a node locked by another process.
+ Note: we lock and unlock, but do not transfer locks: it is possible
+ transferring locks instead would save some bus locking....
+
+ Deadlock occurs when we have a loop constructed from process locked sets and
+ lock request vectors.
+
+ NOTE: The reiser4 "tree" is a tree on disk, but its cached representation in
+ memory is extended with "znodes" with which we connect nodes with their left
+ and right neighbors using sibling pointers stored in the znodes. When we
+ perform balancing operations we often go from left to right and from right to
+ left.
+
+ +-P1-+ +-P3-+
+ |+--+| V1 |+--+|
+ ||N1|| -------> ||N3||
+ |+--+| |+--+|
+ +----+ +----+
+ ^ |
+ |V2 |V3
+ | v
+ +---------P2---------+
+ |+--+ +--+|
+ ||N2| -------- |N4||
+ |+--+ +--+|
+ +--------------------+
+
+ We solve this by ensuring that only low priority processes lock in top to
+ bottom order and from right to left, and high priority processes lock from
+ bottom to top and left to right.
+
+ ZAM-FIXME-HANS: order not just node locks in this way, order atom locks, and
+ kill those damn busy loops.
+ ANSWER(ZAM): atom locks (which are introduced by ASTAGE_CAPTURE_WAIT atom
+ stage) cannot be ordered that way. There are no rules what nodes can belong
+ to the atom and what nodes cannot. We cannot define what is right or left
+ direction, what is top or bottom. We can take immediate parent or side
+ neighbor of one node, but nobody guarantees that, say, left neighbor node is
+ not a far right neighbor for other nodes from the same atom. It breaks
+ deadlock avoidance rules and hi-low priority locking cannot be applied for
+ atom locks.
+
+ How does it help to avoid deadlocks ?
+
+ Suppose we have a deadlock with n processes. Processes from one priority
+ class never deadlock because they take locks in one consistent
+ order.
+
+ So, any possible deadlock loop must have low priority as well as high
+ priority processes. There are no other lock priority levels except low and
+ high. We know that any deadlock loop contains at least one node locked by a
+ low priority process and requested by a high priority process. If this
+ situation is caught and resolved it is sufficient to avoid deadlocks.
+
+ V4 DEADLOCK PREVENTION ALGORITHM IMPLEMENTATION.
+
+ The deadlock prevention algorithm is based on comparing
+ priorities of node owners (processes which keep znode locked) and
+ requesters (processes which want to acquire a lock on znode). We
+ implement a scheme where low-priority owners yield locks to
+ high-priority requesters. We created a signal passing system that
+ is used to ask low-priority processes to yield one or more locked
+ znodes.
+
+ The condition when a znode needs to change its owners is described by the
+ following formula:
+
+ #############################################
+ # #
+ # (number of high-priority requesters) > 0 #
+ # AND #
+ # (numbers of high-priority owners) == 0 #
+ # #
+ #############################################
+
+ Note that a low-priority process delays node releasing if another
+ high-priority process owns this node. So, slightly more strictly speaking,
+ to have a deadlock capable cycle you must have a loop in which a high
+ priority process is waiting on a low priority process to yield a node, which
+ is slightly different from saying a high priority process is waiting on a
+ node owned by a low priority process.
+
+ It is enough to avoid deadlocks if we prevent any low-priority process from
+ falling asleep if its locked set contains a node which satisfies the
+ deadlock condition.
+
+ That condition is implicitly or explicitly checked in all places where new
+ high-priority requests may be added or removed from node request queue or
+ high-priority process takes or releases a lock on node. The main
+ goal of these checks is to never lose the moment when node becomes "has
+ wrong owners" and send "must-yield-this-lock" signals to its low-pri owners
+ at that time.
+
+ The information about received signals is stored in the per-process
+ structure (lock stack) and analyzed before a low-priority process goes to
+   sleep but after a "fast" attempt to lock a node fails. Any signal wakes the
+   sleeping process up and forces it to re-check the lock status and received
+ signal info. If "must-yield-this-lock" signals were received the locking
+ primitive (longterm_lock_znode()) fails with -E_DEADLOCK error code.
+
+ V4 LOCKING DRAWBACKS
+
+ If we have already balanced on one level, and we are propagating our changes
+ upward to a higher level, it could be very messy to surrender all locks on
+ the lower level because we put so much computational work into it, and
+ reverting them to their state before they were locked might be very complex.
+ We also don't want to acquire all locks before performing balancing because
+ that would either be almost as much work as the balancing, or it would be
+ too conservative and lock too much. We want balancing to be done only at
+ high priority. Yet, we might want to go to the left one node and use some
+ of its empty space... So we make one attempt at getting the node to the left
+ using try_lock, and if it fails we do without it, because we didn't really
+ need it, it was only a nice to have.
+
+ LOCK STRUCTURES DESCRIPTION
+
+ The following data structures are used in the reiser4 locking
+ implementation:
+
+ All fields related to long-term locking are stored in znode->lock.
+
+ The lock stack is a per thread object. It owns all znodes locked by the
+ thread. One znode may be locked by several threads in case of read lock or
+ one znode may be write locked by one thread several times. The special link
+ objects (lock handles) support n<->m relation between znodes and lock
+ owners.
+
+ <Thread 1> <Thread 2>
+
+ +---------+ +---------+
+ | LS1 | | LS2 |
+ +---------+ +---------+
+ ^ ^
+ |---------------+ +----------+
+ v v v v
+ +---------+ +---------+ +---------+ +---------+
+ | LH1 | | LH2 | | LH3 | | LH4 |
+ +---------+ +---------+ +---------+ +---------+
+ ^ ^ ^ ^
+ | +------------+ |
+ v v v
+ +---------+ +---------+ +---------+
+ | Z1 | | Z2 | | Z3 |
+ +---------+ +---------+ +---------+
+
+ Thread 1 locked znodes Z1 and Z2, thread 2 locked znodes Z2 and Z3. The
+ picture above shows that lock stack LS1 has a list of 2 lock handles LH1 and
+ LH2, lock stack LS2 has a list with lock handles LH3 and LH4 on it. Znode
+ Z1 is locked by only one thread, znode has only one lock handle LH1 on its
+ list, similar situation is for Z3 which is locked by the thread 2 only. Z2
+ is locked (for read) twice by different threads and two lock handles are on
+ its list. Each lock handle represents a single relation of a locking of a
+ znode by a thread. Locking of a znode is an establishing of a locking
+ relation between the lock stack and the znode by adding of a new lock handle
+ to a list of lock handles, the lock stack. The lock stack links all lock
+ handles for all znodes locked by the lock stack. The znode list groups all
+ lock handles for all locks stacks which locked the znode.
+
+   Yet another relation may exist between a znode and lock owners. If the lock
+   procedure cannot immediately take a lock on an object it adds the lock
+   owner to the special `requestors' list belonging to the znode. That list
+   represents a queue of pending lock requests. Because one lock owner may
+   request only one lock object at a time, this is a 1->n relation between
+   lock objects and a lock owner, implemented as described above. Full
+   information (priority, pointers to lock and link objects) about each lock
+   request is stored in the lock owner structure in the `request' field.
+
+   SHORT-TERM LOCKING
+
+ This is a list of primitive operations over lock stacks / lock handles /
+ znodes and locking descriptions for them.
+
+   1. locking / unlocking, which is done by two list insertions/deletions: one
+      to/from the znode's list of lock handles, the other to/from the lock
+      stack's list of lock handles. The first insertion is protected by the
+      znode->lock.guard spinlock. The list owned by the lock stack can be
+      modified only by the thread that owns the lock stack, and nobody else
+      can modify/read it. There is nothing to be protected by a spinlock or
+      anything else.
+
+   2. adding/removing a lock request to/from the znode's requestors list. The
+      rule is that the znode->lock.guard spinlock should be taken for this.
+
+   3. we can traverse the list of lock handles and use references to the lock
+      stacks that locked a given znode if the znode->lock.guard spinlock is
+      taken.
+
+   4. If a lock stack is associated with a znode as a lock requestor or lock
+      owner, its existence is guaranteed by the znode->lock.guard spinlock.
+      Some of its (lock stack's) fields should be protected from being
+      accessed in parallel by two or more threads. Please look at the
+      lock_stack structure definition for the info on how those fields are
+      protected. */
+
+/* Znode lock and capturing intertwining. */
+/* In the current implementation we capture formatted nodes before locking
+   them. Take a look at longterm_lock_znode(): the reiser4_try_capture()
+   request precedes the locking request. The longterm_lock_znode() function
+   unconditionally captures the znode before even checking the locking
+   conditions.
+
+ Another variant is to capture znode after locking it. It was not tested, but
+ at least one deadlock condition is supposed to be there. One thread has
+ locked a znode (Node-1) and calls reiser4_try_capture() for it.
+ reiser4_try_capture() sleeps because znode's atom has CAPTURE_WAIT state.
+ Second thread is a flushing thread, its current atom is the atom Node-1
+ belongs to. Second thread wants to lock Node-1 and sleeps because Node-1
+ is locked by the first thread. The described situation is a deadlock. */
+
+#include "debug.h"
+#include "txnmgr.h"
+#include "znode.h"
+#include "jnode.h"
+#include "tree.h"
+#include "plugin/node/node.h"
+#include "super.h"
+
+#include <linux/spinlock.h>
+
+#if REISER4_DEBUG
+static int request_is_deadlock_safe(znode * , znode_lock_mode,
+ znode_lock_request);
+#endif
+
+/* Returns a lock owner associated with current thread */
+lock_stack *get_current_lock_stack(void)
+{
+ return &get_current_context()->stack;
+}
+
+/* Wakes up all low priority owners informing them about possible deadlock */
+static void wake_up_all_lopri_owners(znode * node)
+{
+ lock_handle *handle;
+
+ assert_spin_locked(&(node->lock.guard));
+ list_for_each_entry(handle, &node->lock.owners, owners_link) {
+ assert("nikita-1832", handle->node == node);
+ /* count this signal in owner->nr_signaled */
+ if (!handle->signaled) {
+ handle->signaled = 1;
+ atomic_inc(&handle->owner->nr_signaled);
+ /* Wake up a single process */
+ reiser4_wake_up(handle->owner);
+ }
+ }
+}
+
+/* Adds a lock to a lock owner, which means creating a link to the lock and
+   putting the link into the two lists all links are on (the doubly linked list
+   that forms the lock_stack, and the doubly linked list of links attached
+   to a lock).
+*/
+static inline void
+link_object(lock_handle * handle, lock_stack * owner, znode * node)
+{
+ assert("jmacd-810", handle->owner == NULL);
+ assert_spin_locked(&(node->lock.guard));
+
+ handle->owner = owner;
+ handle->node = node;
+
+ assert("reiser4-4",
+ ergo(list_empty_careful(&owner->locks), owner->nr_locks == 0));
+
+ /* add lock handle to the end of lock_stack's list of locks */
+ list_add_tail(&handle->locks_link, &owner->locks);
+ ON_DEBUG(owner->nr_locks++);
+ reiser4_ctx_gfp_mask_set();
+
+ /* add lock handle to the head of znode's list of owners */
+ list_add(&handle->owners_link, &node->lock.owners);
+ handle->signaled = 0;
+}
+
+/* Breaks a relation between a lock and its owner */
+static inline void unlink_object(lock_handle * handle)
+{
+ assert("zam-354", handle->owner != NULL);
+ assert("nikita-1608", handle->node != NULL);
+ assert_spin_locked(&(handle->node->lock.guard));
+ assert("nikita-1829", handle->owner == get_current_lock_stack());
+ assert("reiser4-5", handle->owner->nr_locks > 0);
+
+ /* remove lock handle from lock_stack's list of locks */
+ list_del(&handle->locks_link);
+ ON_DEBUG(handle->owner->nr_locks--);
+ reiser4_ctx_gfp_mask_set();
+ assert("reiser4-6",
+ ergo(list_empty_careful(&handle->owner->locks),
+ handle->owner->nr_locks == 0));
+ /* remove lock handle from znode's list of owners */
+ list_del(&handle->owners_link);
+ /* indicates that lock handle is free now */
+ handle->node = NULL;
+#if REISER4_DEBUG
+ INIT_LIST_HEAD(&handle->locks_link);
+ INIT_LIST_HEAD(&handle->owners_link);
+ handle->owner = NULL;
+#endif
+}
+
+/* Actually locks an object knowing that we are able to do this */
+static void lock_object(lock_stack * owner)
+{
+ struct lock_request *request;
+ znode *node;
+
+ request = &owner->request;
+ node = request->node;
+ assert_spin_locked(&(node->lock.guard));
+ if (request->mode == ZNODE_READ_LOCK) {
+ node->lock.nr_readers++;
+ } else {
+		/* check that we haven't switched from read to write lock */
+ assert("nikita-1840", node->lock.nr_readers <= 0);
+ /* We allow recursive locking; a node can be locked several
+ times for write by same process */
+ node->lock.nr_readers--;
+ }
+
+ link_object(request->handle, owner, node);
+
+ if (owner->curpri)
+ node->lock.nr_hipri_owners++;
+}
+
+/* Check for recursive write locking */
+static int recursive(lock_stack * owner)
+{
+ int ret;
+ znode *node;
+ lock_handle *lh;
+
+ node = owner->request.node;
+
+ /* Owners list is not empty for a locked node */
+ assert("zam-314", !list_empty_careful(&node->lock.owners));
+ assert("nikita-1841", owner == get_current_lock_stack());
+ assert_spin_locked(&(node->lock.guard));
+
+ lh = list_entry(node->lock.owners.next, lock_handle, owners_link);
+ ret = (lh->owner == owner);
+
+	/* Recursive read locking should be done the usual way */
+ assert("zam-315", !ret || owner->request.mode == ZNODE_WRITE_LOCK);
+ /* mixing of read/write locks is not allowed */
+ assert("zam-341", !ret || znode_is_wlocked(node));
+
+ return ret;
+}
+
+#if REISER4_DEBUG
+/* Returns true if the lock is held by the calling thread. */
+int znode_is_any_locked(const znode * node)
+{
+ lock_handle *handle;
+ lock_stack *stack;
+ int ret;
+
+ if (!znode_is_locked(node))
+ return 0;
+
+ stack = get_current_lock_stack();
+
+ spin_lock_stack(stack);
+
+ ret = 0;
+
+ list_for_each_entry(handle, &stack->locks, locks_link) {
+ if (handle->node == node) {
+ ret = 1;
+ break;
+ }
+ }
+
+ spin_unlock_stack(stack);
+
+ return ret;
+}
+
+#endif
+
+/* Returns true if a write lock is held by the calling thread. */
+int znode_is_write_locked(const znode * node)
+{
+ lock_stack *stack;
+ lock_handle *handle;
+
+ assert("jmacd-8765", node != NULL);
+
+ if (!znode_is_wlocked(node))
+ return 0;
+
+ stack = get_current_lock_stack();
+
+ /*
+ * When znode is write locked, all owner handles point to the same lock
+ * stack. Get pointer to lock stack from the first lock handle from
+ * znode's owner list
+ */
+ handle = list_entry(node->lock.owners.next, lock_handle, owners_link);
+
+ return (handle->owner == stack);
+}
+
+/* This "deadlock" condition is the essential part of reiser4 locking
+ implementation. This condition is checked explicitly by calling
+ check_deadlock_condition() or implicitly in all places where znode lock
+ state (set of owners and request queue) is changed. Locking code is
+ designed to use this condition to trigger procedure of passing object from
+ low priority owner(s) to high priority one(s).
+
+ The procedure results in passing an event (setting lock_handle->signaled
+ flag) and counting this event in nr_signaled field of owner's lock stack
+ object and wakeup owner's process.
+*/
+static inline int check_deadlock_condition(znode * node)
+{
+ assert_spin_locked(&(node->lock.guard));
+ return node->lock.nr_hipri_requests > 0
+ && node->lock.nr_hipri_owners == 0;
+}
+
+static int check_livelock_condition(znode * node, znode_lock_mode mode)
+{
+ zlock * lock = &node->lock;
+
+ return mode == ZNODE_READ_LOCK &&
+ lock->nr_readers >= 0 && lock->nr_hipri_write_requests > 0;
+}
+
+/* checks lock/request compatibility */
+static int can_lock_object(lock_stack * owner)
+{
+ znode *node = owner->request.node;
+
+ assert_spin_locked(&(node->lock.guard));
+
+ /* See if the node is disconnected. */
+ if (unlikely(ZF_ISSET(node, JNODE_IS_DYING)))
+ return RETERR(-EINVAL);
+
+	/* Do not ever try to take a lock if we are going in the low priority
+	   direction and the node has a high priority request without high
+	   priority owners. */
+ if (unlikely(!owner->curpri && check_deadlock_condition(node)))
+ return RETERR(-E_REPEAT);
+ if (unlikely(owner->curpri &&
+ check_livelock_condition(node, owner->request.mode)))
+ return RETERR(-E_REPEAT);
+ if (unlikely(!is_lock_compatible(node, owner->request.mode)))
+ return RETERR(-E_REPEAT);
+ return 0;
+}
+
+/* Set high priority for the process. This clears the "signaled" flags,
+   because a znode locked by a high-priority process can't satisfy our
+   "deadlock condition". */
+static void set_high_priority(lock_stack * owner)
+{
+ assert("nikita-1846", owner == get_current_lock_stack());
+ /* Do nothing if current priority is already high */
+ if (!owner->curpri) {
+ /* We don't need locking for owner->locks list, because, this
+ * function is only called with the lock stack of the current
+ * thread, and no other thread can play with owner->locks list
+ * and/or change ->node pointers of lock handles in this list.
+ *
+ * (Interrupts also are not involved.)
+ */
+ lock_handle *item = list_entry(owner->locks.next, lock_handle,
+ locks_link);
+ while (&owner->locks != &item->locks_link) {
+ znode *node = item->node;
+
+ spin_lock_zlock(&node->lock);
+
+ node->lock.nr_hipri_owners++;
+
+ /* we can safely set signaled to zero, because
+ previous statement (nr_hipri_owners ++) guarantees
+ that signaled will be never set again. */
+ item->signaled = 0;
+ spin_unlock_zlock(&node->lock);
+
+ item = list_entry(item->locks_link.next, lock_handle,
+ locks_link);
+ }
+ owner->curpri = 1;
+ atomic_set(&owner->nr_signaled, 0);
+ }
+}
+
+/* Sets low priority for the process. */
+static void set_low_priority(lock_stack * owner)
+{
+ assert("nikita-3075", owner == get_current_lock_stack());
+ /* Do nothing if current priority is already low */
+ if (owner->curpri) {
+ /* scan all locks (lock handles) held by @owner, which is
+ actually current thread, and check whether we are reaching
+ deadlock possibility anywhere.
+ */
+ lock_handle *handle = list_entry(owner->locks.next, lock_handle,
+ locks_link);
+ while (&owner->locks != &handle->locks_link) {
+ znode *node = handle->node;
+ spin_lock_zlock(&node->lock);
+ /* this thread just was hipri owner of @node, so
+ nr_hipri_owners has to be greater than zero. */
+ assert("nikita-1835", node->lock.nr_hipri_owners > 0);
+ node->lock.nr_hipri_owners--;
+ /* If we have deadlock condition, adjust a nr_signaled
+ field. It is enough to set "signaled" flag only for
+ current process, other low-pri owners will be
+ signaled and waken up after current process unlocks
+ this object and any high-priority requestor takes
+ control. */
+ if (check_deadlock_condition(node)
+ && !handle->signaled) {
+ handle->signaled = 1;
+ atomic_inc(&owner->nr_signaled);
+ }
+ spin_unlock_zlock(&node->lock);
+ handle = list_entry(handle->locks_link.next,
+ lock_handle, locks_link);
+ }
+ owner->curpri = 0;
+ }
+}
+
+static void remove_lock_request(lock_stack * requestor)
+{
+ zlock * lock = &requestor->request.node->lock;
+
+ if (requestor->curpri) {
+ assert("nikita-1838", lock->nr_hipri_requests > 0);
+ lock->nr_hipri_requests--;
+ if (requestor->request.mode == ZNODE_WRITE_LOCK)
+ lock->nr_hipri_write_requests--;
+ }
+ list_del(&requestor->requestors_link);
+}
+
+static void invalidate_all_lock_requests(znode * node)
+{
+ lock_stack *requestor, *tmp;
+
+ assert_spin_locked(&(node->lock.guard));
+
+ list_for_each_entry_safe(requestor, tmp, &node->lock.requestors,
+ requestors_link) {
+ remove_lock_request(requestor);
+ requestor->request.ret_code = -EINVAL;
+ reiser4_wake_up(requestor);
+ requestor->request.mode = ZNODE_NO_LOCK;
+ }
+}
+
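+/* Grant pending lock requests that can now be satisfied. Note that
+   can_lock_object() returns 0 on success, so the !can_lock_object() test
+   below means "this request can be granted right now". */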
+static void dispatch_lock_requests(znode * node)
+{
+ lock_stack *requestor, *tmp;
+
+ assert_spin_locked(&(node->lock.guard));
+
+ list_for_each_entry_safe(requestor, tmp, &node->lock.requestors,
+ requestors_link) {
+ if (znode_is_write_locked(node))
+ break;
+ if (!can_lock_object(requestor)) {
+ lock_object(requestor);
+ remove_lock_request(requestor);
+ requestor->request.ret_code = 0;
+ reiser4_wake_up(requestor);
+ requestor->request.mode = ZNODE_NO_LOCK;
+ }
+ }
+}
+
+/* release long-term lock, acquired by longterm_lock_znode() */
+void longterm_unlock_znode(lock_handle * handle)
+{
+ znode *node = handle->node;
+ lock_stack *oldowner = handle->owner;
+ int hipri;
+ int readers;
+ int rdelta;
+ int youdie;
+
+ /*
+ * this is time-critical and highly optimized code. Modify carefully.
+ */
+
+ assert("jmacd-1021", handle != NULL);
+ assert("jmacd-1022", handle->owner != NULL);
+ assert("nikita-1392", LOCK_CNT_GTZ(long_term_locked_znode));
+
+ assert("zam-130", oldowner == get_current_lock_stack());
+
+ LOCK_CNT_DEC(long_term_locked_znode);
+
+ /*
+ * to minimize amount of operations performed under lock, pre-compute
+ * all variables used within critical section. This makes code
+ * obscure.
+ */
+
+ /* was this lock of hi or lo priority */
+ hipri = oldowner->curpri ? 1 : 0;
+ /* number of readers */
+ readers = node->lock.nr_readers;
+ /* +1 if write lock, -1 if read lock */
+ rdelta = (readers > 0) ? -1 : +1;
+ /* true if node is to die and write lock is released */
+ youdie = ZF_ISSET(node, JNODE_HEARD_BANSHEE) && (readers < 0);
+
+ spin_lock_zlock(&node->lock);
+
+ assert("zam-101", znode_is_locked(node));
+
+ /* Adjust a number of high priority owners of this lock */
+ assert("nikita-1836", node->lock.nr_hipri_owners >= hipri);
+ node->lock.nr_hipri_owners -= hipri;
+
+ /* Handle znode deallocation on last write-lock release. */
+ if (znode_is_wlocked_once(node)) {
+ if (youdie) {
+ forget_znode(handle);
+ assert("nikita-2191", znode_invariant(node));
+ zput(node);
+ return;
+ }
+ }
+
+ if (handle->signaled)
+ atomic_dec(&oldowner->nr_signaled);
+
+ /* Unlocking means owner<->object link deletion */
+ unlink_object(handle);
+
+ /* This is enough to be sure whether an object is completely
+ unlocked. */
+ node->lock.nr_readers += rdelta;
+
+ /* If the node is locked it must have an owners list. Likewise, if
+ the node is unlocked it must have an empty owners list. */
+ assert("zam-319", equi(znode_is_locked(node),
+ !list_empty_careful(&node->lock.owners)));
+
+#if REISER4_DEBUG
+ if (!znode_is_locked(node))
+ ++node->times_locked;
+#endif
+
+ /* If there are pending lock requests we wake up a requestor */
+ if (!znode_is_wlocked(node))
+ dispatch_lock_requests(node);
+ if (check_deadlock_condition(node))
+ wake_up_all_lopri_owners(node);
+ spin_unlock_zlock(&node->lock);
+
+ /* minus one reference from handle->node */
+ assert("nikita-2190", znode_invariant(node));
+ ON_DEBUG(check_lock_data());
+ ON_DEBUG(check_lock_node_data(node));
+ zput(node);
+}
+
+/* final portion of longterm-lock */
+static int
+lock_tail(lock_stack * owner, int ok, znode_lock_mode mode)
+{
+ znode *node = owner->request.node;
+
+ assert_spin_locked(&(node->lock.guard));
+
+ /* If we broke with (ok == 0) it means we can_lock, now do it. */
+ if (ok == 0) {
+ lock_object(owner);
+ owner->request.mode = 0;
+ /* count a reference from lockhandle->node
+
+ znode was already referenced at the entry to this function,
+ hence taking spin-lock here is not necessary (see comment
+ in the zref()).
+ */
+ zref(node);
+
+ LOCK_CNT_INC(long_term_locked_znode);
+ }
+ spin_unlock_zlock(&node->lock);
+ ON_DEBUG(check_lock_data());
+ ON_DEBUG(check_lock_node_data(node));
+ return ok;
+}
+
+/*
+ * version of longterm_lock_znode() optimized for the most common case: read
+ * lock without any special flags. This is the kind of lock that any tree
+ * traversal takes on the root node of the tree, which is very frequent.
+ */
+static int longterm_lock_tryfast(lock_stack * owner)
+{
+ int result;
+ znode *node;
+ zlock *lock;
+
+ node = owner->request.node;
+ lock = &node->lock;
+
+ assert("nikita-3340", reiser4_schedulable());
+ assert("nikita-3341", request_is_deadlock_safe(node,
+ ZNODE_READ_LOCK,
+ ZNODE_LOCK_LOPRI));
+ spin_lock_zlock(lock);
+ result = can_lock_object(owner);
+ spin_unlock_zlock(lock);
+
+ if (likely(result != -EINVAL)) {
+ spin_lock_znode(node);
+ result = reiser4_try_capture(ZJNODE(node), ZNODE_READ_LOCK, 0);
+ spin_unlock_znode(node);
+ spin_lock_zlock(lock);
+ if (unlikely(result != 0)) {
+ owner->request.mode = 0;
+ } else {
+ result = can_lock_object(owner);
+ if (unlikely(result == -E_REPEAT)) {
+ /* fall back to longterm_lock_znode() */
+ spin_unlock_zlock(lock);
+ return 1;
+ }
+ }
+ return lock_tail(owner, result, ZNODE_READ_LOCK);
+ } else
+ return 1;
+}
+
+/* locks given lock object */
+int longterm_lock_znode(
+ /* local link object (allocated by lock owner
+ * thread, usually on its own stack) */
+ lock_handle * handle,
+ /* znode we want to lock. */
+ znode * node,
+ /* {ZNODE_READ_LOCK, ZNODE_WRITE_LOCK}; */
+ znode_lock_mode mode,
+ /* {0, -EINVAL, -E_DEADLOCK}, see return codes
+ description. */
+ znode_lock_request request) {
+ int ret;
+ int hipri = (request & ZNODE_LOCK_HIPRI) != 0;
+ int non_blocking = 0;
+ int has_atom;
+ txn_capture cap_flags;
+ zlock *lock;
+ txn_handle *txnh;
+ tree_level level;
+
+ /* Get current process context */
+ lock_stack *owner = get_current_lock_stack();
+
+ /* Check that the lock handle is initialized and isn't already being
+ * used. */
+ assert("jmacd-808", handle->owner == NULL);
+ assert("nikita-3026", reiser4_schedulable());
+ assert("nikita-3219", request_is_deadlock_safe(node, mode, request));
+ assert("zam-1056", atomic_read(&ZJNODE(node)->x_count) > 0);
+ /* long term locks are not allowed in the VM contexts (->writepage(),
+ * prune_{d,i}cache()).
+ *
+ * FIXME this doesn't work due to unused-dentry-with-unlinked-inode
+ * bug caused by d_splice_alias() only working for directories.
+ */
+ assert("nikita-3547", 1 || ((current->flags & PF_MEMALLOC) == 0));
+ assert("zam-1055", mode != ZNODE_NO_LOCK);
+
+ cap_flags = 0;
+ if (request & ZNODE_LOCK_NONBLOCK) {
+ cap_flags |= TXN_CAPTURE_NONBLOCKING;
+ non_blocking = 1;
+ }
+
+ if (request & ZNODE_LOCK_DONT_FUSE)
+ cap_flags |= TXN_CAPTURE_DONT_FUSE;
+
+ /* If we are changing our process priority we must adjust a number
+ of high priority owners for each znode that we already lock */
+ if (hipri) {
+ set_high_priority(owner);
+ } else {
+ set_low_priority(owner);
+ }
+
+ level = znode_get_level(node);
+
+ /* Fill request structure with our values. */
+ owner->request.mode = mode;
+ owner->request.handle = handle;
+ owner->request.node = node;
+
+ txnh = get_current_context()->trans;
+ lock = &node->lock;
+
+ if (mode == ZNODE_READ_LOCK && request == 0) {
+ ret = longterm_lock_tryfast(owner);
+ if (ret <= 0)
+ return ret;
+ }
+
+ has_atom = (txnh->atom != NULL);
+
+ /* Synchronize on node's zlock guard lock. */
+ spin_lock_zlock(lock);
+
+ if (znode_is_locked(node) &&
+ mode == ZNODE_WRITE_LOCK && recursive(owner))
+ return lock_tail(owner, 0, mode);
+
+ for (;;) {
+		/* Check the lock's availability: if it is unavailable we get
+		   E_REPEAT, 0 indicates "can_lock", otherwise the node is
+		   invalid. */
+ ret = can_lock_object(owner);
+
+ if (unlikely(ret == -EINVAL)) {
+ /* @node is dying. Leave it alone. */
+ break;
+ }
+
+ if (unlikely(ret == -E_REPEAT && non_blocking)) {
+ /* either locking of @node by the current thread will
+ * lead to the deadlock, or lock modes are
+ * incompatible. */
+ break;
+ }
+
+ assert("nikita-1844", (ret == 0)
+ || ((ret == -E_REPEAT) && !non_blocking));
+ /* If we can get the lock... Try to capture first before
+ taking the lock. */
+
+ /* first handle commonest case where node and txnh are already
+ * in the same atom. */
+ /* safe to do without taking locks, because:
+ *
+ * 1. read of aligned word is atomic with respect to writes to
+ * this word
+ *
+ * 2. false negatives are handled in reiser4_try_capture().
+ *
+ * 3. false positives are impossible.
+ *
+ * PROOF: left as an exercise to the curious reader.
+ *
+ * Just kidding. Here is one:
+ *
+ * At the time T0 txnh->atom is stored in txnh_atom.
+ *
+ * At the time T1 node->atom is stored in node_atom.
+ *
+ * At the time T2 we observe that
+ *
+ * txnh_atom != NULL && node_atom == txnh_atom.
+ *
+ * Imagine that at this moment we acquire node and txnh spin
+ * lock in this order. Suppose that under spin lock we have
+ *
+ * node->atom != txnh->atom, (S1)
+ *
+ * at the time T3.
+ *
+ * txnh->atom != NULL still, because txnh is open by the
+ * current thread.
+ *
+ * Suppose node->atom == NULL, that is, node was un-captured
+ * between T1, and T3. But un-capturing of formatted node is
+ * always preceded by the call to reiser4_invalidate_lock(),
+ * which marks znode as JNODE_IS_DYING under zlock spin
+ * lock. Contradiction, because can_lock_object() above checks
+ * for JNODE_IS_DYING. Hence, node->atom != NULL at T3.
+ *
+ * Suppose that node->atom != node_atom, that is, atom, node
+ * belongs to was fused into another atom: node_atom was fused
+ * into node->atom. Atom of txnh was equal to node_atom at T2,
+ * which means that under spin lock, txnh->atom == node->atom,
+ * because txnh->atom can only follow fusion
+ * chain. Contradicts S1.
+ *
+ * The same for hypothesis txnh->atom != txnh_atom. Hence,
+ * node->atom == node_atom == txnh_atom == txnh->atom. Again
+ * contradicts S1. Hence S1 is false. QED.
+ *
+ */
+
+ if (likely(has_atom && ZJNODE(node)->atom == txnh->atom)) {
+ ;
+ } else {
+ /*
+ * unlock zlock spin lock here. It is possible for
+ * longterm_unlock_znode() to sneak in here, but there
+ * is no harm: reiser4_invalidate_lock() will mark znode
+ * as JNODE_IS_DYING and this will be noted by
+ * can_lock_object() below.
+ */
+ spin_unlock_zlock(lock);
+ spin_lock_znode(node);
+ ret = reiser4_try_capture(ZJNODE(node), mode,
+ cap_flags);
+ spin_unlock_znode(node);
+ spin_lock_zlock(lock);
+ if (unlikely(ret != 0)) {
+ /* In the failure case, the txnmgr releases
+ the znode's lock (or in some cases, it was
+ released a while ago). There's no need to
+ reacquire it so we should return here,
+ avoid releasing the lock. */
+ owner->request.mode = 0;
+ break;
+ }
+
+ /* Check the lock's availability again -- this is
+ because under some circumstances the capture code
+ has to release and reacquire the znode spinlock. */
+ ret = can_lock_object(owner);
+ }
+
+ /* This time, a return of (ret == 0) means we can lock, so we
+ should break out of the loop. */
+ if (likely(ret != -E_REPEAT || non_blocking))
+ break;
+
+ /* Lock is unavailable, we have to wait. */
+ ret = reiser4_prepare_to_sleep(owner);
+ if (unlikely(ret != 0))
+ break;
+
+ assert_spin_locked(&(node->lock.guard));
+ if (hipri) {
+ /* If we are going in high priority direction then
+ increase high priority requests counter for the
+ node */
+ lock->nr_hipri_requests++;
+ if (mode == ZNODE_WRITE_LOCK)
+ lock->nr_hipri_write_requests++;
+ /* If there are no high priority owners for a node,
+ then immediately wake up low priority owners, so
+ they can detect possible deadlock */
+ if (lock->nr_hipri_owners == 0)
+ wake_up_all_lopri_owners(node);
+ }
+ list_add_tail(&owner->requestors_link, &lock->requestors);
+
+ /* Ok, here we have prepared a lock request, so unlock
+ a znode ... */
+ spin_unlock_zlock(lock);
+ /* ... and sleep */
+ reiser4_go_to_sleep(owner);
+ if (owner->request.mode == ZNODE_NO_LOCK)
+ goto request_is_done;
+ spin_lock_zlock(lock);
+ if (owner->request.mode == ZNODE_NO_LOCK) {
+ spin_unlock_zlock(lock);
+request_is_done:
+ if (owner->request.ret_code == 0) {
+ LOCK_CNT_INC(long_term_locked_znode);
+ zref(node);
+ }
+ return owner->request.ret_code;
+ }
+ remove_lock_request(owner);
+ }
+
+ return lock_tail(owner, ret, mode);
+}
+
+/* lock object invalidation means changing the lock object state to `INVALID'
+   and waiting for all other processes to cancel their lock requests. */
+void reiser4_invalidate_lock(lock_handle * handle /* path to lock
+ * owner and lock
+ * object is being
+ * invalidated. */ )
+{
+ znode *node = handle->node;
+ lock_stack *owner = handle->owner;
+
+ assert("zam-325", owner == get_current_lock_stack());
+ assert("zam-103", znode_is_write_locked(node));
+ assert("nikita-1393", !ZF_ISSET(node, JNODE_LEFT_CONNECTED));
+ assert("nikita-1793", !ZF_ISSET(node, JNODE_RIGHT_CONNECTED));
+ assert("nikita-1394", ZF_ISSET(node, JNODE_HEARD_BANSHEE));
+ assert("nikita-3097", znode_is_wlocked_once(node));
+ assert_spin_locked(&(node->lock.guard));
+
+ if (handle->signaled)
+ atomic_dec(&owner->nr_signaled);
+
+ ZF_SET(node, JNODE_IS_DYING);
+ unlink_object(handle);
+ node->lock.nr_readers = 0;
+
+ invalidate_all_lock_requests(node);
+ spin_unlock_zlock(&node->lock);
+}
+
+/* Initializes lock_stack. */
+void init_lock_stack(lock_stack * owner /* pointer to
+ * allocated
+ * structure. */ )
+{
+ INIT_LIST_HEAD(&owner->locks);
+ INIT_LIST_HEAD(&owner->requestors_link);
+ spin_lock_init(&owner->sguard);
+ owner->curpri = 1;
+ init_waitqueue_head(&owner->wait);
+}
+
+/* Initializes lock object. */
+void reiser4_init_lock(zlock * lock /* pointer on allocated
+ * uninitialized lock object
+ * structure. */ )
+{
+ memset(lock, 0, sizeof(zlock));
+ spin_lock_init(&lock->guard);
+ INIT_LIST_HEAD(&lock->requestors);
+ INIT_LIST_HEAD(&lock->owners);
+}
+
+/* Transfer a lock handle (presumably so that variables can be moved between
+ stack and heap locations). */
+static void
+move_lh_internal(lock_handle * new, lock_handle * old, int unlink_old)
+{
+ znode *node = old->node;
+ lock_stack *owner = old->owner;
+ int signaled;
+
+ /* locks_list, modified by link_object() is not protected by
+ anything. This is valid because only current thread ever modifies
+ locks_list of its lock_stack.
+ */
+ assert("nikita-1827", owner == get_current_lock_stack());
+ assert("nikita-1831", new->owner == NULL);
+
+ spin_lock_zlock(&node->lock);
+
+ signaled = old->signaled;
+ if (unlink_old) {
+ unlink_object(old);
+ } else {
+ if (node->lock.nr_readers > 0) {
+ node->lock.nr_readers += 1;
+ } else {
+ node->lock.nr_readers -= 1;
+ }
+ if (signaled)
+ atomic_inc(&owner->nr_signaled);
+ if (owner->curpri)
+ node->lock.nr_hipri_owners += 1;
+ LOCK_CNT_INC(long_term_locked_znode);
+
+ zref(node);
+ }
+ link_object(new, owner, node);
+ new->signaled = signaled;
+
+ spin_unlock_zlock(&node->lock);
+}
+
+void move_lh(lock_handle * new, lock_handle * old)
+{
+ move_lh_internal(new, old, /*unlink_old */ 1);
+}
+
+void copy_lh(lock_handle * new, lock_handle * old)
+{
+ move_lh_internal(new, old, /*unlink_old */ 0);
+}
+
+/* after getting -E_DEADLOCK we unlock znodes until this function returns false
+ */
+int reiser4_check_deadlock(void)
+{
+ lock_stack *owner = get_current_lock_stack();
+ return atomic_read(&owner->nr_signaled) != 0;
+}
+
+/* Before going to sleep we re-check "release lock" requests which might come
+ from threads with hi-pri lock priorities. */
+int reiser4_prepare_to_sleep(lock_stack * owner)
+{
+ assert("nikita-1847", owner == get_current_lock_stack());
+
+ /* We return -E_DEADLOCK if one or more "give me the lock" messages are
+ * counted in nr_signaled */
+ if (unlikely(atomic_read(&owner->nr_signaled) != 0)) {
+ assert("zam-959", !owner->curpri);
+ return RETERR(-E_DEADLOCK);
+ }
+ return 0;
+}
+
+/* Wakes up a single thread */
+void __reiser4_wake_up(lock_stack * owner)
+{
+ atomic_set(&owner->wakeup, 1);
+ wake_up(&owner->wait);
+}
+
+/* Puts a thread to sleep */
+void reiser4_go_to_sleep(lock_stack * owner)
+{
+ /* Well, we might sleep here, so holding of any spinlocks is no-no */
+ assert("nikita-3027", reiser4_schedulable());
+
+ wait_event(owner->wait, atomic_read(&owner->wakeup));
+ atomic_set(&owner->wakeup, 0);
+}
+
+int lock_stack_isclean(lock_stack * owner)
+{
+ if (list_empty_careful(&owner->locks)) {
+ assert("zam-353", atomic_read(&owner->nr_signaled) == 0);
+ return 1;
+ }
+
+ return 0;
+}
+
+#if REISER4_DEBUG
+
+/*
+ * debugging functions
+ */
+
+static void list_check(struct list_head *head)
+{
+ struct list_head *pos;
+
+ list_for_each(pos, head)
+ assert("", (pos->prev != NULL && pos->next != NULL &&
+ pos->prev->next == pos && pos->next->prev == pos));
+}
+
+/* check consistency of locking data-structures hanging off the @stack */
+static void check_lock_stack(lock_stack * stack)
+{
+ spin_lock_stack(stack);
+ /* check that stack->locks is not corrupted */
+ list_check(&stack->locks);
+ spin_unlock_stack(stack);
+}
+
+/* check consistency of locking data structures */
+void check_lock_data(void)
+{
+ check_lock_stack(&get_current_context()->stack);
+}
+
+/* check consistency of locking data structures for @node */
+void check_lock_node_data(znode * node)
+{
+ spin_lock_zlock(&node->lock);
+ list_check(&node->lock.owners);
+ list_check(&node->lock.requestors);
+ spin_unlock_zlock(&node->lock);
+}
+
+/* check that the given lock request is deadlock safe. This check is, of
+ * course, not exhaustive. */
+static int
+request_is_deadlock_safe(znode * node, znode_lock_mode mode,
+ znode_lock_request request)
+{
+ lock_stack *owner;
+
+ owner = get_current_lock_stack();
+ /*
+ * check that hipri lock request is not issued when there are locked
+ * nodes at the higher levels.
+ */
+ if (request & ZNODE_LOCK_HIPRI && !(request & ZNODE_LOCK_NONBLOCK) &&
+ znode_get_level(node) != 0) {
+ lock_handle *item;
+
+ list_for_each_entry(item, &owner->locks, locks_link) {
+ znode *other;
+
+ other = item->node;
+
+ if (znode_get_level(other) == 0)
+ continue;
+ if (znode_get_level(other) > znode_get_level(node))
+ return 0;
+ }
+ }
+ return 1;
+}
+
+#endif
+
+/* return pointer to static storage with name of lock_mode. For
+ debugging */
+const char *lock_mode_name(znode_lock_mode lock/* lock mode to get name of */)
+{
+ if (lock == ZNODE_READ_LOCK)
+ return "read";
+ else if (lock == ZNODE_WRITE_LOCK)
+ return "write";
+ else {
+ static char buf[30];
+
+ sprintf(buf, "unknown: %i", lock);
+ return buf;
+ }
+}
+
+/* Make Linus happy.
+ Local variables:
+ c-indentation-style: "K&R"
+ mode-name: "LC"
+ c-basic-offset: 8
+ tab-width: 8
+ fill-column: 79
+ End:
+*/
diff -urN --no-dereference linux-5.10.2.orig/fs/reiser4/lock.h linux-5.10.2/fs/reiser4/lock.h
--- linux-5.10.2.orig/fs/reiser4/lock.h 1970-01-01 01:00:00.000000000 +0100
+++ linux-5.10.2/fs/reiser4/lock.h 2020-12-23 16:07:46.119813143 +0100
@@ -0,0 +1,250 @@
+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
+ * reiser4/README */
+
+/* Long term locking data structures. See lock.c for details. */
+
+#ifndef __LOCK_H__
+#define __LOCK_H__
+
+#include "forward.h"
+#include "debug.h"
+#include "dformat.h"
+#include "key.h"
+#include "coord.h"
+#include "plugin/node/node.h"
+#include "txnmgr.h"
+#include "readahead.h"
+
+#include <linux/types.h>
+#include <linux/spinlock.h>
+#include <linux/pagemap.h> /* for PAGE_CACHE_SIZE */
+#include <asm/atomic.h>
+#include <linux/wait.h>
+
+/* Per-znode lock object */
+struct zlock {
+ spinlock_t guard;
+ /* The number of readers if positive; the number of recursively taken
+ write locks if negative. Protected by zlock spin lock. */
+ int nr_readers;
+ /* A number of processes (lock_stacks) that have this object
+ locked with high priority */
+ unsigned nr_hipri_owners;
+ /* A number of attempts to lock znode in high priority direction */
+ unsigned nr_hipri_requests;
+	/* A number of write lock requests with high priority */
+	unsigned nr_hipri_write_requests;
+	/* A linked list of lock_handle objects that contains pointers
+	   for all lock_stacks which have this lock object locked */
+	struct list_head owners;
+ /* A linked list of lock_stacks that wait for this lock */
+ struct list_head requestors;
+};
+
+static inline void spin_lock_zlock(zlock *lock)
+{
+	/* check that the current thread does not already hold a zlock spin lock */
+ assert("", LOCK_CNT_NIL(spin_locked_zlock));
+ /* check that spinlocks of lower priorities are not held */
+ assert("", LOCK_CNT_NIL(spin_locked_stack));
+
+ spin_lock(&lock->guard);
+
+ LOCK_CNT_INC(spin_locked_zlock);
+ LOCK_CNT_INC(spin_locked);
+}
+
+static inline void spin_unlock_zlock(zlock *lock)
+{
+ assert("nikita-1375", LOCK_CNT_GTZ(spin_locked_zlock));
+ assert("nikita-1376", LOCK_CNT_GTZ(spin_locked));
+
+ LOCK_CNT_DEC(spin_locked_zlock);
+ LOCK_CNT_DEC(spin_locked);
+
+ spin_unlock(&lock->guard);
+}
+
+#define lock_is_locked(lock) ((lock)->nr_readers != 0)
+#define lock_is_rlocked(lock) ((lock)->nr_readers > 0)
+#define lock_is_wlocked(lock) ((lock)->nr_readers < 0)
+#define lock_is_wlocked_once(lock) ((lock)->nr_readers == -1)
+#define lock_can_be_rlocked(lock) ((lock)->nr_readers >= 0)
+#define lock_mode_compatible(lock, mode) \
+ (((mode) == ZNODE_WRITE_LOCK && !lock_is_locked(lock)) || \
+ ((mode) == ZNODE_READ_LOCK && lock_can_be_rlocked(lock)))
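+
+/* Illustrative examples of the nr_readers encoding (not exhaustive):
+   nr_readers == 2  - two readers hold the znode; another ZNODE_READ_LOCK is
+                      compatible, a ZNODE_WRITE_LOCK is not;
+   nr_readers == -1 - the znode is write-locked once; no other mode is
+                      compatible;
+   nr_readers == 0  - the znode is unlocked; any mode is compatible. */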
+
+/* Since we have R/W znode locks we need additional bidirectional `link'
+ objects to implement n<->m relationship between lock owners and lock
+ objects. We call them `lock handles'.
+
+ Locking: see lock.c/"SHORT-TERM LOCKING"
+*/
+struct lock_handle {
+ /* This flag indicates that a signal to yield a lock was passed to
+	   lock owner and counted in owner->nr_signaled
+
+ Locking: this is accessed under spin lock on ->node.
+ */
+ int signaled;
+ /* A link to owner of a lock */
+ lock_stack *owner;
+ /* A link to znode locked */
+ znode *node;
+ /* A list of all locks for a process */
+ struct list_head locks_link;
+ /* A list of all owners for a znode */
+ struct list_head owners_link;
+};
+
+struct lock_request {
+ /* A pointer to uninitialized link object */
+ lock_handle *handle;
+ /* A pointer to the object we want to lock */
+ znode *node;
+ /* Lock mode (ZNODE_READ_LOCK or ZNODE_WRITE_LOCK) */
+ znode_lock_mode mode;
+ /* how dispatch_lock_requests() returns lock request result code */
+ int ret_code;
+};
+
+/* A lock stack structure for accumulating locks owned by a process */
+struct lock_stack {
+ /* A guard lock protecting a lock stack */
+ spinlock_t sguard;
+ /* number of znodes which were requested by high priority processes */
+ atomic_t nr_signaled;
+ /* Current priority of a process
+
+ This is only accessed by the current thread and thus requires no
+ locking.
+ */
+ int curpri;
+ /* A list of all locks owned by this process. Elements can be added to
+ * this list only by the current thread. ->node pointers in this list
+ * can be only changed by the current thread. */
+ struct list_head locks;
+ /* When lock_stack waits for the lock, it puts itself on double-linked
+ requestors list of that lock */
+ struct list_head requestors_link;
+ /* Current lock request info.
+
+ This is only accessed by the current thread and thus requires no
+ locking.
+ */
+ struct lock_request request;
+ /* the following two fields are the lock stack's
+ * synchronization object to use with the standard linux/wait.h
+ * interface. See reiser4_go_to_sleep and __reiser4_wake_up for
+ * usage details. */
+ wait_queue_head_t wait;
+ atomic_t wakeup;
+#if REISER4_DEBUG
+ int nr_locks; /* number of lock handles in the above list */
+#endif
+};
+
+/*
+ User-visible znode locking functions
+*/
+
+extern int longterm_lock_znode(lock_handle * handle,
+ znode * node,
+ znode_lock_mode mode,
+ znode_lock_request request);
+
+extern void longterm_unlock_znode(lock_handle * handle);
+
+extern int reiser4_check_deadlock(void);
+
+extern lock_stack *get_current_lock_stack(void);
+
+extern void init_lock_stack(lock_stack * owner);
+extern void reiser4_init_lock(zlock * lock);
+
+static inline void init_lh(lock_handle *lh)
+{
+#if REISER4_DEBUG
+ memset(lh, 0, sizeof *lh);
+ INIT_LIST_HEAD(&lh->locks_link);
+ INIT_LIST_HEAD(&lh->owners_link);
+#else
+ lh->node = NULL;
+#endif
+}
+
+static inline void done_lh(lock_handle *lh)
+{
+ assert("zam-342", lh != NULL);
+ if (lh->node != NULL)
+ longterm_unlock_znode(lh);
+}
+
+extern void move_lh(lock_handle * new, lock_handle * old);
+extern void copy_lh(lock_handle * new, lock_handle * old);
+
+extern int reiser4_prepare_to_sleep(lock_stack * owner);
+extern void reiser4_go_to_sleep(lock_stack * owner);
+extern void __reiser4_wake_up(lock_stack * owner);
+
+extern int lock_stack_isclean(lock_stack * owner);
+
+/* zlock object state check macros: only used in assertions. Both forms imply
+ that the lock is held by the current thread. */
+extern int znode_is_write_locked(const znode *);
+extern void reiser4_invalidate_lock(lock_handle *);
+
+/* lock ordering is: first take zlock spin lock, then lock stack spin lock */
+#define spin_ordering_pred_stack(stack) \
+ (LOCK_CNT_NIL(spin_locked_stack) && \
+ LOCK_CNT_NIL(spin_locked_txnmgr) && \
+ LOCK_CNT_NIL(spin_locked_inode) && \
+ LOCK_CNT_NIL(rw_locked_cbk_cache) && \
+ LOCK_CNT_NIL(spin_locked_super_eflush))
+
+static inline void spin_lock_stack(lock_stack *stack)
+{
+ assert("", spin_ordering_pred_stack(stack));
+ spin_lock(&(stack->sguard));
+ LOCK_CNT_INC(spin_locked_stack);
+ LOCK_CNT_INC(spin_locked);
+}
+
+static inline void spin_unlock_stack(lock_stack *stack)
+{
+ assert_spin_locked(&(stack->sguard));
+ assert("nikita-1375", LOCK_CNT_GTZ(spin_locked_stack));
+ assert("nikita-1376", LOCK_CNT_GTZ(spin_locked));
+ LOCK_CNT_DEC(spin_locked_stack);
+ LOCK_CNT_DEC(spin_locked);
+ spin_unlock(&(stack->sguard));
+}
+
+static inline void reiser4_wake_up(lock_stack * owner)
+{
+ spin_lock_stack(owner);
+ __reiser4_wake_up(owner);
+ spin_unlock_stack(owner);
+}
+
+const char *lock_mode_name(znode_lock_mode lock);
+
+#if REISER4_DEBUG
+extern void check_lock_data(void);
+extern void check_lock_node_data(znode * node);
+#else
+#define check_lock_data() noop
+#define check_lock_node_data() noop
+#endif
+
+/* __LOCK_H__ */
+#endif
+
+/* Make Linus happy.
+ Local variables:
+ c-indentation-style: "K&R"
+ mode-name: "LC"
+ c-basic-offset: 8
+ tab-width: 8
+ fill-column: 120
+ End:
+*/
diff -urN --no-dereference linux-5.10.2.orig/fs/reiser4/Makefile linux-5.10.2/fs/reiser4/Makefile
--- linux-5.10.2.orig/fs/reiser4/Makefile 1970-01-01 01:00:00.000000000 +0100
+++ linux-5.10.2/fs/reiser4/Makefile 2020-12-23 16:07:46.119813143 +0100
@@ -0,0 +1,115 @@
+#
+# reiser4/Makefile
+#
+
+obj-$(CONFIG_REISER4_FS) += reiser4.o
+
+reiser4-y := \
+ debug.o \
+ jnode.o \
+ znode.o \
+ key.o \
+ pool.o \
+ tree_mod.o \
+ estimate.o \
+ carry.o \
+ carry_ops.o \
+ lock.o \
+ tree.o \
+ context.o \
+ tap.o \
+ coord.o \
+ block_alloc.o \
+ txnmgr.o \
+ kassign.o \
+ flush.o \
+ wander.o \
+ eottl.o \
+ search.o \
+ page_cache.o \
+ seal.o \
+ dscale.o \
+ flush_queue.o \
+ ktxnmgrd.o \
+ blocknrset.o \
+ super.o \
+ init_volume.o \
+ super_ops.o \
+ volume_ops.o \
+ fsdata.o \
+ export_ops.o \
+ oid.o \
+ tree_walk.o \
+ inode.o \
+ vfs_ops.o \
+ as_ops.o \
+	   entd.o \
+ readahead.o \
+ status_flags.o \
+ init_super.o \
+ safe_link.o \
+ blocknrlist.o \
+ discard.o \
+ checksum.o \
+ \
+ plugin/plugin.o \
+ plugin/plugin_set.o \
+ plugin/node/node.o \
+ plugin/object.o \
+ plugin/cluster.o \
+ plugin/txmod.o \
+ plugin/inode_ops.o \
+ plugin/inode_ops_rename.o \
+ plugin/file_ops.o \
+ plugin/file_ops_readdir.o \
+ plugin/file_plugin_common.o \
+ plugin/file/file.o \
+ plugin/file/tail_conversion.o \
+ plugin/file/file_conversion.o \
+ plugin/file/symlink.o \
+ plugin/file/cryptcompress.o \
+ plugin/file/stripe.o \
+ plugin/dir_plugin_common.o \
+ plugin/dir/hashed_dir.o \
+ plugin/dir/seekable_dir.o \
+ plugin/node/node40.o \
+ plugin/node/node41.o \
+ \
+ plugin/crypto/cipher.o \
+ plugin/crypto/digest.o \
+ \
+ plugin/compress/compress.o \
+ plugin/compress/compress_mode.o \
+ \
+ plugin/volume/volume.o \
+ \
+ plugin/dst/hash.o \
+ plugin/dst/fsx32.o \
+ plugin/dst/dst.o \
+ \
+ plugin/item/static_stat.o \
+ plugin/item/sde.o \
+ plugin/item/cde.o \
+ plugin/item/blackbox.o \
+ plugin/item/brick_symbol.o \
+ plugin/item/internal.o \
+ plugin/item/tail.o \
+ plugin/item/ctail.o \
+ plugin/item/extent.o \
+ plugin/item/extent_item_ops.o \
+ plugin/item/extent_file_ops.o \
+ plugin/item/extent_stripe_ops.o \
+ plugin/item/extent_flush_ops.o \
+ plugin/item/extent_volume_ops.o \
+ \
+ plugin/hash.o \
+ plugin/fibration.o \
+ plugin/tail_policy.o \
+ plugin/item/item.o \
+ \
+ plugin/security/perm.o \
+ plugin/space/bitmap.o \
+ \
+ plugin/disk_format/disk_format40.o \
+ plugin/disk_format/disk_format.o
+
diff -urN --no-dereference linux-5.10.2.orig/fs/reiser4/oid.c linux-5.10.2/fs/reiser4/oid.c
--- linux-5.10.2.orig/fs/reiser4/oid.c 1970-01-01 01:00:00.000000000 +0100
+++ linux-5.10.2/fs/reiser4/oid.c 2020-12-23 16:07:46.119813143 +0100
@@ -0,0 +1,141 @@
+/* Copyright 2003 by Hans Reiser, licensing governed by reiser4/README */
+
+#include "debug.h"
+#include "super.h"
+#include "txnmgr.h"
+
+/* We used to have an oid allocation plugin. It was removed because it
+   was recognized as providing an unneeded level of abstraction. If anyone
+   ever finds it useful - look at yet_unneeded_abstractions/oid
+*/
+
+/*
+ * initialize in-memory data for oid allocator at @super. @nr_files and @next
+ * are provided by disk format plugin that reads them from the disk during
+ * mount.
+ */
+int oid_init_allocator(struct super_block *super, oid_t nr_files, oid_t next)
+{
+ reiser4_super_info_data *sbinfo;
+
+ sbinfo = get_super_private(super);
+
+ sbinfo->next_to_use = next;
+ sbinfo->oids_in_use = nr_files;
+ return 0;
+}
+
+/*
+ * allocate oid and return it. ABSOLUTE_MAX_OID is returned when allocator
+ * runs out of oids.
+ */
+oid_t oid_allocate(struct super_block *super)
+{
+ reiser4_super_info_data *sbinfo;
+ oid_t oid;
+
+ sbinfo = get_super_private(super);
+
+ spin_lock_reiser4_super(sbinfo);
+ if (sbinfo->next_to_use != ABSOLUTE_MAX_OID) {
+ oid = sbinfo->next_to_use++;
+ sbinfo->oids_in_use++;
+ } else
+ oid = ABSOLUTE_MAX_OID;
+ spin_unlock_reiser4_super(sbinfo);
+ return oid;
+}
+
+/*
+ * Tell oid allocator that @oid is now free.
+ */
+int oid_release(struct super_block *super, oid_t oid UNUSED_ARG)
+{
+ reiser4_super_info_data *sbinfo;
+
+ sbinfo = get_super_private(super);
+
+ spin_lock_reiser4_super(sbinfo);
+ sbinfo->oids_in_use--;
+ spin_unlock_reiser4_super(sbinfo);
+ return 0;
+}
+
+/*
+ * return next @oid that would be allocated (i.e., returned by oid_allocate())
+ * without actually allocating it. This is used by disk format plugin to save
+ * oid allocator state on the disk.
+ */
+oid_t oid_next(const struct super_block *super)
+{
+ reiser4_super_info_data *sbinfo;
+ oid_t oid;
+
+ sbinfo = get_super_private(super);
+
+ spin_lock_reiser4_super(sbinfo);
+ oid = sbinfo->next_to_use;
+ spin_unlock_reiser4_super(sbinfo);
+ return oid;
+}
+
+/*
+ * returns number of currently used oids. This is used by statfs(2) to report
+ * number of "inodes" and by disk format plugin to save oid allocator state on
+ * the disk.
+ */
+long oids_used(const struct super_block *super)
+{
+ reiser4_super_info_data *sbinfo;
+ oid_t used;
+
+ sbinfo = get_super_private(super);
+
+ spin_lock_reiser4_super(sbinfo);
+ used = sbinfo->oids_in_use;
+ spin_unlock_reiser4_super(sbinfo);
+ if (used < (__u64) ((long)~0) >> 1)
+ return (long)used;
+ else
+ return (long)-1;
+}
+
+/*
+ * Count oid as allocated in atom. This is done after call to oid_allocate()
+ * at the point when we are irrevocably committed to creation of the new file
+ * (i.e., when oid allocation cannot be any longer rolled back due to some
+ * error).
+ */
+void oid_count_allocated(void)
+{
+ txn_atom *atom;
+
+ atom = get_current_atom_locked();
+ atom->nr_objects_created++;
+ spin_unlock_atom(atom);
+}
+
+/*
+ * Count oid as free in atom. This is done after call to oid_release() at the
+ * point when we are irrevocably committed to the deletion of the file (i.e.,
+ * when oid release cannot be any longer rolled back due to some error).
+ */
+void oid_count_released(void)
+{
+ txn_atom *atom;
+
+ atom = get_current_atom_locked();
+ atom->nr_objects_deleted++;
+ spin_unlock_atom(atom);
+}
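+
+/* Typical sequence (illustrative): file creation calls oid_allocate() and,
+   once creation can no longer be rolled back, oid_count_allocated();
+   deletion uses oid_release() and oid_count_released() symmetrically. */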
+
+/*
+ Local variables:
+ c-indentation-style: "K&R"
+ mode-name: "LC"
+ c-basic-offset: 8
+ tab-width: 8
+ fill-column: 120
+ scroll-step: 1
+ End:
+*/
diff -urN --no-dereference linux-5.10.2.orig/fs/reiser4/page_cache.c linux-5.10.2/fs/reiser4/page_cache.c
--- linux-5.10.2.orig/fs/reiser4/page_cache.c 1970-01-01 01:00:00.000000000 +0100
+++ linux-5.10.2/fs/reiser4/page_cache.c 2020-12-23 16:07:46.120813158 +0100
@@ -0,0 +1,686 @@
+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
+ * reiser4/README */
+
+/* Memory pressure hooks. Fake inodes handling. */
+
+/* GLOSSARY
+
+ . Formatted and unformatted nodes.
+ Elements of reiser4 balanced tree to store data and metadata.
+ Unformatted nodes are pointed to by extent pointers. Such nodes
+     are used to store data of large objects. Unlike unformatted nodes,
+     formatted ones have an associated format described by a node4X plugin.
+
+ . Jnode (or journal node)
+ The in-memory header which is used to track formatted and unformatted
+ nodes, bitmap nodes, etc. In particular, jnodes are used to track
+   transactional information associated with each block (see reiser4/jnode.c
+ for details).
+
+ . Znode
+ The in-memory header which is used to track formatted nodes. Contains
+ embedded jnode (see reiser4/znode.c for details).
+*/
+
+/* We store all file system meta data (and data, of course) in the page cache.
+
+   What does this mean? Instead of using bread/brelse we create a special
+   "fake" inode (one per super block) and store the content of formatted
+   nodes in pages bound to this inode in the page cache. In newer kernels
+   bread() already uses an inode attached to the block device (bd_inode).
+   The advantage of having our own fake inode is that we can install
+   appropriate methods in its address_space operations. Such methods are
+   called by the VM on memory pressure
+ (or during background page flushing) and we can use them to react
+ appropriately.
+
+ In initial version we only support one block per page. Support for multiple
+ blocks per page is complicated by relocation.
+
+   To each page used by reiser4 a jnode is attached. A jnode is analogous to
+   a buffer head. The difference is that a jnode is bound to the page
+   permanently: a jnode cannot be removed from memory until its backing page
+   is.
+
+   A jnode contains a pointer to the page (->pg field) and the page contains
+   a pointer to the jnode in its ->private field. The pointer from jnode to
+   page is protected by the jnode's spinlock and the pointer from page to
+   jnode is protected by the page lock (PG_locked bit). Lock ordering is:
+   first take the page lock, then the jnode spin lock. To go in the reverse
+   direction use the jnode_lock_page() function, which uses the standard
+   try-lock-and-release technique.
+
+ Properties:
+
+ 1. when jnode-to-page mapping is established (by jnode_attach_page()), page
+ reference counter is increased.
+
+   2. when jnode-to-page mapping is destroyed (by page_clear_jnode()), page
+   reference counter is decreased.
+
+ 3. on jload() reference counter on jnode page is increased, page is
+ kmapped and `referenced'.
+
+ 4. on jrelse() inverse operations are performed.
+
+ 5. kmapping/kunmapping of unformatted pages is done by read/write methods.
+
+ DEADLOCKS RELATED TO MEMORY PRESSURE. [OUTDATED. Only interesting
+ historically.]
+
+ [In the following discussion, `lock' invariably means long term lock on
+ znode.] (What about page locks?)
+
+ There is some special class of deadlock possibilities related to memory
+ pressure. Locks acquired by other reiser4 threads are accounted for in
+   the deadlock prevention mechanism (lock.c), but when ->vm_writeback() is
+   invoked an additional hidden arc is added to the locking graph: the thread
+   that tries to allocate memory waits for ->vm_writeback() to finish. If this
+   thread holds a lock and ->vm_writeback() tries to acquire that lock,
+   deadlock prevention is useless.
+
+   Another related problem is the possibility for ->vm_writeback() to run out
+   of memory itself. This is not a problem for ext2 and friends, because their
+   ->vm_writeback() doesn't allocate much memory, but reiser4 flush is
+   definitely able to allocate huge amounts of memory.
+
+   It seems that there is no reliable way to cope with the problems above.
+   Instead it was decided that ->vm_writeback() (as invoked in the kswapd
+   context) wouldn't perform any flushing itself, but rather should just wake
+   up some auxiliary thread dedicated to this purpose (or the same thread
+   that does periodic commit of old atoms (ktxnmgrd.c)).
+
+ Details:
+
+ 1. Page is called `reclaimable' against particular reiser4 mount F if this
+ page can be ultimately released by try_to_free_pages() under presumptions
+ that:
+
+ a. ->vm_writeback() for F is no-op, and
+
+ b. none of the threads accessing F are making any progress, and
+
+ c. other reiser4 mounts obey the same memory reservation protocol as F
+ (described below).
+
+ For example, clean un-pinned page, or page occupied by ext2 data are
+ reclaimable against any reiser4 mount.
+
+ When there is more than one reiser4 mount in a system, condition (c) makes
+ reclaim-ability not easily verifiable beyond trivial cases mentioned above.
+
+ THIS COMMENT IS VALID FOR "MANY BLOCKS ON PAGE" CASE
+
+   The fake inode is used to bind formatted nodes and each node is indexed
+   within the fake inode by its block number. If the block size is smaller
+   than the page size, it may so happen that a block mapped to the page with
+   a formatted node is occupied by an unformatted node or is unallocated.
+   This leads to some complications, because flushing the whole page can lead
+   to an incorrect overwrite of an unformatted node which, moreover, can be
+   cached in some other place as part of the file body. To avoid this,
+   buffers for unformatted nodes are never marked dirty. Also pages in the
+   fake inode are never marked dirty. This rules out usage of ->writepage()
+   as a memory pressure hook. Instead ->releasepage() is used.
+
+   Josh is concerned that page->buffer is going to die. This should not pose
+   a significant problem though, because we need to add some data structures
+   to the page anyway (jnode) and all necessary bookkeeping can be put there.
+
+*/
+
+/* Life cycle of pages/nodes.
+
+ jnode contains reference to page and page contains reference back to
+ jnode. This reference is counted in page ->count. Thus, page bound to jnode
+ cannot be released back into free pool.
+
+ 1. Formatted nodes.
+
+ 1. formatted node is represented by znode. When new znode is created its
+ ->pg pointer is NULL initially.
+
+   2. when node content is loaded into the znode (by a call to zload()) for
+   the first time, the following happens (in a call to ->read_node() or
+   ->allocate_node()):
+
+ 1. new page is added to the page cache.
+
+ 2. this page is attached to znode and its ->count is increased.
+
+ 3. page is kmapped.
+
+ 3. if more calls to zload() follow (without corresponding zrelses), page
+ counter is left intact and in its stead ->d_count is increased in znode.
+
+ 4. each call to zrelse decreases ->d_count. When ->d_count drops to zero
+ ->release_node() is called and page is kunmapped as result.
+
+ 5. at some moment node can be captured by a transaction. Its ->x_count
+ is then increased by transaction manager.
+
+   6. if node is removed from the tree (empty node with JNODE_HEARD_BANSHEE
+   bit set) the following will happen (also see comment at the top of znode.c):
+
+   1. when the last lock is released, the node will be uncaptured from the
+   transaction. This releases the reference that the transaction manager
+   acquired at step 5.
+
+ 2. when last reference is released, zput() detects that node is
+ actually deleted and calls ->delete_node()
+ operation. page_cache_delete_node() implementation detaches jnode from
+ page and releases page.
+
+   7. otherwise (node wasn't removed from the tree), the last reference to
+   the znode will be released after the transaction manager has committed the
+   transaction the node was in. This implies squallocing of this node (see
+   flush.c). Nothing special happens at this point. The znode is still in the
+   hash table and the page is still attached to it.
+
+   8. the znode is actually removed from memory because of memory
+   pressure, or during umount (znodes_tree_done()). Either way, the znode is
+   removed by the call to zdrop(). At this moment, the page is detached from
+   the znode and removed from the inode address space.
+
+*/
+
+#include "debug.h"
+#include "dformat.h"
+#include "key.h"
+#include "txnmgr.h"
+#include "jnode.h"
+#include "znode.h"
+#include "block_alloc.h"
+#include "tree.h"
+#include "vfs_ops.h"
+#include "inode.h"
+#include "super.h"
+#include "entd.h"
+#include "page_cache.h"
+#include "ktxnmgrd.h"
+
+#include <linux/types.h>
+#include <linux/fs.h>
+#include <linux/mm.h> /* for struct page */
+#include <linux/swap.h> /* for struct page */
+#include <linux/pagemap.h>
+#include <linux/bio.h>
+#include <linux/writeback.h>
+#include <linux/blkdev.h>
+
+static struct bio *page_bio(struct page *, jnode * , int rw, gfp_t gfp);
+
+static struct address_space_operations formatted_fake_as_ops;
+
+static const oid_t fake_ino = 0x1;
+static const oid_t bitmap_ino = 0x2;
+static const oid_t cc_ino = 0x3;
+
+static void
+init_fake_inode(struct super_block *super, struct inode *fake,
+ struct inode **pfake)
+{
+ assert("nikita-2168", fake->i_state & I_NEW);
+ fake->i_mapping->a_ops = &formatted_fake_as_ops;
+ inode_attach_wb(fake, NULL);
+ *pfake = fake;
+ /* NOTE-NIKITA something else? */
+ unlock_new_inode(fake);
+}
+
+/**
+ * reiser4_init_formatted_fake - iget inodes for formatted nodes and bitmaps
+ * @super: super block to init fake inode for
+ *
+ * Initializes fake inode to which formatted nodes are bound in the page cache
+ * and inode for bitmaps.
+ */
+int reiser4_init_formatted_fake(struct super_block *super)
+{
+ struct inode *fake;
+ struct inode *bitmap;
+ struct inode *cc;
+ reiser4_super_info_data *sinfo;
+
+ assert("nikita-1703", super != NULL);
+
+ sinfo = get_super_private_nocheck(super);
+ fake = iget_locked(super, oid_to_ino(fake_ino));
+
+ if (fake != NULL) {
+ init_fake_inode(super, fake, &sinfo->fake);
+
+ bitmap = iget_locked(super, oid_to_ino(bitmap_ino));
+ if (bitmap != NULL) {
+ init_fake_inode(super, bitmap, &sinfo->bitmap);
+
+ cc = iget_locked(super, oid_to_ino(cc_ino));
+ if (cc != NULL) {
+ init_fake_inode(super, cc, &sinfo->cc);
+ return 0;
+ } else {
+ iput(sinfo->fake);
+ iput(sinfo->bitmap);
+ sinfo->fake = NULL;
+ sinfo->bitmap = NULL;
+ }
+ } else {
+ iput(sinfo->fake);
+ sinfo->fake = NULL;
+ }
+ }
+ return RETERR(-ENOMEM);
+}
+
+/**
+ * reiser4_done_formatted_fake - release inode used by formatted nodes and bitmaps
+ * @super: super block to release fake inodes of
+ *
+ * Releases inodes which were used as address spaces of bitmap and formatted
+ * nodes.
+ */
+void reiser4_done_formatted_fake(struct super_block *super)
+{
+ reiser4_super_info_data *sinfo;
+
+ sinfo = get_super_private_nocheck(super);
+
+ if (sinfo->fake != NULL) {
+ iput(sinfo->fake);
+ sinfo->fake = NULL;
+ }
+
+ if (sinfo->bitmap != NULL) {
+ iput(sinfo->bitmap);
+ sinfo->bitmap = NULL;
+ }
+
+ if (sinfo->cc != NULL) {
+ iput(sinfo->cc);
+ sinfo->cc = NULL;
+ }
+ return;
+}
+
+void reiser4_wait_page_writeback(struct page *page)
+{
+ assert("zam-783", PageLocked(page));
+
+ do {
+ unlock_page(page);
+ wait_on_page_writeback(page);
+ lock_page(page);
+ } while (PageWriteback(page));
+}
+
+/* completion handler for single page bio-based read.
+
+ mpage_end_io_read() would also do. But it's static.
+
+*/
+static void end_bio_single_page_read(struct bio *bio)
+{
+ struct page *page;
+
+ page = bio->bi_io_vec[0].bv_page;
+
+ if (!bio->bi_status)
+ SetPageUptodate(page);
+ else {
+ ClearPageUptodate(page);
+ SetPageError(page);
+ }
+ unlock_page(page);
+ bio_put(bio);
+}
+
+/* completion handler for single page bio-based write.
+
+ mpage_end_io_write() would also do. But it's static.
+
+*/
+static void end_bio_single_page_write(struct bio *bio)
+{
+ struct page *page;
+
+ page = bio->bi_io_vec[0].bv_page;
+
+ if (bio->bi_status)
+ SetPageError(page);
+ end_page_writeback(page);
+ bio_put(bio);
+}
+
+/* ->readpage() method for formatted nodes */
+static int formatted_readpage(struct file *f UNUSED_ARG,
+ struct page *page/* page to read */)
+{
+ assert("nikita-2412", PagePrivate(page) && jprivate(page));
+ return reiser4_page_io(page, jprivate(page), READ,
+ reiser4_ctx_gfp_mask_get());
+}
+
+/**
+ * reiser4_page_io - submit single-page bio request
+ * @page: page to perform io for
+ * @node: jnode of page
+ * @rw: read or write
+ * @gfp: gfp mask for bio allocation
+ *
+ * Submits single page read or write.
+ */
+int reiser4_page_io(struct page *page, jnode *node, int rw, gfp_t gfp)
+{
+ struct bio *bio;
+ int result;
+
+ assert("nikita-2094", page != NULL);
+ assert("nikita-2226", PageLocked(page));
+ assert("nikita-2634", node != NULL);
+ assert("nikita-2893", rw == READ || rw == WRITE);
+
+ if (rw) {
+ if (unlikely(IS_RDONLY(page->mapping->host))) {
+ unlock_page(page);
+ return 0;
+ }
+ }
+
+ bio = page_bio(page, node, rw, gfp);
+ if (!IS_ERR(bio)) {
+ if (rw == WRITE) {
+ set_page_writeback(page);
+ unlock_page(page);
+ }
+ bio_set_op_attrs(bio, rw, 0);
+ submit_bio(bio);
+ result = 0;
+ } else {
+ unlock_page(page);
+ result = PTR_ERR(bio);
+ }
+
+ return result;
+}
+
+/**
+ * Helper function to construct bio for page
+ */
+static struct bio *page_bio(struct page *page, jnode *node, int rw, gfp_t gfp)
+{
+ struct bio *bio;
+ assert("nikita-2092", page != NULL);
+ assert("nikita-2633", node != NULL);
+ /*
+ * Simple implementation in the assumption that blocksize == pagesize.
+ *
+ * We only have to submit one block, but submit_bh() will allocate bio
+ * anyway, so lets use all the bells-and-whistles of bio code.
+ */
+ bio = bio_alloc(gfp, 1);
+ if (bio != NULL) {
+ int blksz;
+ struct super_block *super;
+ reiser4_block_nr blocknr;
+
+ super = page->mapping->host->i_sb;
+ assert("nikita-2029", super != NULL);
+ blksz = super->s_blocksize;
+ assert("nikita-2028", blksz == (int)PAGE_SIZE);
+
+ spin_lock_jnode(node);
+ blocknr = *jnode_get_io_block(node);
+ spin_unlock_jnode(node);
+
+ assert("nikita-2275", blocknr != (reiser4_block_nr) 0);
+ assert("nikita-2276", !reiser4_blocknr_is_fake(&blocknr));
+
+ bio_set_dev(bio, jnode_get_subvol(node)->bdev);
+ /*
+ * fill bio->bi_iter.bi_sector before calling bio_add_page(),
+ * because q->merge_bvec_fn may want to inspect it (see
+		 * drivers/md/linear.c:linear_mergeable_bvec() for example).
+ */
+ bio->bi_iter.bi_sector = blocknr * (blksz >> 9);
+
+ if (!bio_add_page(bio, page, blksz, 0)) {
+ warning("nikita-3452",
+ "Single page bio cannot be constructed");
+ return ERR_PTR(RETERR(-EINVAL));
+ }
+ /*
+ * bio -> bi_idx is filled by bio_init()
+ */
+ bio->bi_end_io = (rw == READ) ?
+ end_bio_single_page_read : end_bio_single_page_write;
+ return bio;
+ } else
+ return ERR_PTR(RETERR(-ENOMEM));
+}
+
+#if 0
+static int can_hit_entd(reiser4_context *ctx, struct super_block *s)
+{
+ if (ctx == NULL || ((unsigned long)ctx->magic) != context_magic)
+ return 1;
+ if (ctx->super != s)
+ return 1;
+ if (get_super_private(s)->entd.tsk == current)
+ return 0;
+ if (!lock_stack_isclean(&ctx->stack))
+ return 0;
+ if (ctx->trans->atom != NULL)
+ return 0;
+ return 1;
+}
+#endif
+
+/**
+ * reiser4_writepage - writepage of struct address_space_operations
+ * @page: page to write
+ * @wbc: writeback control
+ *
+ * Common memory pressure notification.
+ */
+int reiser4_writepage(struct page *page,
+ struct writeback_control *wbc)
+{
+ /*
+ * assert("edward-1562",
+ * can_hit_entd(get_current_context_check(), sb));
+ */
+ assert("vs-828", PageLocked(page));
+
+ return write_page_by_ent(page, wbc);
+}
+
+/* ->set_page_dirty() method of formatted address_space */
+static int formatted_set_page_dirty(struct page *page)
+{
+ assert("nikita-2173", page != NULL);
+ BUG();
+ return __set_page_dirty_nobuffers(page);
+}
+
+/* The writepages method of address space operations in reiser4 is used to
+   pull pages dirtied via mmap into transactions. Only regular files can
+   have such pages. The fake inode is used to access formatted nodes via the
+   page cache. As formatted nodes can never be mmapped, the fake inode's
+   writepages has nothing to do */
+static int
+writepages_fake(struct address_space *mapping, struct writeback_control *wbc)
+{
+ return 0;
+}
+
+/* address space operations for the fake inode */
+static struct address_space_operations formatted_fake_as_ops = {
+ /* Perform a writeback of a single page as a memory-freeing
+ * operation. */
+ .writepage = reiser4_writepage,
+ /* this is called to read formatted node */
+ .readpage = formatted_readpage,
+ /* ->sync_page() method of fake inode address space operations. Called
+ from wait_on_page() and lock_page().
+
+ This is most annoyingly misnomered method. Actually it is called
+ from wait_on_page_bit() and lock_page() and its purpose is to
+ actually start io by jabbing device drivers.
+ .sync_page = block_sync_page,
+ */
+ /* Write back some dirty pages from this mapping. Called from sync.
+ called during sync (pdflush) */
+ .writepages = writepages_fake,
+ /* Set a page dirty */
+ .set_page_dirty = formatted_set_page_dirty,
+ /* used for read-ahead. Not applicable */
+ .readpages = NULL,
+ .write_begin = NULL,
+ .write_end = NULL,
+ .bmap = NULL,
+ /* called just before page is being detached from inode mapping and
+ removed from memory. Called on truncate, cut/squeeze, and
+ umount. */
+ .invalidatepage = reiser4_invalidatepage,
+	/* this is called by shrink_cache() so that the file system can try to
+	   release objects (jnodes, buffers, journal heads) attached to the page
+	   and, maybe, make the page itself freeable.
+	 */
+ .releasepage = reiser4_releasepage,
+ .direct_IO = NULL,
+ .migratepage = reiser4_migratepage,
+ .batch_lock_tabu = 1
+};
+
+/* called just before page is released (no longer used by reiser4). Callers:
+ jdelete() and extent2tail(). */
+void reiser4_drop_page(struct page *page)
+{
+ assert("nikita-2181", PageLocked(page));
+ clear_page_dirty_for_io(page);
+ ClearPageUptodate(page);
+#if defined(PG_skipped)
+ ClearPageSkipped(page);
+#endif
+ unlock_page(page);
+}
+
+#define JNODE_GANG_SIZE (16)
+
+/* find all jnodes from range specified and invalidate them */
+static int truncate_jnodes_range(struct inode *inode,
+ pgoff_t from, pgoff_t count)
+{
+ reiser4_inode *info;
+ int truncated_jnodes;
+ unsigned long index;
+ unsigned long end;
+
+ if (inode_file_plugin(inode) ==
+ file_plugin_by_id(CRYPTCOMPRESS_FILE_PLUGIN_ID))
+ /*
+ * No need to get rid of jnodes here: if the single jnode of
+ * page cluster did not have page, then it was found and killed
+ * before in
+ * truncate_complete_page_cluster()->jput()->jput_final(),
+ * otherwise it will be dropped by reiser4_invalidatepage()
+ */
+ return 0;
+ truncated_jnodes = 0;
+
+ info = reiser4_inode_data(inode);
+
+ index = from;
+ end = from + count;
+
+ while (1) {
+ jnode *gang[JNODE_GANG_SIZE];
+ int taken;
+ int i;
+ jnode *node;
+
+ assert("nikita-3466", index <= end);
+
+ read_lock_tree();
+ taken =
+ radix_tree_gang_lookup(jnode_tree_by_reiser4_inode(info),
+ (void **)gang, index,
+ JNODE_GANG_SIZE);
+ for (i = 0; i < taken; ++i) {
+ node = gang[i];
+ if (index_jnode(node) < end)
+ jref(node);
+ else
+ gang[i] = NULL;
+ }
+ read_unlock_tree();
+
+ for (i = 0; i < taken; ++i) {
+ node = gang[i];
+ if (node != NULL) {
+ index = max(index, index_jnode(node));
+ spin_lock_jnode(node);
+ assert("edward-1457", node->pg == NULL);
+			/* this is always called after
+			   truncate_inode_pages_range(). Therefore, the
+			   jnode can not have a page here. New pages can not
+			   be created because truncate_jnodes_range runs
+			   with exclusive access to the file held,
+			   whereas new page creation requires
+			   non-exclusive access */
+ JF_SET(node, JNODE_HEARD_BANSHEE);
+ reiser4_uncapture_jnode(node);
+ unhash_unformatted_jnode(node);
+ truncated_jnodes++;
+ jput(node);
+ } else
+ break;
+ }
+ if (i != taken || taken == 0)
+ break;
+ }
+ return truncated_jnodes;
+}
+
+/* Truncating files in reiser4: problems and solutions.
+
+   VFS calls the fs's truncate after it has called truncate_inode_pages()
+   to get rid of pages corresponding to the part of the file being truncated.
+   In reiser4 this may leave unallocated extents which do not have jnodes.
+   Flush code does not expect that. The solution to this problem is
+   straightforward. As vfs's truncate is implemented using the setattr
+   operation, it seems reasonable to have a ->setattr() that will cut the
+   file body. However, flush code also does not expect dirty pages without
+   parent items, so it is impossible to cut all items and then truncate all
+   pages in two steps. We resolve this problem by cutting items one-by-one.
+   Each such fine-grained step, performed under a longterm znode lock, calls
+   at the end the ->kill_hook() method of the killed item to remove its bound
+   pages and jnodes.
+
+   The following function is a common part of the mentioned kill hooks.
+   It is also called before tail-to-extent conversion (to avoid managing
+   several copies of the data).
+*/
+void reiser4_invalidate_pages(struct address_space *mapping, pgoff_t from,
+ unsigned long count, int even_cows)
+{
+ loff_t from_bytes, count_bytes;
+
+ if (count == 0)
+ return;
+ from_bytes = ((loff_t) from) << PAGE_SHIFT;
+ count_bytes = ((loff_t) count) << PAGE_SHIFT;
+
+ unmap_mapping_range(mapping, from_bytes, count_bytes, even_cows);
+ truncate_inode_pages_range(mapping, from_bytes,
+ from_bytes + count_bytes - 1);
+ truncate_jnodes_range(mapping->host, from, count);
+}
+
+/*
+ * Local variables:
+ * c-indentation-style: "K&R"
+ * mode-name: "LC"
+ * c-basic-offset: 8
+ * tab-width: 8
+ * fill-column: 120
+ * scroll-step: 1
+ * End:
+ */
diff -urN --no-dereference linux-5.10.2.orig/fs/reiser4/page_cache.h linux-5.10.2/fs/reiser4/page_cache.h
--- linux-5.10.2.orig/fs/reiser4/page_cache.h 1970-01-01 01:00:00.000000000 +0100
+++ linux-5.10.2/fs/reiser4/page_cache.h 2020-12-23 16:07:46.120813158 +0100
@@ -0,0 +1,60 @@
+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
+ * reiser4/README */
+/* Memory pressure hooks. Fake inodes handling. See page_cache.c. */
+
+#if !defined(__REISER4_PAGE_CACHE_H__)
+#define __REISER4_PAGE_CACHE_H__
+
+#include "forward.h"
+#include "context.h" /* for reiser4_ctx_gfp_mask_get() */
+
+#include <linux/fs.h> /* for struct super_block, address_space */
+#include <linux/mm.h> /* for struct page */
+#include <linux/pagemap.h> /* for lock_page() */
+#include <linux/vmalloc.h> /* for __vmalloc() */
+
+extern int reiser4_init_formatted_fake(struct super_block *);
+extern void reiser4_done_formatted_fake(struct super_block *);
+
+extern void reiser4_wait_page_writeback(struct page *);
+static inline void lock_and_wait_page_writeback(struct page *page)
+{
+ lock_page(page);
+ if (unlikely(PageWriteback(page)))
+ reiser4_wait_page_writeback(page);
+}
+
+#define jprivate(page) ((jnode *)page_private(page))
+
+extern int reiser4_page_io(struct page *, jnode *, int rw, gfp_t);
+extern void reiser4_drop_page(struct page *);
+extern void reiser4_invalidate_pages(struct address_space *, pgoff_t from,
+ unsigned long count, int even_cows);
+extern void capture_reiser4_inodes(struct super_block *,
+ struct writeback_control *);
+static inline void *reiser4_vmalloc(unsigned long size)
+{
+ return __vmalloc(size, reiser4_ctx_gfp_mask_get());
+}
+
+#define PAGECACHE_TAG_REISER4_MOVED PAGECACHE_TAG_DIRTY
+
+#if REISER4_DEBUG
+extern void print_page(const char *prefix, struct page *page);
+#else
+#define print_page(prf, p) noop
+#endif
+
+/* __REISER4_PAGE_CACHE_H__ */
+#endif
+
+/* Make Linus happy.
+ Local variables:
+ c-indentation-style: "K&R"
+ mode-name: "LC"
+ c-basic-offset: 8
+ tab-width: 8
+ fill-column: 120
+ scroll-step: 1
+ End:
+*/
diff -urN --no-dereference linux-5.10.2.orig/fs/reiser4/plugin/cluster.c linux-5.10.2/fs/reiser4/plugin/cluster.c
--- linux-5.10.2.orig/fs/reiser4/plugin/cluster.c 1970-01-01 01:00:00.000000000 +0100
+++ linux-5.10.2/fs/reiser4/plugin/cluster.c 2020-12-23 16:07:46.120813158 +0100
@@ -0,0 +1,72 @@
+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
+ * reiser4/README */
+
+/* Contains reiser4 cluster plugins (see
+ http://www.namesys.com/cryptcompress_design.html
+ "Concepts of clustering" for details). */
+
+#include "plugin_header.h"
+#include "plugin.h"
+#include "../inode.h"
+
+static int change_cluster(struct inode *inode,
+ reiser4_plugin * plugin,
+ pset_member memb)
+{
+ assert("edward-1324", inode != NULL);
+ assert("edward-1325", plugin != NULL);
+ assert("edward-1326", is_reiser4_inode(inode));
+ assert("edward-1327", plugin->h.type_id == REISER4_CLUSTER_PLUGIN_TYPE);
+
+ /* Can't change the cluster plugin for already existent regular files */
+ if (!plugin_of_group(inode_file_plugin(inode), REISER4_DIRECTORY_FILE))
+ return RETERR(-EINVAL);
+
+ /* If matches, nothing to change. */
+	if (inode_cluster_plugin(inode) != NULL &&
+	    inode_cluster_plugin(inode)->h.id == plugin->h.id)
+ return 0;
+
+ return aset_set_unsafe(&reiser4_inode_data(inode)->pset,
+ PSET_CLUSTER, plugin);
+}
+
+static reiser4_plugin_ops cluster_plugin_ops = {
+ .init = NULL,
+ .load = NULL,
+ .save_len = NULL,
+ .save = NULL,
+ .change = &change_cluster
+};
+
+#define SUPPORT_CLUSTER(SHIFT, ID, LABEL, DESC) \
+ [CLUSTER_ ## ID ## _ID] = { \
+ .h = { \
+ .type_id = REISER4_CLUSTER_PLUGIN_TYPE, \
+ .id = CLUSTER_ ## ID ## _ID, \
+ .pops = &cluster_plugin_ops, \
+ .label = LABEL, \
+ .desc = DESC, \
+ .linkage = {NULL, NULL} \
+ }, \
+ .shift = SHIFT \
+ }
+
+cluster_plugin cluster_plugins[LAST_CLUSTER_ID] = {
+ SUPPORT_CLUSTER(16, 64K, "64K", "Large"),
+ SUPPORT_CLUSTER(15, 32K, "32K", "Big"),
+ SUPPORT_CLUSTER(14, 16K, "16K", "Average"),
+ SUPPORT_CLUSTER(13, 8K, "8K", "Small"),
+ SUPPORT_CLUSTER(12, 4K, "4K", "Minimal")
+};
+
+/*
+ Local variables:
+ c-indentation-style: "K&R"
+ mode-name: "LC"
+ c-basic-offset: 8
+ tab-width: 8
+ fill-column: 120
+ scroll-step: 1
+ End:
+*/
diff -urN --no-dereference linux-5.10.2.orig/fs/reiser4/plugin/cluster.h linux-5.10.2/fs/reiser4/plugin/cluster.h
--- linux-5.10.2.orig/fs/reiser4/plugin/cluster.h 1970-01-01 01:00:00.000000000 +0100
+++ linux-5.10.2/fs/reiser4/plugin/cluster.h 2020-12-23 16:07:46.120813158 +0100
@@ -0,0 +1,410 @@
+/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
+
+/* This file contains size/offset translators, modulators
+ and other helper functions. */
+
+#if !defined(__FS_REISER4_CLUSTER_H__)
+#define __FS_REISER4_CLUSTER_H__
+
+#include "../inode.h"
+
+static inline int inode_cluster_shift(struct inode *inode)
+{
+ assert("edward-92", inode != NULL);
+ assert("edward-93", reiser4_inode_data(inode) != NULL);
+
+ return inode_cluster_plugin(inode)->shift;
+}
+
+static inline unsigned cluster_nrpages_shift(struct inode *inode)
+{
+ return inode_cluster_shift(inode) - PAGE_SHIFT;
+}
+
+/* cluster size in page units */
+static inline unsigned cluster_nrpages(struct inode *inode)
+{
+ return 1U << cluster_nrpages_shift(inode);
+}
+
+static inline size_t inode_cluster_size(struct inode *inode)
+{
+ assert("edward-96", inode != NULL);
+
+ return 1U << inode_cluster_shift(inode);
+}
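+
+/* Worked example (illustrative, assuming 4K pages, i.e. PAGE_SHIFT == 12):
+   with a cluster plugin shift of 16 the logical cluster is 64K, i.e.
+   inode_cluster_size() == 65536 and cluster_nrpages() == 16. */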
+
+static inline cloff_t pg_to_clust(pgoff_t idx, struct inode *inode)
+{
+ return idx >> cluster_nrpages_shift(inode);
+}
+
+static inline pgoff_t clust_to_pg(cloff_t idx, struct inode *inode)
+{
+ return idx << cluster_nrpages_shift(inode);
+}
+
+static inline pgoff_t pg_to_clust_to_pg(pgoff_t idx, struct inode *inode)
+{
+ return clust_to_pg(pg_to_clust(idx, inode), inode);
+}
+
+static inline pgoff_t off_to_pg(loff_t off)
+{
+ return (off >> PAGE_SHIFT);
+}
+
+static inline loff_t pg_to_off(pgoff_t idx)
+{
+ return ((loff_t) (idx) << PAGE_SHIFT);
+}
+
+static inline cloff_t off_to_clust(loff_t off, struct inode *inode)
+{
+ return off >> inode_cluster_shift(inode);
+}
+
+static inline loff_t clust_to_off(cloff_t idx, struct inode *inode)
+{
+ return (loff_t) idx << inode_cluster_shift(inode);
+}
+
+static inline loff_t off_to_clust_to_off(loff_t off, struct inode *inode)
+{
+ return clust_to_off(off_to_clust(off, inode), inode);
+}
+
+static inline pgoff_t off_to_clust_to_pg(loff_t off, struct inode *inode)
+{
+ return clust_to_pg(off_to_clust(off, inode), inode);
+}
+
+static inline unsigned off_to_pgoff(loff_t off)
+{
+ return off & (PAGE_SIZE - 1);
+}
+
+static inline unsigned off_to_cloff(loff_t off, struct inode *inode)
+{
+ return off & ((loff_t) (inode_cluster_size(inode)) - 1);
+}
+
+static inline pgoff_t offset_in_clust(struct page *page)
+{
+ assert("edward-1488", page != NULL);
+ assert("edward-1489", page->mapping != NULL);
+
+ return page_index(page) & ((cluster_nrpages(page->mapping->host)) - 1);
+}
+
+static inline int first_page_in_cluster(struct page *page)
+{
+ return offset_in_clust(page) == 0;
+}
+
+static inline int last_page_in_cluster(struct page *page)
+{
+ return offset_in_clust(page) ==
+ cluster_nrpages(page->mapping->host) - 1;
+}
+
+static inline unsigned
+pg_to_off_to_cloff(unsigned long idx, struct inode *inode)
+{
+ return off_to_cloff(pg_to_off(idx), inode);
+}
+
+/*********************** Size translators **************************/
+
+/* Translate linear size.
+ * New units are (1 << @blkbits) times larger than old ones.
+ * In other words, calculate the number of logical blocks occupied
+ * by @count elements
+ */
+static inline unsigned long size_in_blocks(loff_t count, unsigned blkbits)
+{
+ return (count + (1UL << blkbits) - 1) >> blkbits;
+}
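+
+/* For example (illustrative): size_in_blocks(5000, 12) == 2, since 5000
+   bytes need two 4096-byte blocks; size_in_blocks(4096, 12) == 1. */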
+
+/* size in pages */
+static inline pgoff_t size_in_pages(loff_t size)
+{
+ return size_in_blocks(size, PAGE_SHIFT);
+}
+
+/* size in logical clusters */
+static inline cloff_t size_in_lc(loff_t size, struct inode *inode)
+{
+ return size_in_blocks(size, inode_cluster_shift(inode));
+}
+
+/* size in pages to the size in page clusters */
+static inline cloff_t sp_to_spcl(pgoff_t size, struct inode *inode)
+{
+ return size_in_blocks(size, cluster_nrpages_shift(inode));
+}
+
+/*********************** Size modulators ***************************/
+
+/*
+   Modulate a linear size by the nominated block size and offset.
+
+   A "finite" function (which is zero almost everywhere):
+   the height of the figure at position @pos when trying to construct
+   a rectangle of height (1 << @blkbits) and area @size.
+
+ ******
+ *******
+ *******
+ *******
+ ----------> pos
+*/
+static inline unsigned __mbb(loff_t size, unsigned long pos, int blkbits)
+{
+ unsigned end = size >> blkbits;
+ if (pos < end)
+ return 1U << blkbits;
+ if (unlikely(pos > end))
+ return 0;
+ return size & ~(~0ull << blkbits);
+}
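+
+/* Worked example (illustrative): for __mbb(10000, pos, 12) the "figure" is
+   4096 at pos 0 and 1, 10000 % 4096 == 1808 at pos 2, and 0 for pos > 2. */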
+
+/* the same as above, but block size is page size */
+static inline unsigned __mbp(loff_t size, pgoff_t pos)
+{
+ return __mbb(size, pos, PAGE_SHIFT);
+}
+
+/* number of file's bytes in the nominated logical cluster */
+static inline unsigned lbytes(cloff_t index, struct inode *inode)
+{
+ return __mbb(i_size_read(inode), index, inode_cluster_shift(inode));
+}
+
+/* number of file's bytes in the nominated page */
+static inline unsigned pbytes(pgoff_t index, struct inode *inode)
+{
+ return __mbp(i_size_read(inode), index);
+}
+
+/**
+ * number of pages occupied by @win->count bytes starting from
+ * @win->off in the logical cluster defined by @win. This is exactly
+ * the number of pages to be modified and dirtied in any cluster operation.
+ */
+static inline pgoff_t win_count_to_nrpages(struct reiser4_slide * win)
+{
+ return ((win->off + win->count +
+ (1UL << PAGE_SHIFT) - 1) >> PAGE_SHIFT) -
+ off_to_pg(win->off);
+}
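+
+/* For example (illustrative, 4K pages): with win->off == 1000 and
+   win->count == 5000 the affected bytes span pages 0 and 1, so
+   win_count_to_nrpages() == 2. */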
+
+/* return true, if logical cluster is not occupied by the file */
+static inline int new_logical_cluster(struct cluster_handle *clust,
+ struct inode *inode)
+{
+ return clust_to_off(clust->index, inode) >= i_size_read(inode);
+}
+
+/* return true, if pages @p1 and @p2 are of the same page cluster */
+static inline int same_page_cluster(struct page *p1, struct page *p2)
+{
+ assert("edward-1490", p1 != NULL);
+ assert("edward-1491", p2 != NULL);
+ assert("edward-1492", p1->mapping != NULL);
+ assert("edward-1493", p2->mapping != NULL);
+
+ return (pg_to_clust(page_index(p1), p1->mapping->host) ==
+ pg_to_clust(page_index(p2), p2->mapping->host));
+}
+
+static inline int cluster_is_complete(struct cluster_handle *clust,
+ struct inode *inode)
+{
+ return clust->tc.lsize == inode_cluster_size(inode);
+}
+
+static inline void reiser4_slide_init(struct reiser4_slide *win)
+{
+ assert("edward-1084", win != NULL);
+ memset(win, 0, sizeof *win);
+}
+
+static inline tfm_action
+cluster_get_tfm_act(struct tfm_cluster *tc)
+{
+ assert("edward-1356", tc != NULL);
+ return tc->act;
+}
+
+static inline void
+cluster_set_tfm_act(struct tfm_cluster *tc, tfm_action act)
+{
+ assert("edward-1356", tc != NULL);
+ tc->act = act;
+}
+
+static inline void cluster_init_act(struct cluster_handle *clust,
+ tfm_action act,
+ struct reiser4_slide *window)
+{
+ assert("edward-84", clust != NULL);
+ memset(clust, 0, sizeof *clust);
+ cluster_set_tfm_act(&clust->tc, act);
+ clust->dstat = INVAL_DISK_CLUSTER;
+ clust->win = window;
+}
+
+static inline void cluster_init_read(struct cluster_handle *clust,
+ struct reiser4_slide *window)
+{
+ cluster_init_act(clust, TFMA_READ, window);
+}
+
+static inline void cluster_init_write(struct cluster_handle *clust,
+ struct reiser4_slide *window)
+{
+ cluster_init_act(clust, TFMA_WRITE, window);
+}
+
+/* true if @p1 and @p2 are items of the same disk cluster */
+static inline int same_disk_cluster(const coord_t *p1, const coord_t *p2)
+{
+ /* drop this if you have other items to aggregate */
+ assert("edward-1494", item_id_by_coord(p1) == CTAIL_ID);
+
+ return item_plugin_by_coord(p1)->b.mergeable(p1, p2);
+}
+
+static inline int dclust_get_extension_dsize(hint_t *hint)
+{
+ return hint->ext_coord.extension.ctail.dsize;
+}
+
+static inline void dclust_set_extension_dsize(hint_t *hint, int dsize)
+{
+ hint->ext_coord.extension.ctail.dsize = dsize;
+}
+
+static inline int dclust_get_extension_shift(hint_t *hint)
+{
+ return hint->ext_coord.extension.ctail.shift;
+}
+
+static inline int dclust_get_extension_ncount(hint_t *hint)
+{
+ return hint->ext_coord.extension.ctail.ncount;
+}
+
+static inline void dclust_inc_extension_ncount(hint_t *hint)
+{
+ hint->ext_coord.extension.ctail.ncount++;
+}
+
+static inline void dclust_init_extension(hint_t *hint)
+{
+ memset(&hint->ext_coord.extension.ctail, 0,
+ sizeof(hint->ext_coord.extension.ctail));
+}
+
+static inline int hint_is_unprepped_dclust(hint_t *hint)
+{
+ assert("edward-1451", hint_is_valid(hint));
+ return dclust_get_extension_shift(hint) == (int)UCTAIL_SHIFT;
+}
+
+static inline void coord_set_between_clusters(coord_t *coord)
+{
+#if REISER4_DEBUG
+ int result;
+ result = zload(coord->node);
+ assert("edward-1296", !result);
+#endif
+ if (!coord_is_between_items(coord)) {
+ coord->between = AFTER_ITEM;
+ coord->unit_pos = 0;
+ }
+#if REISER4_DEBUG
+ zrelse(coord->node);
+#endif
+}
+
+int reiser4_inflate_cluster(struct cluster_handle *, struct inode *);
+int find_disk_cluster(struct cluster_handle *, struct inode *, int read,
+ znode_lock_mode mode);
+int checkout_logical_cluster(struct cluster_handle *, jnode * , struct inode *);
+int reiser4_deflate_cluster(struct cluster_handle *, struct inode *);
+void truncate_complete_page_cluster(struct inode *inode, cloff_t start,
+ int even_cows);
+void invalidate_hint_cluster(struct cluster_handle *clust);
+int get_disk_cluster_locked(struct cluster_handle *clust, struct inode *inode,
+ znode_lock_mode lock_mode);
+void reset_cluster_params(struct cluster_handle *clust);
+int set_cluster_by_page(struct cluster_handle *clust, struct page *page,
+ int count);
+int prepare_page_cluster(struct inode *inode, struct cluster_handle *clust,
+ rw_op rw);
+void __put_page_cluster(int from, int count, struct page **pages,
+ struct inode *inode);
+void put_page_cluster(struct cluster_handle *clust,
+ struct inode *inode, rw_op rw);
+void put_cluster_handle(struct cluster_handle *clust);
+int grab_tfm_stream(struct inode *inode, struct tfm_cluster *tc,
+ tfm_stream_id id);
+int tfm_cluster_is_uptodate(struct tfm_cluster *tc);
+void tfm_cluster_set_uptodate(struct tfm_cluster *tc);
+void tfm_cluster_clr_uptodate(struct tfm_cluster *tc);
+
+/* move cluster handle to the target position
+ specified by the page of index @pgidx */
+static inline void move_cluster_forward(struct cluster_handle *clust,
+ struct inode *inode,
+ pgoff_t pgidx)
+{
+ assert("edward-1297", clust != NULL);
+ assert("edward-1298", inode != NULL);
+
+ reset_cluster_params(clust);
+ if (clust->index_valid &&
+ /* Hole in the indices. Hint became invalid and can not be
+ used by find_cluster_item() even if seal/node versions
+ will coincide */
+ pg_to_clust(pgidx, inode) != clust->index + 1) {
+ reiser4_unset_hint(clust->hint);
+ invalidate_hint_cluster(clust);
+ }
+ clust->index = pg_to_clust(pgidx, inode);
+ clust->index_valid = 1;
+}
+
+static inline int alloc_clust_pages(struct cluster_handle *clust,
+ struct inode *inode)
+{
+ assert("edward-791", clust != NULL);
+ assert("edward-792", inode != NULL);
+ clust->pages =
+ kmalloc(sizeof(*clust->pages) << inode_cluster_shift(inode),
+ reiser4_ctx_gfp_mask_get());
+ if (!clust->pages)
+ return -ENOMEM;
+ return 0;
+}
+
+static inline void free_clust_pages(struct cluster_handle *clust)
+{
+ kfree(clust->pages);
+}
+
+#endif /* __FS_REISER4_CLUSTER_H__ */
+
+/* Make Linus happy.
+ Local variables:
+ c-indentation-style: "K&R"
+ mode-name: "LC"
+ c-basic-offset: 8
+ tab-width: 8
+ fill-column: 120
+ scroll-step: 1
+ End:
+*/
diff -urN --no-dereference linux-5.10.2.orig/fs/reiser4/plugin/compress/compress.c linux-5.10.2/fs/reiser4/plugin/compress/compress.c
--- linux-5.10.2.orig/fs/reiser4/plugin/compress/compress.c 1970-01-01 01:00:00.000000000 +0100
+++ linux-5.10.2/fs/reiser4/plugin/compress/compress.c 2020-12-23 16:07:46.120813158 +0100
@@ -0,0 +1,521 @@
+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
+/* reiser4 compression transform plugins */
+
+#include "../../debug.h"
+#include "../../inode.h"
+#include "../plugin.h"
+
+#include <linux/lzo.h>
+#include <linux/zstd.h>
+#include <linux/zlib.h>
+#include <linux/types.h>
+#include <linux/hardirq.h>
+
+static int change_compression(struct inode *inode,
+ reiser4_plugin * plugin,
+ pset_member memb)
+{
+ assert("edward-1316", inode != NULL);
+ assert("edward-1317", plugin != NULL);
+ assert("edward-1318", is_reiser4_inode(inode));
+ assert("edward-1319",
+ plugin->h.type_id == REISER4_COMPRESSION_PLUGIN_TYPE);
+
+ /* cannot change compression plugin of already existing regular object */
+ if (!plugin_of_group(inode_file_plugin(inode), REISER4_DIRECTORY_FILE))
+ return RETERR(-EINVAL);
+
+ /* If matches, nothing to change. */
+	if (inode_compression_plugin(inode) != NULL &&
+	    inode_compression_plugin(inode)->h.id == plugin->h.id)
+ return 0;
+
+ return aset_set_unsafe(&reiser4_inode_data(inode)->pset,
+ PSET_COMPRESSION, plugin);
+}
+
+static reiser4_plugin_ops compression_plugin_ops = {
+ .init = NULL,
+ .load = NULL,
+ .save_len = NULL,
+ .save = NULL,
+ .change = &change_compression
+};
+
+/******************************************************************************/
+/* gzip1 compression */
+/******************************************************************************/
+
+#define GZIP1_DEF_LEVEL Z_BEST_SPEED
+#define GZIP1_DEF_WINBITS 15
+#define GZIP1_DEF_MEMLEVEL MAX_MEM_LEVEL
+
+static int gzip1_init(void)
+{
+ return 0;
+}
+
+static int gzip1_overrun(unsigned src_len UNUSED_ARG)
+{
+ return 0;
+}
+
+static coa_t gzip1_alloc(tfm_action act)
+{
+ coa_t coa = NULL;
+ int ret = 0;
+ switch (act) {
+ case TFMA_WRITE: /* compress */
+ coa = reiser4_vmalloc(zlib_deflate_workspacesize(MAX_WBITS,
+ MAX_MEM_LEVEL));
+ if (!coa) {
+ ret = -ENOMEM;
+ break;
+ }
+ break;
+ case TFMA_READ: /* decompress */
+ coa = reiser4_vmalloc(zlib_inflate_workspacesize());
+ if (!coa) {
+ ret = -ENOMEM;
+ break;
+ }
+ break;
+ default:
+ impossible("edward-767", "unknown tfm action");
+ }
+ if (ret)
+ return ERR_PTR(ret);
+ return coa;
+}
+
+static void gzip1_free(coa_t coa, tfm_action act)
+{
+ assert("edward-769", coa != NULL);
+
+ switch (act) {
+ case TFMA_WRITE: /* compress */
+ vfree(coa);
+ break;
+ case TFMA_READ: /* decompress */
+ vfree(coa);
+ break;
+ default:
+ impossible("edward-770", "unknown tfm action");
+ }
+ return;
+}
+
+static int gzip1_min_size_deflate(void)
+{
+ return 64;
+}
+
+static void
+gzip1_compress(coa_t coa, __u8 * src_first, size_t src_len,
+ __u8 * dst_first, size_t *dst_len)
+{
+ int ret = 0;
+ struct z_stream_s stream;
+
+ assert("edward-842", coa != NULL);
+ assert("edward-875", src_len != 0);
+
+ stream.workspace = coa;
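+	/* negative windowBits below selects raw deflate (no zlib header) */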
+ ret = zlib_deflateInit2(&stream, GZIP1_DEF_LEVEL, Z_DEFLATED,
+ -GZIP1_DEF_WINBITS, GZIP1_DEF_MEMLEVEL,
+ Z_DEFAULT_STRATEGY);
+ if (ret != Z_OK) {
+ warning("edward-771", "zlib_deflateInit2 returned %d\n", ret);
+ goto rollback;
+ }
+ ret = zlib_deflateReset(&stream);
+ if (ret != Z_OK) {
+ warning("edward-772", "zlib_deflateReset returned %d\n", ret);
+ goto rollback;
+ }
+ stream.next_in = src_first;
+ stream.avail_in = src_len;
+ stream.next_out = dst_first;
+ stream.avail_out = *dst_len;
+
+ ret = zlib_deflate(&stream, Z_FINISH);
+ if (ret != Z_STREAM_END) {
+ if (ret != Z_OK)
+ warning("edward-773",
+ "zlib_deflate returned %d\n", ret);
+ goto rollback;
+ }
+ *dst_len = stream.total_out;
+ return;
+ rollback:
+ *dst_len = src_len;
+ return;
+}
+
+static void
+gzip1_decompress(coa_t coa, __u8 * src_first, size_t src_len,
+ __u8 * dst_first, size_t *dst_len)
+{
+ int ret = 0;
+ struct z_stream_s stream;
+
+ assert("edward-843", coa != NULL);
+ assert("edward-876", src_len != 0);
+
+ stream.workspace = coa;
+ ret = zlib_inflateInit2(&stream, -GZIP1_DEF_WINBITS);
+ if (ret != Z_OK) {
+ warning("edward-774", "zlib_inflateInit2 returned %d\n", ret);
+ return;
+ }
+ ret = zlib_inflateReset(&stream);
+ if (ret != Z_OK) {
+ warning("edward-775", "zlib_inflateReset returned %d\n", ret);
+ return;
+ }
+
+ stream.next_in = src_first;
+ stream.avail_in = src_len;
+ stream.next_out = dst_first;
+ stream.avail_out = *dst_len;
+
+ ret = zlib_inflate(&stream, Z_SYNC_FLUSH);
+ /*
+ * Work around a bug in zlib, which sometimes wants to taste an extra
+ * byte when being used in the (undocumented) raw deflate mode.
+ * (From USAGI).
+ */
+ if (ret == Z_OK && !stream.avail_in && stream.avail_out) {
+ u8 zerostuff = 0;
+ stream.next_in = &zerostuff;
+ stream.avail_in = 1;
+ ret = zlib_inflate(&stream, Z_FINISH);
+ }
+ if (ret != Z_STREAM_END) {
+ warning("edward-776", "zlib_inflate returned %d\n", ret);
+ return;
+ }
+ *dst_len = stream.total_out;
+ return;
+}
+
+/******************************************************************************/
+/* lzo1 compression */
+/******************************************************************************/
+
+static int lzo1_init(void)
+{
+ return 0;
+}
+
+static int lzo1_overrun(unsigned in_len)
+{
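+	/* worst-case output expansion bound for lzo1x
+	 * (from the LZO documentation) */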
+ return in_len / 16 + 64 + 3;
+}
+
+static coa_t lzo1_alloc(tfm_action act)
+{
+ int ret = 0;
+ coa_t coa = NULL;
+
+ switch (act) {
+ case TFMA_WRITE: /* compress */
+ coa = reiser4_vmalloc(LZO1X_1_MEM_COMPRESS);
+ if (!coa) {
+ ret = -ENOMEM;
+ break;
+ }
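+		/* fall through: TFMA_READ needs no workspace */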
+ case TFMA_READ: /* decompress */
+ break;
+ default:
+ impossible("edward-877", "unknown tfm action");
+ }
+ if (ret)
+ return ERR_PTR(ret);
+ return coa;
+}
+
+static void lzo1_free(coa_t coa, tfm_action act)
+{
+ assert("edward-879", coa != NULL);
+
+ switch (act) {
+ case TFMA_WRITE: /* compress */
+ vfree(coa);
+ break;
+ case TFMA_READ: /* decompress */
+ impossible("edward-1304",
+ "trying to free non-allocated workspace");
+		break;
+	default:
+ impossible("edward-880", "unknown tfm action");
+ }
+ return;
+}
+
+static int lzo1_min_size_deflate(void)
+{
+ return 256;
+}
+
+static void
+lzo1_compress(coa_t coa, __u8 * src_first, size_t src_len,
+ __u8 * dst_first, size_t *dst_len)
+{
+ int result;
+
+ assert("edward-846", coa != NULL);
+ assert("edward-847", src_len != 0);
+
+ result = lzo1x_1_compress(src_first, src_len, dst_first, dst_len, coa);
+ if (unlikely(result != LZO_E_OK)) {
+ warning("edward-849", "lzo1x_1_compress failed\n");
+ goto out;
+ }
+ if (*dst_len >= src_len) {
+ //warning("edward-850", "lzo1x_1_compress: incompressible data\n");
+ goto out;
+ }
+ return;
+ out:
+ *dst_len = src_len;
+ return;
+}
+
+static void
+lzo1_decompress(coa_t coa, __u8 * src_first, size_t src_len,
+ __u8 * dst_first, size_t *dst_len)
+{
+ int result;
+
+ assert("edward-851", coa == NULL);
+ assert("edward-852", src_len != 0);
+
+ result = lzo1x_decompress_safe(src_first, src_len, dst_first, dst_len);
+ if (result != LZO_E_OK)
+ warning("edward-853", "lzo1x_1_decompress failed\n");
+ return;
+}
+
+/******************************************************************************/
+/* zstd1 compression */
+/******************************************************************************/
+
+typedef struct {
+ ZSTD_parameters params;
+ void* workspace;
+ ZSTD_CCtx* cctx;
+} zstd1_coa_c;
+typedef struct {
+ void* workspace;
+ ZSTD_DCtx* dctx;
+} zstd1_coa_d;
+
+static int zstd1_init(void)
+{
+ return 0;
+}
+
+static int zstd1_overrun(unsigned src_len)
+{
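+	/* extra output space needed beyond src_len in the worst case */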
+ return ZSTD_compressBound(src_len) - src_len;
+}
+
+static coa_t zstd1_alloc(tfm_action act)
+{
+ int ret = 0;
+ size_t workspace_size;
+ coa_t coa = NULL;
+
+ switch (act) {
+ case TFMA_WRITE: /* compress */
+ coa = reiser4_vmalloc(sizeof(zstd1_coa_c));
+ if (!coa) {
+ ret = -ENOMEM;
+ break;
+ }
+		/* The ZSTD benchmark uses level 1 as the default; max is 22. */
+ ((zstd1_coa_c*)coa)->params = ZSTD_getParams(1, 0, 0);
+ workspace_size = ZSTD_CCtxWorkspaceBound(((zstd1_coa_c*)coa)->params.cParams);
+ ((zstd1_coa_c*)coa)->workspace = reiser4_vmalloc(workspace_size);
+ if (!(((zstd1_coa_c*)coa)->workspace)) {
+ ret = -ENOMEM;
+ vfree(coa);
+ break;
+ }
+ ((zstd1_coa_c*)coa)->cctx = ZSTD_initCCtx(((zstd1_coa_c*)coa)->workspace, workspace_size);
+ if (!(((zstd1_coa_c*)coa)->cctx)) {
+ ret = -ENOMEM;
+ vfree(((zstd1_coa_c*)coa)->workspace);
+ vfree(coa);
+ break;
+ }
+ break;
+ case TFMA_READ: /* decompress */
+ coa = reiser4_vmalloc(sizeof(zstd1_coa_d));
+ if (!coa) {
+ ret = -ENOMEM;
+ break;
+ }
+ workspace_size = ZSTD_DCtxWorkspaceBound();
+ ((zstd1_coa_d*)coa)->workspace = reiser4_vmalloc(workspace_size);
+ if (!(((zstd1_coa_d*)coa)->workspace)) {
+ ret = -ENOMEM;
+ vfree(coa);
+ break;
+ }
+ ((zstd1_coa_d*)coa)->dctx = ZSTD_initDCtx(((zstd1_coa_d*)coa)->workspace, workspace_size);
+ if (!(((zstd1_coa_d*)coa)->dctx)) {
+ ret = -ENOMEM;
+ vfree(((zstd1_coa_d*)coa)->workspace);
+ vfree(coa);
+ break;
+ }
+ break;
+ default:
+ impossible("bsinot-1",
+ "trying to alloc workspace for unknown tfm action");
+ }
+ if (ret) {
+ warning("bsinot-2",
+ "alloc workspace for zstd (tfm action = %d) failed\n",
+ act);
+ return ERR_PTR(ret);
+ }
+ return coa;
+}
+
+static void zstd1_free(coa_t coa, tfm_action act)
+{
+ assert("bsinot-3", coa != NULL);
+
+ switch (act) {
+ case TFMA_WRITE: /* compress */
+ vfree(((zstd1_coa_c*)coa)->workspace);
+ vfree(coa);
+ //printk(KERN_WARNING "free comp memory -- %p\n", coa);
+ break;
+ case TFMA_READ: /* decompress */
+ vfree(((zstd1_coa_d*)coa)->workspace);
+ vfree(coa);
+ //printk(KERN_WARNING "free decomp memory -- %p\n", coa);
+ break;
+ default:
+ impossible("bsinot-4", "unknown tfm action");
+ }
+ return;
+}
+
+static int zstd1_min_size_deflate(void)
+{
+	return 256; /* the correct value is unclear; borrowed from LZO1 */
+}
+
+static void
+zstd1_compress(coa_t coa, __u8 * src_first, size_t src_len,
+ __u8 * dst_first, size_t *dst_len)
+{
+ unsigned int result;
+
+ assert("bsinot-5", coa != NULL);
+ assert("bsinot-6", src_len != 0);
+	result = ZSTD_compressCCtx(((zstd1_coa_c*)coa)->cctx,
+				   dst_first, *dst_len,
+				   src_first, src_len,
+				   ((zstd1_coa_c*)coa)->params);
+ if (ZSTD_isError(result)) {
+ warning("bsinot-7", "zstd1_compressCCtx failed\n");
+ goto out;
+ }
+ *dst_len = result;
+ if (*dst_len >= src_len) {
+ //warning("bsinot-8", "zstd1_compressCCtx: incompressible data\n");
+ goto out;
+ }
+ return;
+ out:
+ *dst_len = src_len;
+ return;
+}
+
+static void
+zstd1_decompress(coa_t coa, __u8 * src_first, size_t src_len,
+ __u8 * dst_first, size_t *dst_len)
+{
+ unsigned int result;
+
+ assert("bsinot-9", coa != NULL);
+ assert("bsinot-10", src_len != 0);
+
+	result = ZSTD_decompressDCtx(((zstd1_coa_d*)coa)->dctx,
+				     dst_first, *dst_len,
+				     src_first, src_len);
+	/* same error handling as in zstd1_compress() */
+ if (ZSTD_isError(result))
+ warning("bsinot-11", "zstd1_decompressDCtx failed\n");
+ *dst_len = result;
+ return;
+}
+
+compression_plugin compression_plugins[LAST_COMPRESSION_ID] = {
+ [LZO1_COMPRESSION_ID] = {
+ .h = {
+ .type_id = REISER4_COMPRESSION_PLUGIN_TYPE,
+ .id = LZO1_COMPRESSION_ID,
+ .pops = &compression_plugin_ops,
+ .label = "lzo1",
+ .desc = "lzo1 compression transform",
+ .linkage = {NULL, NULL}
+ },
+ .init = lzo1_init,
+ .overrun = lzo1_overrun,
+ .alloc = lzo1_alloc,
+ .free = lzo1_free,
+ .min_size_deflate = lzo1_min_size_deflate,
+ .checksum = reiser4_adler32,
+ .compress = lzo1_compress,
+ .decompress = lzo1_decompress
+ },
+ [GZIP1_COMPRESSION_ID] = {
+ .h = {
+ .type_id = REISER4_COMPRESSION_PLUGIN_TYPE,
+ .id = GZIP1_COMPRESSION_ID,
+ .pops = &compression_plugin_ops,
+ .label = "gzip1",
+ .desc = "gzip1 compression transform",
+ .linkage = {NULL, NULL}
+ },
+ .init = gzip1_init,
+ .overrun = gzip1_overrun,
+ .alloc = gzip1_alloc,
+ .free = gzip1_free,
+ .min_size_deflate = gzip1_min_size_deflate,
+ .checksum = reiser4_adler32,
+ .compress = gzip1_compress,
+ .decompress = gzip1_decompress
+ },
+ [ZSTD1_COMPRESSION_ID] = {
+ .h = {
+ .type_id = REISER4_COMPRESSION_PLUGIN_TYPE,
+ .id = ZSTD1_COMPRESSION_ID,
+ .pops = &compression_plugin_ops,
+ .label = "zstd1",
+ .desc = "zstd1 compression transform",
+ .linkage = {NULL, NULL}
+ },
+ .init = zstd1_init,
+ .overrun = zstd1_overrun,
+ .alloc = zstd1_alloc,
+ .free = zstd1_free,
+ .min_size_deflate = zstd1_min_size_deflate,
+ .checksum = reiser4_adler32,
+ .compress = zstd1_compress,
+ .decompress = zstd1_decompress
+ }
+};
+
+/*
+ Local variables:
+ c-indentation-style: "K&R"
+ mode-name: "LC"
+ c-basic-offset: 8
+ tab-width: 8
+ fill-column: 120
+ scroll-step: 1
+ End:
+*/
diff -urN --no-dereference linux-5.10.2.orig/fs/reiser4/plugin/compress/compress.h linux-5.10.2/fs/reiser4/plugin/compress/compress.h
--- linux-5.10.2.orig/fs/reiser4/plugin/compress/compress.h 1970-01-01 01:00:00.000000000 +0100
+++ linux-5.10.2/fs/reiser4/plugin/compress/compress.h 2020-12-23 16:07:46.120813158 +0100
@@ -0,0 +1,44 @@
+#if !defined( __FS_REISER4_COMPRESS_H__ )
+#define __FS_REISER4_COMPRESS_H__
+
+#include <linux/types.h>
+#include <linux/string.h>
+
+/* transform direction */
+typedef enum {
+ TFMA_READ, /* decrypt, decompress */
+ TFMA_WRITE, /* encrypt, compress */
+ TFMA_LAST
+} tfm_action;
+
+/* supported compression algorithms */
+typedef enum {
+ LZO1_COMPRESSION_ID,
+ GZIP1_COMPRESSION_ID,
+ ZSTD1_COMPRESSION_ID,
+ LAST_COMPRESSION_ID,
+} reiser4_compression_id;
+
+/* the same as pgoff, but units are page clusters */
+typedef unsigned long cloff_t;
+
+/* working data of a (de)compression algorithm */
+typedef void *coa_t;
+
+/* table for all supported (de)compression algorithms */
+typedef coa_t coa_set[LAST_COMPRESSION_ID][TFMA_LAST];
+
+__u32 reiser4_adler32(char *data, __u32 len);
+
+#endif /* __FS_REISER4_COMPRESS_H__ */
+
+/* Make Linus happy.
+ Local variables:
+ c-indentation-style: "K&R"
+ mode-name: "LC"
+ c-basic-offset: 8
+ tab-width: 8
+ fill-column: 120
+ scroll-step: 1
+ End:
+*/
diff -urN --no-dereference linux-5.10.2.orig/fs/reiser4/plugin/compress/compress_mode.c linux-5.10.2/fs/reiser4/plugin/compress/compress_mode.c
--- linux-5.10.2.orig/fs/reiser4/plugin/compress/compress_mode.c 1970-01-01 01:00:00.000000000 +0100
+++ linux-5.10.2/fs/reiser4/plugin/compress/compress_mode.c 2020-12-23 16:07:46.120813158 +0100
@@ -0,0 +1,162 @@
+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
+/* This file contains Reiser4 compression mode plugins.
+
+   A compression mode plugin is a set of handlers called by the
+   compressor at flush time; they implement heuristics, including
+   ones meant to avoid compressing incompressible data. See
+   http://www.namesys.com/cryptcompress_design.html for more details.
+*/
+#include "../../inode.h"
+#include "../plugin.h"
+
+static int should_deflate_none(struct inode * inode, cloff_t index)
+{
+ return 0;
+}
+
+static int should_deflate_common(struct inode * inode, cloff_t index)
+{
+ return compression_is_on(cryptcompress_inode_data(inode));
+}
+
+static int discard_hook_ultim(struct inode *inode, cloff_t index)
+{
+ turn_off_compression(cryptcompress_inode_data(inode));
+ return 0;
+}
+
+static int discard_hook_lattd(struct inode *inode, cloff_t index)
+{
+ struct cryptcompress_info * info = cryptcompress_inode_data(inode);
+
+ assert("edward-1462",
+ get_lattice_factor(info) >= MIN_LATTICE_FACTOR &&
+ get_lattice_factor(info) <= MAX_LATTICE_FACTOR);
+
+ turn_off_compression(info);
+ if (get_lattice_factor(info) < MAX_LATTICE_FACTOR)
+ set_lattice_factor(info, get_lattice_factor(info) << 1);
+ return 0;
+}
+
+static int accept_hook_lattd(struct inode *inode, cloff_t index)
+{
+ turn_on_compression(cryptcompress_inode_data(inode));
+ set_lattice_factor(cryptcompress_inode_data(inode), MIN_LATTICE_FACTOR);
+ return 0;
+}
+
+/* Check-on-dynamic-lattice is an adaptive compression mode which
+   defines the following behavior:
+
+   Compression is on: try to compress everything and turn it off
+   whenever a cluster turns out to be incompressible.
+
+   Compression is off: try to compress clusters with indexes
+   k * FACTOR (k = 0, 1, 2, ...) and turn it on if any of them is
+   compressible. If incompressible, then increase FACTOR */
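+
+/* Example: with FACTOR = 32 only clusters 0, 32, 64, ... are probed
+   while compression is off; a compressible probe re-enables compression
+   and resets FACTOR to MIN_LATTICE_FACTOR (see accept_hook_lattd),
+   an incompressible one doubles FACTOR up to MAX_LATTICE_FACTOR
+   (see discard_hook_lattd). */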
+
+/* check if @index belongs to one-dimensional lattice
+   of sparse factor @factor */
+static int is_on_lattice(cloff_t index, int factor)
+{
+ return (factor ? index % factor == 0: index == 0);
+}
+
+static int should_deflate_lattd(struct inode * inode, cloff_t index)
+{
+ return should_deflate_common(inode, index) ||
+ is_on_lattice(index,
+ get_lattice_factor
+ (cryptcompress_inode_data(inode)));
+}
+
+/* compression mode plugins */
+compression_mode_plugin compression_mode_plugins[LAST_COMPRESSION_MODE_ID] = {
+ [NONE_COMPRESSION_MODE_ID] = {
+ .h = {
+ .type_id = REISER4_COMPRESSION_MODE_PLUGIN_TYPE,
+ .id = NONE_COMPRESSION_MODE_ID,
+ .pops = NULL,
+ .label = "none",
+ .desc = "Compress nothing",
+ .linkage = {NULL, NULL}
+ },
+ .should_deflate = should_deflate_none,
+ .accept_hook = NULL,
+ .discard_hook = NULL
+ },
+ /* Check-on-dynamic-lattice adaptive compression mode */
+ [LATTD_COMPRESSION_MODE_ID] = {
+ .h = {
+ .type_id = REISER4_COMPRESSION_MODE_PLUGIN_TYPE,
+ .id = LATTD_COMPRESSION_MODE_ID,
+ .pops = NULL,
+ .label = "lattd",
+ .desc = "Check on dynamic lattice",
+ .linkage = {NULL, NULL}
+ },
+ .should_deflate = should_deflate_lattd,
+ .accept_hook = accept_hook_lattd,
+ .discard_hook = discard_hook_lattd
+ },
+ /* Check-ultimately compression mode:
+ Turn off compression forever as soon as we meet
+ incompressible data */
+ [ULTIM_COMPRESSION_MODE_ID] = {
+ .h = {
+ .type_id = REISER4_COMPRESSION_MODE_PLUGIN_TYPE,
+ .id = ULTIM_COMPRESSION_MODE_ID,
+ .pops = NULL,
+ .label = "ultim",
+ .desc = "Check ultimately",
+ .linkage = {NULL, NULL}
+ },
+ .should_deflate = should_deflate_common,
+ .accept_hook = NULL,
+ .discard_hook = discard_hook_ultim
+ },
+ /* Force-to-compress-everything compression mode */
+ [FORCE_COMPRESSION_MODE_ID] = {
+ .h = {
+ .type_id = REISER4_COMPRESSION_MODE_PLUGIN_TYPE,
+ .id = FORCE_COMPRESSION_MODE_ID,
+ .pops = NULL,
+ .label = "force",
+ .desc = "Force to compress everything",
+ .linkage = {NULL, NULL}
+ },
+ .should_deflate = NULL,
+ .accept_hook = NULL,
+ .discard_hook = NULL
+ },
+ /* Convert-to-extent compression mode.
+ In this mode items will be converted to extents and management
+ will be passed to (classic) unix file plugin as soon as ->write()
+ detects that the first complete logical cluster (of index #0) is
+ incompressible. */
+ [CONVX_COMPRESSION_MODE_ID] = {
+ .h = {
+ .type_id = REISER4_COMPRESSION_MODE_PLUGIN_TYPE,
+ .id = CONVX_COMPRESSION_MODE_ID,
+ .pops = NULL,
+ .label = "conv",
+ .desc = "Convert to extent",
+ .linkage = {NULL, NULL}
+ },
+ .should_deflate = should_deflate_common,
+ .accept_hook = NULL,
+ .discard_hook = NULL
+ }
+};
+
+/*
+ Local variables:
+ c-indentation-style: "K&R"
+ mode-name: "LC"
+ c-basic-offset: 8
+ tab-width: 8
+ fill-column: 120
+ scroll-step: 1
+ End:
+*/
diff -urN --no-dereference linux-5.10.2.orig/fs/reiser4/plugin/compress/lzoconf.h linux-5.10.2/fs/reiser4/plugin/compress/lzoconf.h
--- linux-5.10.2.orig/fs/reiser4/plugin/compress/lzoconf.h 1970-01-01 01:00:00.000000000 +0100
+++ linux-5.10.2/fs/reiser4/plugin/compress/lzoconf.h 2020-12-23 16:07:46.120813158 +0100
@@ -0,0 +1,216 @@
+/* lzoconf.h -- configuration for the LZO real-time data compression library
+ adopted for reiser4 compression transform plugin.
+
+ This file is part of the LZO real-time data compression library
+ and not included in any proprietary licenses of reiser4.
+
+ Copyright (C) 2002 Markus Franz Xaver Johannes Oberhumer
+ Copyright (C) 2001 Markus Franz Xaver Johannes Oberhumer
+ Copyright (C) 2000 Markus Franz Xaver Johannes Oberhumer
+ Copyright (C) 1999 Markus Franz Xaver Johannes Oberhumer
+ Copyright (C) 1998 Markus Franz Xaver Johannes Oberhumer
+ Copyright (C) 1997 Markus Franz Xaver Johannes Oberhumer
+ Copyright (C) 1996 Markus Franz Xaver Johannes Oberhumer
+ All Rights Reserved.
+
+ The LZO library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU General Public License as
+ published by the Free Software Foundation; either version 2 of
+ the License, or (at your option) any later version.
+
+ The LZO library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with the LZO library; see the file COPYING.
+ If not, write to the Free Software Foundation, Inc.,
+ 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+
+ Markus F.X.J. Oberhumer
+ <markus@oberhumer.com>
+ http://www.oberhumer.com/opensource/lzo/
+ */
+
+#include <linux/kernel.h> /* for UINT_MAX, ULONG_MAX - edward */
+
+#ifndef __LZOCONF_H
+#define __LZOCONF_H
+
+#define LZO_VERSION 0x1080
+#define LZO_VERSION_STRING "1.08"
+#define LZO_VERSION_DATE "Jul 12 2002"
+
+/* internal Autoconf configuration file - only used when building LZO */
+
+/***********************************************************************
+// LZO requires a conforming <limits.h>
+************************************************************************/
+
+#define CHAR_BIT 8
+#define USHRT_MAX 0xffff
+
+/* workaround a cpp bug under hpux 10.20 */
+#define LZO_0xffffffffL 4294967295ul
+
+/***********************************************************************
+// architecture defines
+************************************************************************/
+
+#if !defined(__LZO_i386)
+# if defined(__i386__) || defined(__386__) || defined(_M_IX86)
+# define __LZO_i386
+# endif
+#endif
+
+/* memory checkers */
+#if !defined(__LZO_CHECKER)
+# if defined(__BOUNDS_CHECKING_ON)
+# define __LZO_CHECKER
+# elif defined(__CHECKER__)
+# define __LZO_CHECKER
+# elif defined(__INSURE__)
+# define __LZO_CHECKER
+# elif defined(__PURIFY__)
+# define __LZO_CHECKER
+# endif
+#endif
+
+/***********************************************************************
+// integral and pointer types
+************************************************************************/
+
+/* Integral types with 32 bits or more */
+#if !defined(LZO_UINT32_MAX)
+# if (UINT_MAX >= LZO_0xffffffffL)
+ typedef unsigned int lzo_uint32;
+ typedef int lzo_int32;
+# define LZO_UINT32_MAX UINT_MAX
+# define LZO_INT32_MAX INT_MAX
+# define LZO_INT32_MIN INT_MIN
+# elif (ULONG_MAX >= LZO_0xffffffffL)
+ typedef unsigned long lzo_uint32;
+ typedef long lzo_int32;
+# define LZO_UINT32_MAX ULONG_MAX
+# define LZO_INT32_MAX LONG_MAX
+# define LZO_INT32_MIN LONG_MIN
+# else
+# error "lzo_uint32"
+# endif
+#endif
+
+/* lzo_uint is used like size_t */
+#if !defined(LZO_UINT_MAX)
+# if (UINT_MAX >= LZO_0xffffffffL)
+ typedef unsigned int lzo_uint;
+ typedef int lzo_int;
+# define LZO_UINT_MAX UINT_MAX
+# define LZO_INT_MAX INT_MAX
+# define LZO_INT_MIN INT_MIN
+# elif (ULONG_MAX >= LZO_0xffffffffL)
+ typedef unsigned long lzo_uint;
+ typedef long lzo_int;
+# define LZO_UINT_MAX ULONG_MAX
+# define LZO_INT_MAX LONG_MAX
+# define LZO_INT_MIN LONG_MIN
+# else
+# error "lzo_uint"
+# endif
+#endif
+
+ typedef int lzo_bool;
+
+/***********************************************************************
+// memory models
+************************************************************************/
+
+/* Memory model that allows accessing memory at offsets of lzo_uint. */
+#if !defined(__LZO_MMODEL)
+# if (LZO_UINT_MAX <= UINT_MAX)
+# define __LZO_MMODEL
+# else
+# error "__LZO_MMODEL"
+# endif
+#endif
+
+/* no typedef here because of const-pointer issues */
+#define lzo_byte unsigned char __LZO_MMODEL
+#define lzo_bytep unsigned char __LZO_MMODEL *
+#define lzo_charp char __LZO_MMODEL *
+#define lzo_voidp void __LZO_MMODEL *
+#define lzo_shortp short __LZO_MMODEL *
+#define lzo_ushortp unsigned short __LZO_MMODEL *
+#define lzo_uint32p lzo_uint32 __LZO_MMODEL *
+#define lzo_int32p lzo_int32 __LZO_MMODEL *
+#define lzo_uintp lzo_uint __LZO_MMODEL *
+#define lzo_intp lzo_int __LZO_MMODEL *
+#define lzo_voidpp lzo_voidp __LZO_MMODEL *
+#define lzo_bytepp lzo_bytep __LZO_MMODEL *
+
+#ifndef lzo_sizeof_dict_t
+# define lzo_sizeof_dict_t sizeof(lzo_bytep)
+#endif
+
+typedef int (*lzo_compress_t) (const lzo_byte * src, lzo_uint src_len,
+ lzo_byte * dst, lzo_uintp dst_len,
+ lzo_voidp wrkmem);
+
+
+/***********************************************************************
+// error codes and prototypes
+************************************************************************/
+
+/* Error codes for the compression/decompression functions. Negative
+ * values are errors, positive values will be used for special but
+ * normal events.
+ */
+#define LZO_E_OK 0
+#define LZO_E_ERROR (-1)
+#define LZO_E_OUT_OF_MEMORY (-2) /* not used right now */
+#define LZO_E_NOT_COMPRESSIBLE (-3) /* not used right now */
+#define LZO_E_INPUT_OVERRUN (-4)
+#define LZO_E_OUTPUT_OVERRUN (-5)
+#define LZO_E_LOOKBEHIND_OVERRUN (-6)
+#define LZO_E_EOF_NOT_FOUND (-7)
+#define LZO_E_INPUT_NOT_CONSUMED (-8)
+
+/* lzo_init() should be the first function you call.
+ * Check the return code !
+ *
+ * lzo_init() is a macro to allow checking that the library and the
+ * compiler's view of various types are consistent.
+ */
+#define lzo_init() __lzo_init2(LZO_VERSION,(int)sizeof(short),(int)sizeof(int),\
+ (int)sizeof(long),(int)sizeof(lzo_uint32),(int)sizeof(lzo_uint),\
+ (int)lzo_sizeof_dict_t,(int)sizeof(char *),(int)sizeof(lzo_voidp),\
+ (int)sizeof(lzo_compress_t))
+ extern int __lzo_init2(unsigned, int, int, int, int, int, int,
+ int, int, int);
+
+/* checksum functions */
+extern lzo_uint32 lzo_crc32(lzo_uint32 _c, const lzo_byte * _buf,
+ lzo_uint _len);
+/* misc. */
+ typedef union {
+ lzo_bytep p;
+ lzo_uint u;
+ } __lzo_pu_u;
+ typedef union {
+ lzo_bytep p;
+ lzo_uint32 u32;
+ } __lzo_pu32_u;
+ typedef union {
+ void *vp;
+ lzo_bytep bp;
+ lzo_uint32 u32;
+ long l;
+ } lzo_align_t;
+
+#define LZO_PTR_ALIGN_UP(_ptr,_size) \
+ ((_ptr) + (lzo_uint) __lzo_align_gap((const lzo_voidp)(_ptr),(lzo_uint)(_size)))
+
+/* deprecated - only for backward compatibility */
+#define LZO_ALIGN(_ptr,_size) LZO_PTR_ALIGN_UP(_ptr,_size)
+
+#endif /* already included */
diff -urN --no-dereference linux-5.10.2.orig/fs/reiser4/plugin/compress/Makefile linux-5.10.2/fs/reiser4/plugin/compress/Makefile
--- linux-5.10.2.orig/fs/reiser4/plugin/compress/Makefile 1970-01-01 01:00:00.000000000 +0100
+++ linux-5.10.2/fs/reiser4/plugin/compress/Makefile 2020-12-23 16:07:46.120813158 +0100
@@ -0,0 +1,5 @@
+obj-$(CONFIG_REISER4_FS) += compress_plugins.o
+
+compress_plugins-objs := \
+ compress.o \
+ compress_mode.o
diff -urN --no-dereference linux-5.10.2.orig/fs/reiser4/plugin/compress/minilzo.c linux-5.10.2/fs/reiser4/plugin/compress/minilzo.c
--- linux-5.10.2.orig/fs/reiser4/plugin/compress/minilzo.c 1970-01-01 01:00:00.000000000 +0100
+++ linux-5.10.2/fs/reiser4/plugin/compress/minilzo.c 2020-12-23 16:07:46.121813173 +0100
@@ -0,0 +1,1967 @@
+/* minilzo.c -- mini subset of the LZO real-time data compression library
+ adopted for reiser4 compression transform plugin.
+
+ This file is part of the LZO real-time data compression library
+ and not included in any proprietary licenses of reiser4.
+
+ Copyright (C) 2002 Markus Franz Xaver Johannes Oberhumer
+ Copyright (C) 2001 Markus Franz Xaver Johannes Oberhumer
+ Copyright (C) 2000 Markus Franz Xaver Johannes Oberhumer
+ Copyright (C) 1999 Markus Franz Xaver Johannes Oberhumer
+ Copyright (C) 1998 Markus Franz Xaver Johannes Oberhumer
+ Copyright (C) 1997 Markus Franz Xaver Johannes Oberhumer
+ Copyright (C) 1996 Markus Franz Xaver Johannes Oberhumer
+ All Rights Reserved.
+
+ The LZO library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU General Public License as
+ published by the Free Software Foundation; either version 2 of
+ the License, or (at your option) any later version.
+
+ The LZO library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with the LZO library; see the file COPYING.
+ If not, write to the Free Software Foundation, Inc.,
+ 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+
+ Markus F.X.J. Oberhumer
+ <markus@oberhumer.com>
+ http://www.oberhumer.com/opensource/lzo/
+ */
+
+/*
+ * NOTE:
+ * the full LZO package can be found at
+ * http://www.oberhumer.com/opensource/lzo/
+ */
+
+#include "../../debug.h" /* for reiser4 assert macro -edward */
+
+#define __LZO_IN_MINILZO
+#define LZO_BUILD
+
+#include "minilzo.h"
+
+#if !defined(MINILZO_VERSION) || (MINILZO_VERSION != 0x1080)
+# error "version mismatch in miniLZO source files"
+#endif
+
+#ifndef __LZO_CONF_H
+#define __LZO_CONF_H
+
+# define BOUNDS_CHECKING_OFF_DURING(stmt) stmt
+# define BOUNDS_CHECKING_OFF_IN_EXPR(expr) (expr)
+
+# define HAVE_MEMCMP
+# define HAVE_MEMCPY
+# define HAVE_MEMMOVE
+# define HAVE_MEMSET
+
+#undef NDEBUG
+#if !defined(LZO_DEBUG)
+# define NDEBUG
+#endif
+#if defined(LZO_DEBUG) || !defined(NDEBUG)
+# if !defined(NO_STDIO_H)
+# include <stdio.h>
+# endif
+#endif
+
+#if !defined(LZO_COMPILE_TIME_ASSERT)
+# define LZO_COMPILE_TIME_ASSERT(expr) \
+ { typedef int __lzo_compile_time_assert_fail[1 - 2 * !(expr)]; }
+#endif
+
+#if !defined(LZO_UNUSED)
+# if 1
+# define LZO_UNUSED(var) ((void)&var)
+# elif 0
+# define LZO_UNUSED(var) { typedef int __lzo_unused[sizeof(var) ? 2 : 1]; }
+# else
+# define LZO_UNUSED(parm) (parm = parm)
+# endif
+#endif
+
+#if defined(NO_MEMCMP)
+# undef HAVE_MEMCMP
+#endif
+
+#if !defined(HAVE_MEMSET)
+# undef memset
+# define memset lzo_memset
+#endif
+
+# define LZO_BYTE(x) ((unsigned char) ((x) & 0xff))
+
+#define LZO_MAX(a,b) ((a) >= (b) ? (a) : (b))
+#define LZO_MIN(a,b) ((a) <= (b) ? (a) : (b))
+#define LZO_MAX3(a,b,c) ((a) >= (b) ? LZO_MAX(a,c) : LZO_MAX(b,c))
+#define LZO_MIN3(a,b,c) ((a) <= (b) ? LZO_MIN(a,c) : LZO_MIN(b,c))
+
+#define lzo_sizeof(type) ((lzo_uint) (sizeof(type)))
+
+#define LZO_HIGH(array) ((lzo_uint) (sizeof(array)/sizeof(*(array))))
+
+#define LZO_SIZE(bits) (1u << (bits))
+#define LZO_MASK(bits) (LZO_SIZE(bits) - 1)
+
+#define LZO_LSIZE(bits) (1ul << (bits))
+#define LZO_LMASK(bits) (LZO_LSIZE(bits) - 1)
+
+#define LZO_USIZE(bits) ((lzo_uint) 1 << (bits))
+#define LZO_UMASK(bits) (LZO_USIZE(bits) - 1)
+
+#define LZO_STYPE_MAX(b) (((1l << (8*(b)-2)) - 1l) + (1l << (8*(b)-2)))
+#define LZO_UTYPE_MAX(b) (((1ul << (8*(b)-1)) - 1ul) + (1ul << (8*(b)-1)))
+
+#if !defined(SIZEOF_UNSIGNED)
+# if (UINT_MAX == 0xffff)
+# define SIZEOF_UNSIGNED 2
+# elif (UINT_MAX == LZO_0xffffffffL)
+# define SIZEOF_UNSIGNED 4
+# elif (UINT_MAX >= LZO_0xffffffffL)
+# define SIZEOF_UNSIGNED 8
+# else
+# error "SIZEOF_UNSIGNED"
+# endif
+#endif
+
+#if !defined(SIZEOF_UNSIGNED_LONG)
+# if (ULONG_MAX == LZO_0xffffffffL)
+# define SIZEOF_UNSIGNED_LONG 4
+# elif (ULONG_MAX >= LZO_0xffffffffL)
+# define SIZEOF_UNSIGNED_LONG 8
+# else
+# error "SIZEOF_UNSIGNED_LONG"
+# endif
+#endif
+
+#if !defined(SIZEOF_SIZE_T)
+# define SIZEOF_SIZE_T SIZEOF_UNSIGNED
+#endif
+#if !defined(SIZE_T_MAX)
+# define SIZE_T_MAX LZO_UTYPE_MAX(SIZEOF_SIZE_T)
+#endif
+
+#if 1 && defined(__LZO_i386) && (UINT_MAX == LZO_0xffffffffL)
+# if !defined(LZO_UNALIGNED_OK_2) && (USHRT_MAX == 0xffff)
+# define LZO_UNALIGNED_OK_2
+# endif
+# if !defined(LZO_UNALIGNED_OK_4) && (LZO_UINT32_MAX == LZO_0xffffffffL)
+# define LZO_UNALIGNED_OK_4
+# endif
+#endif
+
+#if defined(LZO_UNALIGNED_OK_2) || defined(LZO_UNALIGNED_OK_4)
+# if !defined(LZO_UNALIGNED_OK)
+# define LZO_UNALIGNED_OK
+# endif
+#endif
+
+#if defined(__LZO_NO_UNALIGNED)
+# undef LZO_UNALIGNED_OK
+# undef LZO_UNALIGNED_OK_2
+# undef LZO_UNALIGNED_OK_4
+#endif
+
+#if defined(LZO_UNALIGNED_OK_2) && (USHRT_MAX != 0xffff)
+# error "LZO_UNALIGNED_OK_2 must not be defined on this system"
+#endif
+#if defined(LZO_UNALIGNED_OK_4) && (LZO_UINT32_MAX != LZO_0xffffffffL)
+# error "LZO_UNALIGNED_OK_4 must not be defined on this system"
+#endif
+
+#if defined(__LZO_NO_ALIGNED)
+# undef LZO_ALIGNED_OK_4
+#endif
+
+#if defined(LZO_ALIGNED_OK_4) && (LZO_UINT32_MAX != LZO_0xffffffffL)
+# error "LZO_ALIGNED_OK_4 must not be defined on this system"
+#endif
+
+#define LZO_LITTLE_ENDIAN 1234
+#define LZO_BIG_ENDIAN 4321
+#define LZO_PDP_ENDIAN 3412
+
+#if !defined(LZO_BYTE_ORDER)
+# if defined(MFX_BYTE_ORDER)
+# define LZO_BYTE_ORDER MFX_BYTE_ORDER
+# elif defined(__LZO_i386)
+# define LZO_BYTE_ORDER LZO_LITTLE_ENDIAN
+# elif defined(BYTE_ORDER)
+# define LZO_BYTE_ORDER BYTE_ORDER
+# elif defined(__BYTE_ORDER)
+# define LZO_BYTE_ORDER __BYTE_ORDER
+# endif
+#endif
+
+#if defined(LZO_BYTE_ORDER)
+# if (LZO_BYTE_ORDER != LZO_LITTLE_ENDIAN) && \
+ (LZO_BYTE_ORDER != LZO_BIG_ENDIAN)
+# error "invalid LZO_BYTE_ORDER"
+# endif
+#endif
+
+#if defined(LZO_UNALIGNED_OK) && !defined(LZO_BYTE_ORDER)
+# error "LZO_BYTE_ORDER is not defined"
+#endif
+
+#define LZO_OPTIMIZE_GNUC_i386_IS_BUGGY
+
+#if defined(NDEBUG) && !defined(LZO_DEBUG) && !defined(__LZO_CHECKER)
+# if defined(__GNUC__) && defined(__i386__)
+# if !defined(LZO_OPTIMIZE_GNUC_i386_IS_BUGGY)
+# define LZO_OPTIMIZE_GNUC_i386
+# endif
+# endif
+#endif
+
+extern const lzo_uint32 _lzo_crc32_table[256];
+
+#define _LZO_STRINGIZE(x) #x
+#define _LZO_MEXPAND(x) _LZO_STRINGIZE(x)
+
+#define _LZO_CONCAT2(a,b) a ## b
+#define _LZO_CONCAT3(a,b,c) a ## b ## c
+#define _LZO_CONCAT4(a,b,c,d) a ## b ## c ## d
+#define _LZO_CONCAT5(a,b,c,d,e) a ## b ## c ## d ## e
+
+#define _LZO_ECONCAT2(a,b) _LZO_CONCAT2(a,b)
+#define _LZO_ECONCAT3(a,b,c) _LZO_CONCAT3(a,b,c)
+#define _LZO_ECONCAT4(a,b,c,d) _LZO_CONCAT4(a,b,c,d)
+#define _LZO_ECONCAT5(a,b,c,d,e) _LZO_CONCAT5(a,b,c,d,e)
+
+#ifndef __LZO_PTR_H
+#define __LZO_PTR_H
+
+#if !defined(lzo_ptrdiff_t)
+# if (UINT_MAX >= LZO_0xffffffffL)
+typedef ptrdiff_t lzo_ptrdiff_t;
+# else
+typedef long lzo_ptrdiff_t;
+# endif
+#endif
+
+#if !defined(__LZO_HAVE_PTR_T)
+# if defined(lzo_ptr_t)
+# define __LZO_HAVE_PTR_T
+# endif
+#endif
+#if !defined(__LZO_HAVE_PTR_T)
+# if defined(SIZEOF_CHAR_P) && defined(SIZEOF_UNSIGNED_LONG)
+# if (SIZEOF_CHAR_P == SIZEOF_UNSIGNED_LONG)
+typedef unsigned long lzo_ptr_t;
+typedef long lzo_sptr_t;
+# define __LZO_HAVE_PTR_T
+# endif
+# endif
+#endif
+#if !defined(__LZO_HAVE_PTR_T)
+# if defined(SIZEOF_CHAR_P) && defined(SIZEOF_UNSIGNED)
+# if (SIZEOF_CHAR_P == SIZEOF_UNSIGNED)
+typedef unsigned int lzo_ptr_t;
+typedef int lzo_sptr_t;
+# define __LZO_HAVE_PTR_T
+# endif
+# endif
+#endif
+#if !defined(__LZO_HAVE_PTR_T)
+# if defined(SIZEOF_CHAR_P) && defined(SIZEOF_UNSIGNED_SHORT)
+# if (SIZEOF_CHAR_P == SIZEOF_UNSIGNED_SHORT)
+typedef unsigned short lzo_ptr_t;
+typedef short lzo_sptr_t;
+# define __LZO_HAVE_PTR_T
+# endif
+# endif
+#endif
+#if !defined(__LZO_HAVE_PTR_T)
+# if defined(LZO_HAVE_CONFIG_H) || defined(SIZEOF_CHAR_P)
+# error "no suitable type for lzo_ptr_t"
+# else
+typedef unsigned long lzo_ptr_t;
+typedef long lzo_sptr_t;
+# define __LZO_HAVE_PTR_T
+# endif
+#endif
+
+#define PTR(a) ((lzo_ptr_t) (a))
+#define PTR_LINEAR(a) PTR(a)
+#define PTR_ALIGNED_4(a) ((PTR_LINEAR(a) & 3) == 0)
+#define PTR_ALIGNED_8(a) ((PTR_LINEAR(a) & 7) == 0)
+#define PTR_ALIGNED2_4(a,b) (((PTR_LINEAR(a) | PTR_LINEAR(b)) & 3) == 0)
+#define PTR_ALIGNED2_8(a,b) (((PTR_LINEAR(a) | PTR_LINEAR(b)) & 7) == 0)
+
+#define PTR_LT(a,b) (PTR(a) < PTR(b))
+#define PTR_GE(a,b) (PTR(a) >= PTR(b))
+#define PTR_DIFF(a,b) ((lzo_ptrdiff_t) (PTR(a) - PTR(b)))
+#define pd(a,b) ((lzo_uint) ((a)-(b)))
+
+typedef union {
+ char a_char;
+ unsigned char a_uchar;
+ short a_short;
+ unsigned short a_ushort;
+ int a_int;
+ unsigned int a_uint;
+ long a_long;
+ unsigned long a_ulong;
+ lzo_int a_lzo_int;
+ lzo_uint a_lzo_uint;
+ lzo_int32 a_lzo_int32;
+ lzo_uint32 a_lzo_uint32;
+ ptrdiff_t a_ptrdiff_t;
+ lzo_ptrdiff_t a_lzo_ptrdiff_t;
+ lzo_ptr_t a_lzo_ptr_t;
+ lzo_voidp a_lzo_voidp;
+ void *a_void_p;
+ lzo_bytep a_lzo_bytep;
+ lzo_bytepp a_lzo_bytepp;
+ lzo_uintp a_lzo_uintp;
+ lzo_uint *a_lzo_uint_p;
+ lzo_uint32p a_lzo_uint32p;
+ lzo_uint32 *a_lzo_uint32_p;
+ unsigned char *a_uchar_p;
+ char *a_char_p;
+} lzo_full_align_t;
+
+#endif
+#define LZO_DETERMINISTIC
+#define LZO_DICT_USE_PTR
+# define lzo_dict_t const lzo_bytep
+# define lzo_dict_p lzo_dict_t __LZO_MMODEL *
+#if !defined(lzo_moff_t)
+#define lzo_moff_t lzo_uint
+#endif
+#endif
+static lzo_ptr_t __lzo_ptr_linear(const lzo_voidp ptr)
+{
+ return PTR_LINEAR(ptr);
+}
+
+static unsigned __lzo_align_gap(const lzo_voidp ptr, lzo_uint size)
+{
+ lzo_ptr_t p, s, n;
+
+ assert("lzo-01", size > 0);
+
+ p = __lzo_ptr_linear(ptr);
+ s = (lzo_ptr_t) (size - 1);
+ n = (((p + s) / size) * size) - p;
+
+ assert("lzo-02", (long)n >= 0);
+ assert("lzo-03", n <= s);
+
+ return (unsigned)n;
+}
+
+#ifndef __LZO_UTIL_H
+#define __LZO_UTIL_H
+
+#ifndef __LZO_CONF_H
+#endif
+
+#if 1 && defined(HAVE_MEMCPY)
+#define MEMCPY8_DS(dest,src,len) \
+ memcpy(dest,src,len); \
+ dest += len; \
+ src += len
+#endif
+
+#if !defined(MEMCPY8_DS)
+
+#define MEMCPY8_DS(dest,src,len) \
+ { register lzo_uint __l = (len) / 8; \
+ do { \
+ *dest++ = *src++; \
+ *dest++ = *src++; \
+ *dest++ = *src++; \
+ *dest++ = *src++; \
+ *dest++ = *src++; \
+ *dest++ = *src++; \
+ *dest++ = *src++; \
+ *dest++ = *src++; \
+ } while (--__l > 0); }
+
+#endif
+
+#define MEMCPY_DS(dest,src,len) \
+ do *dest++ = *src++; \
+ while (--len > 0)
+
+#define MEMMOVE_DS(dest,src,len) \
+ do *dest++ = *src++; \
+ while (--len > 0)
+
+#if (LZO_UINT_MAX <= SIZE_T_MAX) && defined(HAVE_MEMSET)
+
+#define BZERO8_PTR(s,l,n) memset((s),0,(lzo_uint)(l)*(n))
+
+#else
+
+#define BZERO8_PTR(s,l,n) \
+ lzo_memset((lzo_voidp)(s),0,(lzo_uint)(l)*(n))
+
+#endif
+#endif
+
+/* If you use the LZO library in a product, you *must* keep this
+ * copyright string in the executable of your product.
+ */
+
+static const lzo_byte __lzo_copyright[] =
+#if !defined(__LZO_IN_MINILZO)
+ LZO_VERSION_STRING;
+#else
+ "\n\n\n"
+ "LZO real-time data compression library.\n"
+ "Copyright (C) 1996, 1997, 1998, 1999, 2000, 2001, 2002 Markus Franz Xaver Johannes Oberhumer\n"
+ "<markus.oberhumer@jk.uni-linz.ac.at>\n"
+ "http://www.oberhumer.com/opensource/lzo/\n"
+ "\n"
+ "LZO version: v" LZO_VERSION_STRING ", " LZO_VERSION_DATE "\n"
+ "LZO build date: " __DATE__ " " __TIME__ "\n\n"
+ "LZO special compilation options:\n"
+#ifdef __cplusplus
+ " __cplusplus\n"
+#endif
+#if defined(__PIC__)
+ " __PIC__\n"
+#elif defined(__pic__)
+ " __pic__\n"
+#endif
+#if (UINT_MAX < LZO_0xffffffffL)
+ " 16BIT\n"
+#endif
+#if defined(__LZO_STRICT_16BIT)
+ " __LZO_STRICT_16BIT\n"
+#endif
+#if (UINT_MAX > LZO_0xffffffffL)
+ " UINT_MAX=" _LZO_MEXPAND(UINT_MAX) "\n"
+#endif
+#if (ULONG_MAX > LZO_0xffffffffL)
+ " ULONG_MAX=" _LZO_MEXPAND(ULONG_MAX) "\n"
+#endif
+#if defined(LZO_BYTE_ORDER)
+ " LZO_BYTE_ORDER=" _LZO_MEXPAND(LZO_BYTE_ORDER) "\n"
+#endif
+#if defined(LZO_UNALIGNED_OK_2)
+ " LZO_UNALIGNED_OK_2\n"
+#endif
+#if defined(LZO_UNALIGNED_OK_4)
+ " LZO_UNALIGNED_OK_4\n"
+#endif
+#if defined(LZO_ALIGNED_OK_4)
+ " LZO_ALIGNED_OK_4\n"
+#endif
+#if defined(LZO_DICT_USE_PTR)
+ " LZO_DICT_USE_PTR\n"
+#endif
+#if defined(__LZO_QUERY_COMPRESS)
+ " __LZO_QUERY_COMPRESS\n"
+#endif
+#if defined(__LZO_QUERY_DECOMPRESS)
+ " __LZO_QUERY_DECOMPRESS\n"
+#endif
+#if defined(__LZO_IN_MINILZO)
+ " __LZO_IN_MINILZO\n"
+#endif
+ "\n\n" "$Id: LZO " LZO_VERSION_STRING " built " __DATE__ " " __TIME__
+#if defined(__GNUC__) && defined(__VERSION__)
+ " by gcc " __VERSION__
+#elif defined(__BORLANDC__)
+ " by Borland C " _LZO_MEXPAND(__BORLANDC__)
+#elif defined(_MSC_VER)
+ " by Microsoft C " _LZO_MEXPAND(_MSC_VER)
+#elif defined(__PUREC__)
+ " by Pure C " _LZO_MEXPAND(__PUREC__)
+#elif defined(__SC__)
+ " by Symantec C " _LZO_MEXPAND(__SC__)
+#elif defined(__TURBOC__)
+ " by Turbo C " _LZO_MEXPAND(__TURBOC__)
+#elif defined(__WATCOMC__)
+ " by Watcom C " _LZO_MEXPAND(__WATCOMC__)
+#endif
+ " $\n"
+ "$Copyright: LZO (C) 1996, 1997, 1998, 1999, 2000, 2001, 2002 Markus Franz Xaver Johannes Oberhumer $\n";
+#endif
+
+#define LZO_BASE 65521u
+#define LZO_NMAX 5552
+
+#define LZO_DO1(buf,i) {s1 += buf[i]; s2 += s1;}
+#define LZO_DO2(buf,i) LZO_DO1(buf,i); LZO_DO1(buf,i+1);
+#define LZO_DO4(buf,i) LZO_DO2(buf,i); LZO_DO2(buf,i+2);
+#define LZO_DO8(buf,i) LZO_DO4(buf,i); LZO_DO4(buf,i+4);
+#define LZO_DO16(buf,i) LZO_DO8(buf,i); LZO_DO8(buf,i+8);
+
+# define IS_SIGNED(type) (((type) (-1)) < ((type) 0))
+# define IS_UNSIGNED(type) (((type) (-1)) > ((type) 0))
+
+#define IS_POWER_OF_2(x) (((x) & ((x) - 1)) == 0)
+
+static lzo_bool schedule_insns_bug(void);
+static lzo_bool strength_reduce_bug(int *);
+
+# define __lzo_assert(x) ((x) ? 1 : 0)
+
+#undef COMPILE_TIME_ASSERT
+
+# define COMPILE_TIME_ASSERT(expr) LZO_COMPILE_TIME_ASSERT(expr)
+
+static lzo_bool basic_integral_check(void)
+{
+ lzo_bool r = 1;
+
+ COMPILE_TIME_ASSERT(CHAR_BIT == 8);
+ COMPILE_TIME_ASSERT(sizeof(char) == 1);
+ COMPILE_TIME_ASSERT(sizeof(short) >= 2);
+ COMPILE_TIME_ASSERT(sizeof(long) >= 4);
+ COMPILE_TIME_ASSERT(sizeof(int) >= sizeof(short));
+ COMPILE_TIME_ASSERT(sizeof(long) >= sizeof(int));
+
+ COMPILE_TIME_ASSERT(sizeof(lzo_uint) == sizeof(lzo_int));
+ COMPILE_TIME_ASSERT(sizeof(lzo_uint32) == sizeof(lzo_int32));
+
+ COMPILE_TIME_ASSERT(sizeof(lzo_uint32) >= 4);
+ COMPILE_TIME_ASSERT(sizeof(lzo_uint32) >= sizeof(unsigned));
+#if defined(__LZO_STRICT_16BIT)
+ COMPILE_TIME_ASSERT(sizeof(lzo_uint) == 2);
+#else
+ COMPILE_TIME_ASSERT(sizeof(lzo_uint) >= 4);
+ COMPILE_TIME_ASSERT(sizeof(lzo_uint) >= sizeof(unsigned));
+#endif
+
+#if (USHRT_MAX == 65535u)
+ COMPILE_TIME_ASSERT(sizeof(short) == 2);
+#elif (USHRT_MAX == LZO_0xffffffffL)
+ COMPILE_TIME_ASSERT(sizeof(short) == 4);
+#elif (USHRT_MAX >= LZO_0xffffffffL)
+ COMPILE_TIME_ASSERT(sizeof(short) > 4);
+#endif
+ COMPILE_TIME_ASSERT(IS_UNSIGNED(unsigned char));
+ COMPILE_TIME_ASSERT(IS_UNSIGNED(unsigned short));
+ COMPILE_TIME_ASSERT(IS_UNSIGNED(unsigned));
+ COMPILE_TIME_ASSERT(IS_UNSIGNED(unsigned long));
+ COMPILE_TIME_ASSERT(IS_SIGNED(short));
+ COMPILE_TIME_ASSERT(IS_SIGNED(int));
+ COMPILE_TIME_ASSERT(IS_SIGNED(long));
+
+ COMPILE_TIME_ASSERT(IS_UNSIGNED(lzo_uint32));
+ COMPILE_TIME_ASSERT(IS_UNSIGNED(lzo_uint));
+ COMPILE_TIME_ASSERT(IS_SIGNED(lzo_int32));
+ COMPILE_TIME_ASSERT(IS_SIGNED(lzo_int));
+
+ COMPILE_TIME_ASSERT(INT_MAX == LZO_STYPE_MAX(sizeof(int)));
+ COMPILE_TIME_ASSERT(UINT_MAX == LZO_UTYPE_MAX(sizeof(unsigned)));
+ COMPILE_TIME_ASSERT(LONG_MAX == LZO_STYPE_MAX(sizeof(long)));
+ COMPILE_TIME_ASSERT(ULONG_MAX == LZO_UTYPE_MAX(sizeof(unsigned long)));
+ COMPILE_TIME_ASSERT(USHRT_MAX == LZO_UTYPE_MAX(sizeof(unsigned short)));
+ COMPILE_TIME_ASSERT(LZO_UINT32_MAX ==
+ LZO_UTYPE_MAX(sizeof(lzo_uint32)));
+ COMPILE_TIME_ASSERT(LZO_UINT_MAX == LZO_UTYPE_MAX(sizeof(lzo_uint)));
+
+ r &= __lzo_assert(LZO_BYTE(257) == 1);
+
+ return r;
+}
+
+static lzo_bool basic_ptr_check(void)
+{
+ lzo_bool r = 1;
+
+ COMPILE_TIME_ASSERT(sizeof(char *) >= sizeof(int));
+ COMPILE_TIME_ASSERT(sizeof(lzo_byte *) >= sizeof(char *));
+
+ COMPILE_TIME_ASSERT(sizeof(lzo_voidp) == sizeof(lzo_byte *));
+ COMPILE_TIME_ASSERT(sizeof(lzo_voidp) == sizeof(lzo_voidpp));
+ COMPILE_TIME_ASSERT(sizeof(lzo_voidp) == sizeof(lzo_bytepp));
+ COMPILE_TIME_ASSERT(sizeof(lzo_voidp) >= sizeof(lzo_uint));
+
+ COMPILE_TIME_ASSERT(sizeof(lzo_ptr_t) == sizeof(lzo_voidp));
+ COMPILE_TIME_ASSERT(sizeof(lzo_ptr_t) == sizeof(lzo_sptr_t));
+ COMPILE_TIME_ASSERT(sizeof(lzo_ptr_t) >= sizeof(lzo_uint));
+
+ COMPILE_TIME_ASSERT(sizeof(lzo_ptrdiff_t) >= 4);
+ COMPILE_TIME_ASSERT(sizeof(lzo_ptrdiff_t) >= sizeof(ptrdiff_t));
+
+ COMPILE_TIME_ASSERT(sizeof(ptrdiff_t) >= sizeof(size_t));
+ COMPILE_TIME_ASSERT(sizeof(lzo_ptrdiff_t) >= sizeof(lzo_uint));
+
+#if defined(SIZEOF_CHAR_P)
+ COMPILE_TIME_ASSERT(SIZEOF_CHAR_P == sizeof(char *));
+#endif
+#if defined(SIZEOF_PTRDIFF_T)
+ COMPILE_TIME_ASSERT(SIZEOF_PTRDIFF_T == sizeof(ptrdiff_t));
+#endif
+
+ COMPILE_TIME_ASSERT(IS_SIGNED(ptrdiff_t));
+ COMPILE_TIME_ASSERT(IS_UNSIGNED(size_t));
+ COMPILE_TIME_ASSERT(IS_SIGNED(lzo_ptrdiff_t));
+ COMPILE_TIME_ASSERT(IS_SIGNED(lzo_sptr_t));
+ COMPILE_TIME_ASSERT(IS_UNSIGNED(lzo_ptr_t));
+ COMPILE_TIME_ASSERT(IS_UNSIGNED(lzo_moff_t));
+
+ return r;
+}
+
+static lzo_bool ptr_check(void)
+{
+ lzo_bool r = 1;
+ int i;
+ char _wrkmem[10 * sizeof(lzo_byte *) + sizeof(lzo_full_align_t)];
+ lzo_bytep wrkmem;
+ lzo_bytepp dict;
+ unsigned char x[4 * sizeof(lzo_full_align_t)];
+ long d;
+ lzo_full_align_t a;
+ lzo_full_align_t u;
+
+ for (i = 0; i < (int)sizeof(x); i++)
+ x[i] = LZO_BYTE(i);
+
+ wrkmem =
+ LZO_PTR_ALIGN_UP((lzo_byte *) _wrkmem, sizeof(lzo_full_align_t));
+
+ u.a_lzo_bytep = wrkmem;
+ dict = u.a_lzo_bytepp;
+
+ d = (long)((const lzo_bytep)dict - (const lzo_bytep)_wrkmem);
+ r &= __lzo_assert(d >= 0);
+ r &= __lzo_assert(d < (long)sizeof(lzo_full_align_t));
+
+ memset(&a, 0, sizeof(a));
+ r &= __lzo_assert(a.a_lzo_voidp == NULL);
+
+ memset(&a, 0xff, sizeof(a));
+ r &= __lzo_assert(a.a_ushort == USHRT_MAX);
+ r &= __lzo_assert(a.a_uint == UINT_MAX);
+ r &= __lzo_assert(a.a_ulong == ULONG_MAX);
+ r &= __lzo_assert(a.a_lzo_uint == LZO_UINT_MAX);
+ r &= __lzo_assert(a.a_lzo_uint32 == LZO_UINT32_MAX);
+
+ if (r == 1) {
+ for (i = 0; i < 8; i++)
+ r &= __lzo_assert((const lzo_voidp)(&dict[i]) ==
+ (const
+ lzo_voidp)(&wrkmem[i *
+ sizeof(lzo_byte
+ *)]));
+ }
+
+ memset(&a, 0, sizeof(a));
+ r &= __lzo_assert(a.a_char_p == NULL);
+ r &= __lzo_assert(a.a_lzo_bytep == NULL);
+ r &= __lzo_assert(NULL == (void *)0);
+ if (r == 1) {
+ for (i = 0; i < 10; i++)
+ dict[i] = wrkmem;
+ BZERO8_PTR(dict + 1, sizeof(dict[0]), 8);
+ r &= __lzo_assert(dict[0] == wrkmem);
+ for (i = 1; i < 9; i++)
+ r &= __lzo_assert(dict[i] == NULL);
+ r &= __lzo_assert(dict[9] == wrkmem);
+ }
+
+ if (r == 1) {
+ unsigned k = 1;
+ const unsigned n = (unsigned)sizeof(lzo_uint32);
+ lzo_byte *p0;
+ lzo_byte *p1;
+
+ k += __lzo_align_gap(&x[k], n);
+ p0 = (lzo_bytep) & x[k];
+#if defined(PTR_LINEAR)
+ r &= __lzo_assert((PTR_LINEAR(p0) & (n - 1)) == 0);
+#else
+ r &= __lzo_assert(n == 4);
+ r &= __lzo_assert(PTR_ALIGNED_4(p0));
+#endif
+
+ r &= __lzo_assert(k >= 1);
+ p1 = (lzo_bytep) & x[1];
+ r &= __lzo_assert(PTR_GE(p0, p1));
+
+ r &= __lzo_assert(k < 1 + n);
+ p1 = (lzo_bytep) & x[1 + n];
+ r &= __lzo_assert(PTR_LT(p0, p1));
+
+ if (r == 1) {
+ lzo_uint32 v0, v1;
+
+ u.a_uchar_p = &x[k];
+ v0 = *u.a_lzo_uint32_p;
+ u.a_uchar_p = &x[k + n];
+ v1 = *u.a_lzo_uint32_p;
+
+ r &= __lzo_assert(v0 > 0);
+ r &= __lzo_assert(v1 > 0);
+ }
+ }
+
+ return r;
+}
+
+static int _lzo_config_check(void)
+{
+ lzo_bool r = 1;
+ int i;
+ union {
+ lzo_uint32 a;
+ unsigned short b;
+ lzo_uint32 aa[4];
+ unsigned char x[4 * sizeof(lzo_full_align_t)];
+ } u;
+
+ COMPILE_TIME_ASSERT((int)((unsigned char)((signed char)-1)) == 255);
+ COMPILE_TIME_ASSERT((((unsigned char)128) << (int)(8 * sizeof(int) - 8))
+ < 0);
+
+ r &= basic_integral_check();
+ r &= basic_ptr_check();
+ if (r != 1)
+ return LZO_E_ERROR;
+
+ u.a = 0;
+ u.b = 0;
+ for (i = 0; i < (int)sizeof(u.x); i++)
+ u.x[i] = LZO_BYTE(i);
+
+#if defined(LZO_BYTE_ORDER)
+ if (r == 1) {
+# if (LZO_BYTE_ORDER == LZO_LITTLE_ENDIAN)
+ lzo_uint32 a = (lzo_uint32) (u.a & LZO_0xffffffffL);
+ unsigned short b = (unsigned short)(u.b & 0xffff);
+ r &= __lzo_assert(a == 0x03020100L);
+ r &= __lzo_assert(b == 0x0100);
+# elif (LZO_BYTE_ORDER == LZO_BIG_ENDIAN)
+ lzo_uint32 a = u.a >> (8 * sizeof(u.a) - 32);
+ unsigned short b = u.b >> (8 * sizeof(u.b) - 16);
+ r &= __lzo_assert(a == 0x00010203L);
+ r &= __lzo_assert(b == 0x0001);
+# else
+# error "invalid LZO_BYTE_ORDER"
+# endif
+ }
+#endif
+
+#if defined(LZO_UNALIGNED_OK_2)
+ COMPILE_TIME_ASSERT(sizeof(short) == 2);
+ if (r == 1) {
+ unsigned short b[4];
+
+ for (i = 0; i < 4; i++)
+ b[i] = *(const unsigned short *)&u.x[i];
+
+# if (LZO_BYTE_ORDER == LZO_LITTLE_ENDIAN)
+ r &= __lzo_assert(b[0] == 0x0100);
+ r &= __lzo_assert(b[1] == 0x0201);
+ r &= __lzo_assert(b[2] == 0x0302);
+ r &= __lzo_assert(b[3] == 0x0403);
+# elif (LZO_BYTE_ORDER == LZO_BIG_ENDIAN)
+ r &= __lzo_assert(b[0] == 0x0001);
+ r &= __lzo_assert(b[1] == 0x0102);
+ r &= __lzo_assert(b[2] == 0x0203);
+ r &= __lzo_assert(b[3] == 0x0304);
+# endif
+ }
+#endif
+
+#if defined(LZO_UNALIGNED_OK_4)
+ COMPILE_TIME_ASSERT(sizeof(lzo_uint32) == 4);
+ if (r == 1) {
+ lzo_uint32 a[4];
+
+ for (i = 0; i < 4; i++)
+ a[i] = *(const lzo_uint32 *)&u.x[i];
+
+# if (LZO_BYTE_ORDER == LZO_LITTLE_ENDIAN)
+ r &= __lzo_assert(a[0] == 0x03020100L);
+ r &= __lzo_assert(a[1] == 0x04030201L);
+ r &= __lzo_assert(a[2] == 0x05040302L);
+ r &= __lzo_assert(a[3] == 0x06050403L);
+# elif (LZO_BYTE_ORDER == LZO_BIG_ENDIAN)
+ r &= __lzo_assert(a[0] == 0x00010203L);
+ r &= __lzo_assert(a[1] == 0x01020304L);
+ r &= __lzo_assert(a[2] == 0x02030405L);
+ r &= __lzo_assert(a[3] == 0x03040506L);
+# endif
+ }
+#endif
+
+#if defined(LZO_ALIGNED_OK_4)
+ COMPILE_TIME_ASSERT(sizeof(lzo_uint32) == 4);
+#endif
+
+ COMPILE_TIME_ASSERT(lzo_sizeof_dict_t == sizeof(lzo_dict_t));
+
+ if (r == 1) {
+ r &= __lzo_assert(!schedule_insns_bug());
+ }
+
+ if (r == 1) {
+ static int x[3];
+ static unsigned xn = 3;
+ register unsigned j;
+
+ for (j = 0; j < xn; j++)
+ x[j] = (int)j - 3;
+ r &= __lzo_assert(!strength_reduce_bug(x));
+ }
+
+ if (r == 1) {
+ r &= ptr_check();
+ }
+
+ return r == 1 ? LZO_E_OK : LZO_E_ERROR;
+}
+
+static lzo_bool schedule_insns_bug(void)
+{
+#if defined(__LZO_CHECKER)
+ return 0;
+#else
+ const int clone[] = { 1, 2, 0 };
+ const int *q;
+ q = clone;
+ return (*q) ? 0 : 1;
+#endif
+}
+
+static lzo_bool strength_reduce_bug(int *x)
+{
+ return x[0] != -3 || x[1] != -2 || x[2] != -1;
+}
+
+#undef COMPILE_TIME_ASSERT
+
+int __lzo_init2(unsigned v, int s1, int s2, int s3, int s4, int s5,
+ int s6, int s7, int s8, int s9)
+{
+ int r;
+
+ if (v == 0)
+ return LZO_E_ERROR;
+
+ r = (s1 == -1 || s1 == (int)sizeof(short)) &&
+ (s2 == -1 || s2 == (int)sizeof(int)) &&
+ (s3 == -1 || s3 == (int)sizeof(long)) &&
+ (s4 == -1 || s4 == (int)sizeof(lzo_uint32)) &&
+ (s5 == -1 || s5 == (int)sizeof(lzo_uint)) &&
+ (s6 == -1 || s6 == (int)lzo_sizeof_dict_t) &&
+ (s7 == -1 || s7 == (int)sizeof(char *)) &&
+ (s8 == -1 || s8 == (int)sizeof(lzo_voidp)) &&
+ (s9 == -1 || s9 == (int)sizeof(lzo_compress_t));
+ if (!r)
+ return LZO_E_ERROR;
+
+ r = _lzo_config_check();
+ if (r != LZO_E_OK)
+ return r;
+
+ return r;
+}
+
+#define do_compress _lzo1x_1_do_compress
+
+#define LZO_NEED_DICT_H
+#define D_BITS 14
+#define D_INDEX1(d,p) d = DM((0x21*DX3(p,5,5,6)) >> 5)
+#define D_INDEX2(d,p) d = (d & (D_MASK & 0x7ff)) ^ (D_HIGH | 0x1f)
+
+#ifndef __LZO_CONFIG1X_H
+#define __LZO_CONFIG1X_H
+
+#if !defined(LZO1X) && !defined(LZO1Y) && !defined(LZO1Z)
+# define LZO1X
+#endif
+
+#define LZO_EOF_CODE
+#undef LZO_DETERMINISTIC
+
+#define M1_MAX_OFFSET 0x0400
+#ifndef M2_MAX_OFFSET
+#define M2_MAX_OFFSET 0x0800
+#endif
+#define M3_MAX_OFFSET 0x4000
+#define M4_MAX_OFFSET 0xbfff
+
+#define MX_MAX_OFFSET (M1_MAX_OFFSET + M2_MAX_OFFSET)
+
+#define M1_MIN_LEN 2
+#define M1_MAX_LEN 2
+#define M2_MIN_LEN 3
+#ifndef M2_MAX_LEN
+#define M2_MAX_LEN 8
+#endif
+#define M3_MIN_LEN 3
+#define M3_MAX_LEN 33
+#define M4_MIN_LEN 3
+#define M4_MAX_LEN 9
+
+#define M1_MARKER 0
+#define M2_MARKER 64
+#define M3_MARKER 32
+#define M4_MARKER 16
+
+#ifndef MIN_LOOKAHEAD
+#define MIN_LOOKAHEAD (M2_MAX_LEN + 1)
+#endif
+
+#if defined(LZO_NEED_DICT_H)
+
+#ifndef LZO_HASH
+#define LZO_HASH LZO_HASH_LZO_INCREMENTAL_B
+#endif
+#define DL_MIN_LEN M2_MIN_LEN
+
+#ifndef __LZO_DICT_H
+#define __LZO_DICT_H
+
+#if !defined(D_BITS) && defined(DBITS)
+# define D_BITS DBITS
+#endif
+#if !defined(D_BITS)
+# error "D_BITS is not defined"
+#endif
+#if (D_BITS < 16)
+# define D_SIZE LZO_SIZE(D_BITS)
+# define D_MASK LZO_MASK(D_BITS)
+#else
+# define D_SIZE LZO_USIZE(D_BITS)
+# define D_MASK LZO_UMASK(D_BITS)
+#endif
+#define D_HIGH ((D_MASK >> 1) + 1)
+
+#if !defined(DD_BITS)
+# define DD_BITS 0
+#endif
+#define DD_SIZE LZO_SIZE(DD_BITS)
+#define DD_MASK LZO_MASK(DD_BITS)
+
+#if !defined(DL_BITS)
+# define DL_BITS (D_BITS - DD_BITS)
+#endif
+#if (DL_BITS < 16)
+# define DL_SIZE LZO_SIZE(DL_BITS)
+# define DL_MASK LZO_MASK(DL_BITS)
+#else
+# define DL_SIZE LZO_USIZE(DL_BITS)
+# define DL_MASK LZO_UMASK(DL_BITS)
+#endif
+
+#if (D_BITS != DL_BITS + DD_BITS)
+# error "D_BITS does not match"
+#endif
+#if (D_BITS < 8 || D_BITS > 18)
+# error "invalid D_BITS"
+#endif
+#if (DL_BITS < 8 || DL_BITS > 20)
+# error "invalid DL_BITS"
+#endif
+#if (DD_BITS < 0 || DD_BITS > 6)
+# error "invalid DD_BITS"
+#endif
+
+#if !defined(DL_MIN_LEN)
+# define DL_MIN_LEN 3
+#endif
+#if !defined(DL_SHIFT)
+# define DL_SHIFT ((DL_BITS + (DL_MIN_LEN - 1)) / DL_MIN_LEN)
+#endif
+
+#define LZO_HASH_GZIP 1
+#define LZO_HASH_GZIP_INCREMENTAL 2
+#define LZO_HASH_LZO_INCREMENTAL_A 3
+#define LZO_HASH_LZO_INCREMENTAL_B 4
+
+#if !defined(LZO_HASH)
+# error "choose a hashing strategy"
+#endif
+
+#if (DL_MIN_LEN == 3)
+# define _DV2_A(p,shift1,shift2) \
+ (((( (lzo_uint32)((p)[0]) << shift1) ^ (p)[1]) << shift2) ^ (p)[2])
+# define _DV2_B(p,shift1,shift2) \
+ (((( (lzo_uint32)((p)[2]) << shift1) ^ (p)[1]) << shift2) ^ (p)[0])
+# define _DV3_B(p,shift1,shift2,shift3) \
+ ((_DV2_B((p)+1,shift1,shift2) << (shift3)) ^ (p)[0])
+#elif (DL_MIN_LEN == 2)
+# define _DV2_A(p,shift1,shift2) \
+ (( (lzo_uint32)(p[0]) << shift1) ^ p[1])
+# define _DV2_B(p,shift1,shift2) \
+ (( (lzo_uint32)(p[1]) << shift1) ^ p[2])
+#else
+# error "invalid DL_MIN_LEN"
+#endif
+#define _DV_A(p,shift) _DV2_A(p,shift,shift)
+#define _DV_B(p,shift) _DV2_B(p,shift,shift)
+#define DA2(p,s1,s2) \
+ (((((lzo_uint32)((p)[2]) << (s2)) + (p)[1]) << (s1)) + (p)[0])
+#define DS2(p,s1,s2) \
+ (((((lzo_uint32)((p)[2]) << (s2)) - (p)[1]) << (s1)) - (p)[0])
+#define DX2(p,s1,s2) \
+ (((((lzo_uint32)((p)[2]) << (s2)) ^ (p)[1]) << (s1)) ^ (p)[0])
+#define DA3(p,s1,s2,s3) ((DA2((p)+1,s2,s3) << (s1)) + (p)[0])
+#define DS3(p,s1,s2,s3) ((DS2((p)+1,s2,s3) << (s1)) - (p)[0])
+#define DX3(p,s1,s2,s3) ((DX2((p)+1,s2,s3) << (s1)) ^ (p)[0])
+#define DMS(v,s) ((lzo_uint) (((v) & (D_MASK >> (s))) << (s)))
+#define DM(v) DMS(v,0)
+
+#if (LZO_HASH == LZO_HASH_GZIP)
+# define _DINDEX(dv,p) (_DV_A((p),DL_SHIFT))
+
+#elif (LZO_HASH == LZO_HASH_GZIP_INCREMENTAL)
+# define __LZO_HASH_INCREMENTAL
+# define DVAL_FIRST(dv,p) dv = _DV_A((p),DL_SHIFT)
+# define DVAL_NEXT(dv,p) dv = (((dv) << DL_SHIFT) ^ p[2])
+# define _DINDEX(dv,p) (dv)
+# define DVAL_LOOKAHEAD DL_MIN_LEN
+
+#elif (LZO_HASH == LZO_HASH_LZO_INCREMENTAL_A)
+# define __LZO_HASH_INCREMENTAL
+# define DVAL_FIRST(dv,p) dv = _DV_A((p),5)
+# define DVAL_NEXT(dv,p) \
+ dv ^= (lzo_uint32)(p[-1]) << (2*5); dv = (((dv) << 5) ^ p[2])
+# define _DINDEX(dv,p) ((0x9f5f * (dv)) >> 5)
+# define DVAL_LOOKAHEAD DL_MIN_LEN
+
+#elif (LZO_HASH == LZO_HASH_LZO_INCREMENTAL_B)
+# define __LZO_HASH_INCREMENTAL
+# define DVAL_FIRST(dv,p) dv = _DV_B((p),5)
+# define DVAL_NEXT(dv,p) \
+ dv ^= p[-1]; dv = (((dv) >> 5) ^ ((lzo_uint32)(p[2]) << (2*5)))
+# define _DINDEX(dv,p) ((0x9f5f * (dv)) >> 5)
+# define DVAL_LOOKAHEAD DL_MIN_LEN
+
+#else
+# error "choose a hashing strategy"
+#endif
+
+#ifndef DINDEX
+#define DINDEX(dv,p) ((lzo_uint)((_DINDEX(dv,p)) & DL_MASK) << DD_BITS)
+#endif
+#if !defined(DINDEX1) && defined(D_INDEX1)
+#define DINDEX1 D_INDEX1
+#endif
+#if !defined(DINDEX2) && defined(D_INDEX2)
+#define DINDEX2 D_INDEX2
+#endif
+
+#if !defined(__LZO_HASH_INCREMENTAL)
+# define DVAL_FIRST(dv,p) ((void) 0)
+# define DVAL_NEXT(dv,p) ((void) 0)
+# define DVAL_LOOKAHEAD 0
+#endif
+
+#if !defined(DVAL_ASSERT)
+#if defined(__LZO_HASH_INCREMENTAL) && !defined(NDEBUG)
+static void DVAL_ASSERT(lzo_uint32 dv, const lzo_byte * p)
+{
+ lzo_uint32 df;
+ DVAL_FIRST(df, (p));
+ assert(DINDEX(dv, p) == DINDEX(df, p));
+}
+#else
+# define DVAL_ASSERT(dv,p) ((void) 0)
+#endif
+#endif
+
+# define DENTRY(p,in) (p)
+# define GINDEX(m_pos,m_off,dict,dindex,in) m_pos = dict[dindex]
+
+#if (DD_BITS == 0)
+
+# define UPDATE_D(dict,drun,dv,p,in) dict[ DINDEX(dv,p) ] = DENTRY(p,in)
+# define UPDATE_I(dict,drun,index,p,in) dict[index] = DENTRY(p,in)
+# define UPDATE_P(ptr,drun,p,in) (ptr)[0] = DENTRY(p,in)
+
+#else
+
+# define UPDATE_D(dict,drun,dv,p,in) \
+ dict[ DINDEX(dv,p) + drun++ ] = DENTRY(p,in); drun &= DD_MASK
+# define UPDATE_I(dict,drun,index,p,in) \
+ dict[ (index) + drun++ ] = DENTRY(p,in); drun &= DD_MASK
+# define UPDATE_P(ptr,drun,p,in) \
+ (ptr) [ drun++ ] = DENTRY(p,in); drun &= DD_MASK
+
+#endif
+
+#define LZO_CHECK_MPOS_DET(m_pos,m_off,in,ip,max_offset) \
+ (m_pos == NULL || (m_off = (lzo_moff_t) (ip - m_pos)) > max_offset)
+
+#define LZO_CHECK_MPOS_NON_DET(m_pos,m_off,in,ip,max_offset) \
+ (BOUNDS_CHECKING_OFF_IN_EXPR( \
+ (PTR_LT(m_pos,in) || \
+ (m_off = (lzo_moff_t) PTR_DIFF(ip,m_pos)) <= 0 || \
+ m_off > max_offset) ))
+
+#if defined(LZO_DETERMINISTIC)
+# define LZO_CHECK_MPOS LZO_CHECK_MPOS_DET
+#else
+# define LZO_CHECK_MPOS LZO_CHECK_MPOS_NON_DET
+#endif
+#endif
+#endif
+#endif
+#define DO_COMPRESS lzo1x_1_compress
+static
+lzo_uint do_compress(const lzo_byte * in, lzo_uint in_len,
+ lzo_byte * out, lzo_uintp out_len, lzo_voidp wrkmem)
+{
+ register const lzo_byte *ip;
+ lzo_byte *op;
+ const lzo_byte *const in_end = in + in_len;
+ const lzo_byte *const ip_end = in + in_len - M2_MAX_LEN - 5;
+ const lzo_byte *ii;
+ lzo_dict_p const dict = (lzo_dict_p) wrkmem;
+
+ op = out;
+ ip = in;
+ ii = ip;
+
+ ip += 4;
+ for (;;) {
+ register const lzo_byte *m_pos;
+
+ lzo_moff_t m_off;
+ lzo_uint m_len;
+ lzo_uint dindex;
+
+ DINDEX1(dindex, ip);
+ GINDEX(m_pos, m_off, dict, dindex, in);
+ if (LZO_CHECK_MPOS_NON_DET(m_pos, m_off, in, ip, M4_MAX_OFFSET))
+ goto literal;
+#if 1
+ if (m_off <= M2_MAX_OFFSET || m_pos[3] == ip[3])
+ goto try_match;
+ DINDEX2(dindex, ip);
+#endif
+ GINDEX(m_pos, m_off, dict, dindex, in);
+ if (LZO_CHECK_MPOS_NON_DET(m_pos, m_off, in, ip, M4_MAX_OFFSET))
+ goto literal;
+ if (m_off <= M2_MAX_OFFSET || m_pos[3] == ip[3])
+ goto try_match;
+ goto literal;
+
+ try_match:
+#if 1 && defined(LZO_UNALIGNED_OK_2)
+ if (*(const lzo_ushortp)m_pos != *(const lzo_ushortp)ip) {
+#else
+ if (m_pos[0] != ip[0] || m_pos[1] != ip[1]) {
+#endif
+ ;
+ } else {
+ if (m_pos[2] == ip[2]) {
+ goto match;
+ } else {
+ ;
+ }
+ }
+
+ literal:
+ UPDATE_I(dict, 0, dindex, ip, in);
+ ++ip;
+ if (ip >= ip_end)
+ break;
+ continue;
+
+ match:
+ UPDATE_I(dict, 0, dindex, ip, in);
+ if (pd(ip, ii) > 0) {
+ register lzo_uint t = pd(ip, ii);
+
+ if (t <= 3) {
+ assert("lzo-04", op - 2 > out);
+ op[-2] |= LZO_BYTE(t);
+ } else if (t <= 18)
+ *op++ = LZO_BYTE(t - 3);
+ else {
+ register lzo_uint tt = t - 18;
+
+ *op++ = 0;
+ while (tt > 255) {
+ tt -= 255;
+ *op++ = 0;
+ }
+ assert("lzo-05", tt > 0);
+ *op++ = LZO_BYTE(tt);
+ }
+ do
+ *op++ = *ii++;
+ while (--t > 0);
+ }
+
+ assert("lzo-06", ii == ip);
+ ip += 3;
+ if (m_pos[3] != *ip++ || m_pos[4] != *ip++ || m_pos[5] != *ip++
+ || m_pos[6] != *ip++ || m_pos[7] != *ip++
+ || m_pos[8] != *ip++
+#ifdef LZO1Y
+ || m_pos[9] != *ip++ || m_pos[10] != *ip++
+ || m_pos[11] != *ip++ || m_pos[12] != *ip++
+ || m_pos[13] != *ip++ || m_pos[14] != *ip++
+#endif
+ ) {
+ --ip;
+ m_len = ip - ii;
+ assert("lzo-07", m_len >= 3);
+ assert("lzo-08", m_len <= M2_MAX_LEN);
+
+ if (m_off <= M2_MAX_OFFSET) {
+ m_off -= 1;
+#if defined(LZO1X)
+ *op++ =
+ LZO_BYTE(((m_len -
+ 1) << 5) | ((m_off & 7) << 2));
+ *op++ = LZO_BYTE(m_off >> 3);
+#elif defined(LZO1Y)
+ *op++ =
+ LZO_BYTE(((m_len +
+ 1) << 4) | ((m_off & 3) << 2));
+ *op++ = LZO_BYTE(m_off >> 2);
+#endif
+ } else if (m_off <= M3_MAX_OFFSET) {
+ m_off -= 1;
+ *op++ = LZO_BYTE(M3_MARKER | (m_len - 2));
+ goto m3_m4_offset;
+ } else
+#if defined(LZO1X)
+ {
+ m_off -= 0x4000;
+ assert("lzo-09", m_off > 0);
+ assert("lzo-10", m_off <= 0x7fff);
+ *op++ = LZO_BYTE(M4_MARKER |
+ ((m_off & 0x4000) >> 11) |
+ (m_len - 2));
+ goto m3_m4_offset;
+ }
+#elif defined(LZO1Y)
+ goto m4_match;
+#endif
+ } else {
+ {
+ const lzo_byte *end = in_end;
+ const lzo_byte *m = m_pos + M2_MAX_LEN + 1;
+ while (ip < end && *m == *ip)
+ m++, ip++;
+ m_len = (ip - ii);
+ }
+ assert("lzo-11", m_len > M2_MAX_LEN);
+
+ if (m_off <= M3_MAX_OFFSET) {
+ m_off -= 1;
+ if (m_len <= 33)
+ *op++ =
+ LZO_BYTE(M3_MARKER | (m_len - 2));
+ else {
+ m_len -= 33;
+ *op++ = M3_MARKER | 0;
+ goto m3_m4_len;
+ }
+ } else {
+#if defined(LZO1Y)
+ m4_match:
+#endif
+ m_off -= 0x4000;
+ assert("lzo-12", m_off > 0);
+ assert("lzo-13", m_off <= 0x7fff);
+ if (m_len <= M4_MAX_LEN)
+ *op++ = LZO_BYTE(M4_MARKER |
+ ((m_off & 0x4000) >>
+ 11) | (m_len - 2));
+ else {
+ m_len -= M4_MAX_LEN;
+ *op++ =
+ LZO_BYTE(M4_MARKER |
+ ((m_off & 0x4000) >> 11));
+ m3_m4_len:
+ while (m_len > 255) {
+ m_len -= 255;
+ *op++ = 0;
+ }
+ assert("lzo-14", m_len > 0);
+ *op++ = LZO_BYTE(m_len);
+ }
+ }
+
+ m3_m4_offset:
+ *op++ = LZO_BYTE((m_off & 63) << 2);
+ *op++ = LZO_BYTE(m_off >> 6);
+ }
+
+ ii = ip;
+ if (ip >= ip_end)
+ break;
+ }
+
+ *out_len = op - out;
+ return pd(in_end, ii);
+}
+
+int DO_COMPRESS(const lzo_byte * in, lzo_uint in_len,
+ lzo_byte * out, lzo_uintp out_len, lzo_voidp wrkmem)
+{
+ lzo_byte *op = out;
+ lzo_uint t;
+
+#if defined(__LZO_QUERY_COMPRESS)
+ if (__LZO_IS_COMPRESS_QUERY(in, in_len, out, out_len, wrkmem))
+ return __LZO_QUERY_COMPRESS(in, in_len, out, out_len, wrkmem,
+ D_SIZE, lzo_sizeof(lzo_dict_t));
+#endif
+
+ if (in_len <= M2_MAX_LEN + 5)
+ t = in_len;
+ else {
+ t = do_compress(in, in_len, op, out_len, wrkmem);
+ op += *out_len;
+ }
+
+ if (t > 0) {
+ const lzo_byte *ii = in + in_len - t;
+
+ if (op == out && t <= 238)
+ *op++ = LZO_BYTE(17 + t);
+ else if (t <= 3)
+ op[-2] |= LZO_BYTE(t);
+ else if (t <= 18)
+ *op++ = LZO_BYTE(t - 3);
+ else {
+ lzo_uint tt = t - 18;
+
+ *op++ = 0;
+ while (tt > 255) {
+ tt -= 255;
+ *op++ = 0;
+ }
+ assert("lzo-15", tt > 0);
+ *op++ = LZO_BYTE(tt);
+ }
+ do
+ *op++ = *ii++;
+ while (--t > 0);
+ }
+
+ *op++ = M4_MARKER | 1;
+ *op++ = 0;
+ *op++ = 0;
+
+ *out_len = op - out;
+ return LZO_E_OK;
+}
+
+#undef do_compress
+#undef DO_COMPRESS
+#undef LZO_HASH
+
+#undef LZO_TEST_DECOMPRESS_OVERRUN
+#undef LZO_TEST_DECOMPRESS_OVERRUN_INPUT
+#undef LZO_TEST_DECOMPRESS_OVERRUN_OUTPUT
+#undef LZO_TEST_DECOMPRESS_OVERRUN_LOOKBEHIND
+#undef DO_DECOMPRESS
+#define DO_DECOMPRESS lzo1x_decompress
+
+#if defined(LZO_TEST_DECOMPRESS_OVERRUN)
+# if !defined(LZO_TEST_DECOMPRESS_OVERRUN_INPUT)
+# define LZO_TEST_DECOMPRESS_OVERRUN_INPUT 2
+# endif
+# if !defined(LZO_TEST_DECOMPRESS_OVERRUN_OUTPUT)
+# define LZO_TEST_DECOMPRESS_OVERRUN_OUTPUT 2
+# endif
+# if !defined(LZO_TEST_DECOMPRESS_OVERRUN_LOOKBEHIND)
+# define LZO_TEST_DECOMPRESS_OVERRUN_LOOKBEHIND
+# endif
+#endif
+
+#undef TEST_IP
+#undef TEST_OP
+#undef TEST_LOOKBEHIND
+#undef NEED_IP
+#undef NEED_OP
+#undef HAVE_TEST_IP
+#undef HAVE_TEST_OP
+#undef HAVE_NEED_IP
+#undef HAVE_NEED_OP
+#undef HAVE_ANY_IP
+#undef HAVE_ANY_OP
+
+#if defined(LZO_TEST_DECOMPRESS_OVERRUN_INPUT)
+# if (LZO_TEST_DECOMPRESS_OVERRUN_INPUT >= 1)
+# define TEST_IP (ip < ip_end)
+# endif
+# if (LZO_TEST_DECOMPRESS_OVERRUN_INPUT >= 2)
+# define NEED_IP(x) \
+ if ((lzo_uint)(ip_end - ip) < (lzo_uint)(x)) goto input_overrun
+# endif
+#endif
+
+#if defined(LZO_TEST_DECOMPRESS_OVERRUN_OUTPUT)
+# if (LZO_TEST_DECOMPRESS_OVERRUN_OUTPUT >= 1)
+# define TEST_OP (op <= op_end)
+# endif
+# if (LZO_TEST_DECOMPRESS_OVERRUN_OUTPUT >= 2)
+# undef TEST_OP
+# define NEED_OP(x) \
+ if ((lzo_uint)(op_end - op) < (lzo_uint)(x)) goto output_overrun
+# endif
+#endif
+
+#if defined(LZO_TEST_DECOMPRESS_OVERRUN_LOOKBEHIND)
+# define TEST_LOOKBEHIND(m_pos,out) if (m_pos < out) goto lookbehind_overrun
+#else
+# define TEST_LOOKBEHIND(m_pos,op) ((void) 0)
+#endif
+
+#if !defined(LZO_EOF_CODE) && !defined(TEST_IP)
+# define TEST_IP (ip < ip_end)
+#endif
+
+#if defined(TEST_IP)
+# define HAVE_TEST_IP
+#else
+# define TEST_IP 1
+#endif
+#if defined(TEST_OP)
+# define HAVE_TEST_OP
+#else
+# define TEST_OP 1
+#endif
+
+#if defined(NEED_IP)
+# define HAVE_NEED_IP
+#else
+# define NEED_IP(x) ((void) 0)
+#endif
+#if defined(NEED_OP)
+# define HAVE_NEED_OP
+#else
+# define NEED_OP(x) ((void) 0)
+#endif
+
+#if defined(HAVE_TEST_IP) || defined(HAVE_NEED_IP)
+# define HAVE_ANY_IP
+#endif
+#if defined(HAVE_TEST_OP) || defined(HAVE_NEED_OP)
+# define HAVE_ANY_OP
+#endif
+
+#undef __COPY4
+#define __COPY4(dst,src) * (lzo_uint32p)(dst) = * (const lzo_uint32p)(src)
+
+#undef COPY4
+#if defined(LZO_UNALIGNED_OK_4)
+# define COPY4(dst,src) __COPY4(dst,src)
+#elif defined(LZO_ALIGNED_OK_4)
+# define COPY4(dst,src) __COPY4((lzo_ptr_t)(dst),(lzo_ptr_t)(src))
+#endif
+
+#if defined(DO_DECOMPRESS)
+int DO_DECOMPRESS(const lzo_byte * in, lzo_uint in_len,
+ lzo_byte * out, lzo_uintp out_len, lzo_voidp wrkmem)
+#endif
+{
+ register lzo_byte *op;
+ register const lzo_byte *ip;
+ register lzo_uint t;
+#if defined(COPY_DICT)
+ lzo_uint m_off;
+ const lzo_byte *dict_end;
+#else
+ register const lzo_byte *m_pos;
+#endif
+
+ const lzo_byte *const ip_end = in + in_len;
+#if defined(HAVE_ANY_OP)
+ lzo_byte *const op_end = out + *out_len;
+#endif
+#if defined(LZO1Z)
+ lzo_uint last_m_off = 0;
+#endif
+
+ LZO_UNUSED(wrkmem);
+
+#if defined(__LZO_QUERY_DECOMPRESS)
+ if (__LZO_IS_DECOMPRESS_QUERY(in, in_len, out, out_len, wrkmem))
+ return __LZO_QUERY_DECOMPRESS(in, in_len, out, out_len, wrkmem,
+ 0, 0);
+#endif
+
+#if defined(COPY_DICT)
+ if (dict) {
+ if (dict_len > M4_MAX_OFFSET) {
+ dict += dict_len - M4_MAX_OFFSET;
+ dict_len = M4_MAX_OFFSET;
+ }
+ dict_end = dict + dict_len;
+ } else {
+ dict_len = 0;
+ dict_end = NULL;
+ }
+#endif
+
+ *out_len = 0;
+
+ op = out;
+ ip = in;
+
+ if (*ip > 17) {
+ t = *ip++ - 17;
+ if (t < 4)
+ goto match_next;
+ assert("lzo-16", t > 0);
+ NEED_OP(t);
+ NEED_IP(t + 1);
+ do
+ *op++ = *ip++;
+ while (--t > 0);
+ goto first_literal_run;
+ }
+
+ while (TEST_IP && TEST_OP) {
+ t = *ip++;
+ if (t >= 16)
+ goto match;
+ if (t == 0) {
+ NEED_IP(1);
+ while (*ip == 0) {
+ t += 255;
+ ip++;
+ NEED_IP(1);
+ }
+ t += 15 + *ip++;
+ }
+ assert("lzo-17", t > 0);
+ NEED_OP(t + 3);
+ NEED_IP(t + 4);
+#if defined(LZO_UNALIGNED_OK_4) || defined(LZO_ALIGNED_OK_4)
+#if !defined(LZO_UNALIGNED_OK_4)
+ if (PTR_ALIGNED2_4(op, ip)) {
+#endif
+ COPY4(op, ip);
+ op += 4;
+ ip += 4;
+ if (--t > 0) {
+ if (t >= 4) {
+ do {
+ COPY4(op, ip);
+ op += 4;
+ ip += 4;
+ t -= 4;
+ } while (t >= 4);
+ if (t > 0)
+ do
+ *op++ = *ip++;
+ while (--t > 0);
+ } else
+ do
+ *op++ = *ip++;
+ while (--t > 0);
+ }
+#if !defined(LZO_UNALIGNED_OK_4)
+ } else
+#endif
+#endif
+#if !defined(LZO_UNALIGNED_OK_4)
+ {
+ *op++ = *ip++;
+ *op++ = *ip++;
+ *op++ = *ip++;
+ do
+ *op++ = *ip++;
+ while (--t > 0);
+ }
+#endif
+
+ first_literal_run:
+
+ t = *ip++;
+ if (t >= 16)
+ goto match;
+#if defined(COPY_DICT)
+#if defined(LZO1Z)
+ m_off = (1 + M2_MAX_OFFSET) + (t << 6) + (*ip++ >> 2);
+ last_m_off = m_off;
+#else
+ m_off = (1 + M2_MAX_OFFSET) + (t >> 2) + (*ip++ << 2);
+#endif
+ NEED_OP(3);
+ t = 3;
+ COPY_DICT(t, m_off)
+#else
+#if defined(LZO1Z)
+ t = (1 + M2_MAX_OFFSET) + (t << 6) + (*ip++ >> 2);
+ m_pos = op - t;
+ last_m_off = t;
+#else
+ m_pos = op - (1 + M2_MAX_OFFSET);
+ m_pos -= t >> 2;
+ m_pos -= *ip++ << 2;
+#endif
+ TEST_LOOKBEHIND(m_pos, out);
+ NEED_OP(3);
+ *op++ = *m_pos++;
+ *op++ = *m_pos++;
+ *op++ = *m_pos;
+#endif
+ goto match_done;
+
+ while (TEST_IP && TEST_OP) {
+ match:
+ if (t >= 64) {
+#if defined(COPY_DICT)
+#if defined(LZO1X)
+ m_off = 1 + ((t >> 2) & 7) + (*ip++ << 3);
+ t = (t >> 5) - 1;
+#elif defined(LZO1Y)
+ m_off = 1 + ((t >> 2) & 3) + (*ip++ << 2);
+ t = (t >> 4) - 3;
+#elif defined(LZO1Z)
+ m_off = t & 0x1f;
+ if (m_off >= 0x1c)
+ m_off = last_m_off;
+ else {
+ m_off = 1 + (m_off << 6) + (*ip++ >> 2);
+ last_m_off = m_off;
+ }
+ t = (t >> 5) - 1;
+#endif
+#else
+#if defined(LZO1X)
+ m_pos = op - 1;
+ m_pos -= (t >> 2) & 7;
+ m_pos -= *ip++ << 3;
+ t = (t >> 5) - 1;
+#elif defined(LZO1Y)
+ m_pos = op - 1;
+ m_pos -= (t >> 2) & 3;
+ m_pos -= *ip++ << 2;
+ t = (t >> 4) - 3;
+#elif defined(LZO1Z)
+ {
+ lzo_uint off = t & 0x1f;
+ m_pos = op;
+ if (off >= 0x1c) {
+ assert(last_m_off > 0);
+ m_pos -= last_m_off;
+ } else {
+ off =
+ 1 + (off << 6) +
+ (*ip++ >> 2);
+ m_pos -= off;
+ last_m_off = off;
+ }
+ }
+ t = (t >> 5) - 1;
+#endif
+ TEST_LOOKBEHIND(m_pos, out);
+ assert("lzo-18", t > 0);
+ NEED_OP(t + 3 - 1);
+ goto copy_match;
+#endif
+ } else if (t >= 32) {
+ t &= 31;
+ if (t == 0) {
+ NEED_IP(1);
+ while (*ip == 0) {
+ t += 255;
+ ip++;
+ NEED_IP(1);
+ }
+ t += 31 + *ip++;
+ }
+#if defined(COPY_DICT)
+#if defined(LZO1Z)
+ m_off = 1 + (ip[0] << 6) + (ip[1] >> 2);
+ last_m_off = m_off;
+#else
+ m_off = 1 + (ip[0] >> 2) + (ip[1] << 6);
+#endif
+#else
+#if defined(LZO1Z)
+ {
+ lzo_uint off =
+ 1 + (ip[0] << 6) + (ip[1] >> 2);
+ m_pos = op - off;
+ last_m_off = off;
+ }
+#elif defined(LZO_UNALIGNED_OK_2) && (LZO_BYTE_ORDER == LZO_LITTLE_ENDIAN)
+ m_pos = op - 1;
+ m_pos -= (*(const lzo_ushortp)ip) >> 2;
+#else
+ m_pos = op - 1;
+ m_pos -= (ip[0] >> 2) + (ip[1] << 6);
+#endif
+#endif
+ ip += 2;
+ } else if (t >= 16) {
+#if defined(COPY_DICT)
+ m_off = (t & 8) << 11;
+#else
+ m_pos = op;
+ m_pos -= (t & 8) << 11;
+#endif
+ t &= 7;
+ if (t == 0) {
+ NEED_IP(1);
+ while (*ip == 0) {
+ t += 255;
+ ip++;
+ NEED_IP(1);
+ }
+ t += 7 + *ip++;
+ }
+#if defined(COPY_DICT)
+#if defined(LZO1Z)
+ m_off += (ip[0] << 6) + (ip[1] >> 2);
+#else
+ m_off += (ip[0] >> 2) + (ip[1] << 6);
+#endif
+ ip += 2;
+ if (m_off == 0)
+ goto eof_found;
+ m_off += 0x4000;
+#if defined(LZO1Z)
+ last_m_off = m_off;
+#endif
+#else
+#if defined(LZO1Z)
+ m_pos -= (ip[0] << 6) + (ip[1] >> 2);
+#elif defined(LZO_UNALIGNED_OK_2) && (LZO_BYTE_ORDER == LZO_LITTLE_ENDIAN)
+ m_pos -= (*(const lzo_ushortp)ip) >> 2;
+#else
+ m_pos -= (ip[0] >> 2) + (ip[1] << 6);
+#endif
+ ip += 2;
+ if (m_pos == op)
+ goto eof_found;
+ m_pos -= 0x4000;
+#if defined(LZO1Z)
+ last_m_off = op - m_pos;
+#endif
+#endif
+ } else {
+#if defined(COPY_DICT)
+#if defined(LZO1Z)
+ m_off = 1 + (t << 6) + (*ip++ >> 2);
+ last_m_off = m_off;
+#else
+ m_off = 1 + (t >> 2) + (*ip++ << 2);
+#endif
+ NEED_OP(2);
+ t = 2;
+ COPY_DICT(t, m_off)
+#else
+#if defined(LZO1Z)
+ t = 1 + (t << 6) + (*ip++ >> 2);
+ m_pos = op - t;
+ last_m_off = t;
+#else
+ m_pos = op - 1;
+ m_pos -= t >> 2;
+ m_pos -= *ip++ << 2;
+#endif
+ TEST_LOOKBEHIND(m_pos, out);
+ NEED_OP(2);
+ *op++ = *m_pos++;
+ *op++ = *m_pos;
+#endif
+ goto match_done;
+ }
+
+#if defined(COPY_DICT)
+
+ NEED_OP(t + 3 - 1);
+ t += 3 - 1;
+ COPY_DICT(t, m_off)
+#else
+
+ TEST_LOOKBEHIND(m_pos, out);
+ assert("lzo-19", t > 0);
+ NEED_OP(t + 3 - 1);
+#if defined(LZO_UNALIGNED_OK_4) || defined(LZO_ALIGNED_OK_4)
+#if !defined(LZO_UNALIGNED_OK_4)
+ if (t >= 2 * 4 - (3 - 1) && PTR_ALIGNED2_4(op, m_pos)) {
+ assert((op - m_pos) >= 4);
+#else
+ if (t >= 2 * 4 - (3 - 1) && (op - m_pos) >= 4) {
+#endif
+ COPY4(op, m_pos);
+ op += 4;
+ m_pos += 4;
+ t -= 4 - (3 - 1);
+ do {
+ COPY4(op, m_pos);
+ op += 4;
+ m_pos += 4;
+ t -= 4;
+ } while (t >= 4);
+ if (t > 0)
+ do
+ *op++ = *m_pos++;
+ while (--t > 0);
+ } else
+#endif
+ {
+ copy_match:
+ *op++ = *m_pos++;
+ *op++ = *m_pos++;
+ do
+ *op++ = *m_pos++;
+ while (--t > 0);
+ }
+
+#endif
+
+ match_done:
+#if defined(LZO1Z)
+ t = ip[-1] & 3;
+#else
+ t = ip[-2] & 3;
+#endif
+ if (t == 0)
+ break;
+
+ match_next:
+ assert("lzo-20", t > 0);
+ NEED_OP(t);
+ NEED_IP(t + 1);
+ do
+ *op++ = *ip++;
+ while (--t > 0);
+ t = *ip++;
+ }
+ }
+
+#if defined(HAVE_TEST_IP) || defined(HAVE_TEST_OP)
+ *out_len = op - out;
+ return LZO_E_EOF_NOT_FOUND;
+#endif
+
+ eof_found:
+ assert("lzo-21", t == 1);
+ *out_len = op - out;
+ return (ip == ip_end ? LZO_E_OK :
+ (ip < ip_end ? LZO_E_INPUT_NOT_CONSUMED : LZO_E_INPUT_OVERRUN));
+
+#if defined(HAVE_NEED_IP)
+ input_overrun:
+ *out_len = op - out;
+ return LZO_E_INPUT_OVERRUN;
+#endif
+
+#if defined(HAVE_NEED_OP)
+ output_overrun:
+ *out_len = op - out;
+ return LZO_E_OUTPUT_OVERRUN;
+#endif
+
+#if defined(LZO_TEST_DECOMPRESS_OVERRUN_LOOKBEHIND)
+ lookbehind_overrun:
+ *out_len = op - out;
+ return LZO_E_LOOKBEHIND_OVERRUN;
+#endif
+}
+
+#define LZO_TEST_DECOMPRESS_OVERRUN
+#undef DO_DECOMPRESS
+#define DO_DECOMPRESS lzo1x_decompress_safe
+
+#if defined(LZO_TEST_DECOMPRESS_OVERRUN)
+# if !defined(LZO_TEST_DECOMPRESS_OVERRUN_INPUT)
+# define LZO_TEST_DECOMPRESS_OVERRUN_INPUT 2
+# endif
+# if !defined(LZO_TEST_DECOMPRESS_OVERRUN_OUTPUT)
+# define LZO_TEST_DECOMPRESS_OVERRUN_OUTPUT 2
+# endif
+# if !defined(LZO_TEST_DECOMPRESS_OVERRUN_LOOKBEHIND)
+# define LZO_TEST_DECOMPRESS_OVERRUN_LOOKBEHIND
+# endif
+#endif
+
+#undef TEST_IP
+#undef TEST_OP
+#undef TEST_LOOKBEHIND
+#undef NEED_IP
+#undef NEED_OP
+#undef HAVE_TEST_IP
+#undef HAVE_TEST_OP
+#undef HAVE_NEED_IP
+#undef HAVE_NEED_OP
+#undef HAVE_ANY_IP
+#undef HAVE_ANY_OP
+
+#if defined(LZO_TEST_DECOMPRESS_OVERRUN_INPUT)
+# if (LZO_TEST_DECOMPRESS_OVERRUN_INPUT >= 1)
+# define TEST_IP (ip < ip_end)
+# endif
+# if (LZO_TEST_DECOMPRESS_OVERRUN_INPUT >= 2)
+# define NEED_IP(x) \
+ if ((lzo_uint)(ip_end - ip) < (lzo_uint)(x)) goto input_overrun
+# endif
+#endif
+
+#if defined(LZO_TEST_DECOMPRESS_OVERRUN_OUTPUT)
+# if (LZO_TEST_DECOMPRESS_OVERRUN_OUTPUT >= 1)
+# define TEST_OP (op <= op_end)
+# endif
+# if (LZO_TEST_DECOMPRESS_OVERRUN_OUTPUT >= 2)
+# undef TEST_OP
+# define NEED_OP(x) \
+ if ((lzo_uint)(op_end - op) < (lzo_uint)(x)) goto output_overrun
+# endif
+#endif
+
+#if defined(LZO_TEST_DECOMPRESS_OVERRUN_LOOKBEHIND)
+# define TEST_LOOKBEHIND(m_pos,out) if (m_pos < out) goto lookbehind_overrun
+#else
+# define TEST_LOOKBEHIND(m_pos,op) ((void) 0)
+#endif
+
+#if !defined(LZO_EOF_CODE) && !defined(TEST_IP)
+# define TEST_IP (ip < ip_end)
+#endif
+
+#if defined(TEST_IP)
+# define HAVE_TEST_IP
+#else
+# define TEST_IP 1
+#endif
+#if defined(TEST_OP)
+# define HAVE_TEST_OP
+#else
+# define TEST_OP 1
+#endif
+
+#if defined(NEED_IP)
+# define HAVE_NEED_IP
+#else
+# define NEED_IP(x) ((void) 0)
+#endif
+#if defined(NEED_OP)
+# define HAVE_NEED_OP
+#else
+# define NEED_OP(x) ((void) 0)
+#endif
+
+#if defined(HAVE_TEST_IP) || defined(HAVE_NEED_IP)
+# define HAVE_ANY_IP
+#endif
+#if defined(HAVE_TEST_OP) || defined(HAVE_NEED_OP)
+# define HAVE_ANY_OP
+#endif
+
+#undef __COPY4
+#define __COPY4(dst,src) * (lzo_uint32p)(dst) = * (const lzo_uint32p)(src)
+
+#undef COPY4
+#if defined(LZO_UNALIGNED_OK_4)
+# define COPY4(dst,src) __COPY4(dst,src)
+#elif defined(LZO_ALIGNED_OK_4)
+# define COPY4(dst,src) __COPY4((lzo_ptr_t)(dst),(lzo_ptr_t)(src))
+#endif
+
+/***** End of minilzo.c *****/
diff -urN --no-dereference linux-5.10.2.orig/fs/reiser4/plugin/compress/minilzo.h linux-5.10.2/fs/reiser4/plugin/compress/minilzo.h
--- linux-5.10.2.orig/fs/reiser4/plugin/compress/minilzo.h 1970-01-01 01:00:00.000000000 +0100
+++ linux-5.10.2/fs/reiser4/plugin/compress/minilzo.h 2020-12-23 16:07:46.121813173 +0100
@@ -0,0 +1,70 @@
+/* minilzo.h -- mini subset of the LZO real-time data compression library
+ adopted for reiser4 compression transform plugin.
+
+ This file is part of the LZO real-time data compression library
+ and not included in any proprietary licenses of reiser4.
+
+ Copyright (C) 2002 Markus Franz Xaver Johannes Oberhumer
+ Copyright (C) 2001 Markus Franz Xaver Johannes Oberhumer
+ Copyright (C) 2000 Markus Franz Xaver Johannes Oberhumer
+ Copyright (C) 1999 Markus Franz Xaver Johannes Oberhumer
+ Copyright (C) 1998 Markus Franz Xaver Johannes Oberhumer
+ Copyright (C) 1997 Markus Franz Xaver Johannes Oberhumer
+ Copyright (C) 1996 Markus Franz Xaver Johannes Oberhumer
+ All Rights Reserved.
+
+ The LZO library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU General Public License as
+ published by the Free Software Foundation; either version 2 of
+ the License, or (at your option) any later version.
+
+ The LZO library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with the LZO library; see the file COPYING.
+ If not, write to the Free Software Foundation, Inc.,
+ 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+
+ Markus F.X.J. Oberhumer
+ <markus@oberhumer.com>
+ http://www.oberhumer.com/opensource/lzo/
+ */
+
+/*
+ * NOTE:
+ * the full LZO package can be found at
+ * http://www.oberhumer.com/opensource/lzo/
+ */
+
+#ifndef __MINILZO_H
+#define __MINILZO_H
+
+#define MINILZO_VERSION 0x1080
+
+#include "lzoconf.h"
+
+/* Memory required for the wrkmem parameter.
+ * When the required size is 0, you can also pass a NULL pointer.
+ */
+
+#define LZO1X_MEM_COMPRESS LZO1X_1_MEM_COMPRESS
+#define LZO1X_1_MEM_COMPRESS ((lzo_uint32) (16384L * lzo_sizeof_dict_t))
+#define LZO1X_MEM_DECOMPRESS (0)
+
+/* compression */
+extern int lzo1x_1_compress(const lzo_byte * src, lzo_uint src_len,
+ lzo_byte * dst, lzo_uintp dst_len,
+ lzo_voidp wrkmem);
+/* decompression */
+extern int lzo1x_decompress(const lzo_byte * src, lzo_uint src_len,
+ lzo_byte * dst, lzo_uintp dst_len,
+ lzo_voidp wrkmem /* NOT USED */);
+/* safe decompression with overrun testing */
+extern int lzo1x_decompress_safe(const lzo_byte * src, lzo_uint src_len,
+ lzo_byte * dst, lzo_uintp dst_len,
+ lzo_voidp wrkmem /* NOT USED */ );
+
+#endif /* already included */
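
For context, a minimal usage sketch of the API declared above follows. It assumes a kernel caller that owns its own buffers; the worst-case output bound (src_len + src_len/16 + 64 + 3), the vmalloc-based allocation and the helper name are illustrative assumptions, not taken from this header. Compression needs LZO1X_1_MEM_COMPRESS bytes of work memory; decompression needs none.

/* Hedged sketch: compress a buffer with lzo1x_1_compress() and verify that it
 * round-trips through lzo1x_decompress_safe(). Buffer sizing and allocation
 * strategy are assumptions for illustration only. */
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/vmalloc.h>
#include "minilzo.h"

static int lzo_roundtrip_example(const lzo_byte *src, lzo_uint src_len)
{
	/* commonly used worst-case growth bound for LZO1X output */
	lzo_uint dst_room = src_len + src_len / 16 + 64 + 3;
	lzo_uint dst_len = dst_room, back_len = src_len;
	lzo_byte *dst = vmalloc(dst_room);
	lzo_byte *back = vmalloc(src_len);
	lzo_voidp wrkmem = vmalloc(LZO1X_1_MEM_COMPRESS); /* work memory */
	int ret = -ENOMEM;

	if (!dst || !back || !wrkmem)
		goto out;
	ret = lzo1x_1_compress(src, src_len, dst, &dst_len, wrkmem);
	if (ret != LZO_E_OK)
		goto out;
	/* decompression needs no work memory: LZO1X_MEM_DECOMPRESS == 0 */
	ret = lzo1x_decompress_safe(dst, dst_len, back, &back_len, NULL);
	if (ret == LZO_E_OK &&
	    (back_len != src_len || memcmp(src, back, src_len)))
		ret = -EIO;
out:
	vfree(wrkmem);
	vfree(back);
	vfree(dst);
	return ret;
}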
diff -urN --no-dereference linux-5.10.2.orig/fs/reiser4/plugin/crypto/cipher.c linux-5.10.2/fs/reiser4/plugin/crypto/cipher.c
--- linux-5.10.2.orig/fs/reiser4/plugin/crypto/cipher.c 1970-01-01 01:00:00.000000000 +0100
+++ linux-5.10.2/fs/reiser4/plugin/crypto/cipher.c 2020-12-23 16:07:46.121813173 +0100
@@ -0,0 +1,37 @@
+/* Copyright 2001, 2002, 2003 by Hans Reiser,
+ licensing governed by reiser4/README */
+/* Reiser4 cipher transform plugins */
+
+#include "../../debug.h"
+#include "../plugin.h"
+
+cipher_plugin cipher_plugins[LAST_CIPHER_ID] = {
+ [NONE_CIPHER_ID] = {
+ .h = {
+ .type_id = REISER4_CIPHER_PLUGIN_TYPE,
+ .id = NONE_CIPHER_ID,
+ .pops = NULL,
+ .label = "none",
+ .desc = "no cipher transform",
+ .linkage = {NULL, NULL}
+ },
+ .alloc = NULL,
+ .free = NULL,
+ .scale = NULL,
+ .align_stream = NULL,
+ .setkey = NULL,
+ .encrypt = NULL,
+ .decrypt = NULL
+ }
+};
+
+/* Make Linus happy.
+ Local variables:
+ c-indentation-style: "K&R"
+ mode-name: "LC"
+ c-basic-offset: 8
+ tab-width: 8
+ fill-column: 120
+ scroll-step: 1
+ End:
+*/
diff -urN --no-dereference linux-5.10.2.orig/fs/reiser4/plugin/crypto/cipher.h linux-5.10.2/fs/reiser4/plugin/crypto/cipher.h
--- linux-5.10.2.orig/fs/reiser4/plugin/crypto/cipher.h 1970-01-01 01:00:00.000000000 +0100
+++ linux-5.10.2/fs/reiser4/plugin/crypto/cipher.h 2020-12-23 16:07:46.121813173 +0100
@@ -0,0 +1,55 @@
+/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
+/* This file contains definitions for the objects operated on
+   by the reiser4 key manager, which is something like a keyring
+   wrapped by an appropriate reiser4 plugin */
+
+#if !defined( __FS_REISER4_CRYPT_H__ )
+#define __FS_REISER4_CRYPT_H__
+
+#include <linux/crypto.h>
+
+/* key info imported from user space */
+struct reiser4_crypto_data {
+ int keysize; /* uninstantiated key size */
+ __u8 * key; /* uninstantiated key */
+ int keyid_size; /* size of passphrase */
+ __u8 * keyid; /* passphrase */
+};
+
+/* This object contains all the infrastructure needed to implement
+   a cipher transform. It is operated on (allocated, inherited,
+   validated, bound to a host inode, etc.) by the reiser4 key manager.
+
+   This info can be allocated in two cases:
+   1. importing a key from user space;
+   2. reading an inode from disk */
+struct reiser4_crypto_info {
+ struct inode * host;
+ struct crypto_hash * digest;
+ struct crypto_blkcipher * cipher;
+#if 0
+ cipher_key_plugin * kplug; /* key manager */
+#endif
+ __u8 * keyid; /* key fingerprint, created by digest plugin,
+ using uninstantiated key and passphrase.
+ supposed to be stored in disk stat-data */
+ int inst; /* this indicates if the cipher key is
+ instantiated (case 1 above) */
+ int keysize; /* uninstantiated key size (bytes), supposed
+ to be stored in disk stat-data */
+	int keyload_count; /* number of objects which have this
+			      crypto-stat attached */
+};
+
+#endif /* __FS_REISER4_CRYPT_H__ */
+
+/*
+ Local variables:
+ c-indentation-style: "K&R"
+ mode-name: "LC"
+ c-basic-offset: 8
+ tab-width: 8
+ fill-column: 120
+ scroll-step: 1
+ End:
+*/
diff -urN --no-dereference linux-5.10.2.orig/fs/reiser4/plugin/crypto/digest.c linux-5.10.2/fs/reiser4/plugin/crypto/digest.c
--- linux-5.10.2.orig/fs/reiser4/plugin/crypto/digest.c 1970-01-01 01:00:00.000000000 +0100
+++ linux-5.10.2/fs/reiser4/plugin/crypto/digest.c 2020-12-23 16:07:46.121813173 +0100
@@ -0,0 +1,58 @@
+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
+
+/* reiser4 digest transform plugin (is used by cryptcompress object plugin) */
+/* EDWARD-FIXME-HANS: and it does what? a digest is a what? */
+#include "../../debug.h"
+#include "../plugin_header.h"
+#include "../plugin.h"
+#include "../file/cryptcompress.h"
+
+#include <linux/types.h>
+
+extern digest_plugin digest_plugins[LAST_DIGEST_ID];
+
+static struct crypto_hash * alloc_sha256 (void)
+{
+#if REISER4_SHA256
+ return crypto_alloc_hash ("sha256", 0, CRYPTO_ALG_ASYNC);
+#else
+ warning("edward-1418", "sha256 unsupported");
+ return ERR_PTR(-EINVAL);
+#endif
+}
+
+static void free_sha256 (struct crypto_hash * tfm)
+{
+#if REISER4_SHA256
+ crypto_free_hash(tfm);
+#endif
+ return;
+}
+
+/* digest plugins */
+digest_plugin digest_plugins[LAST_DIGEST_ID] = {
+ [SHA256_32_DIGEST_ID] = {
+ .h = {
+ .type_id = REISER4_DIGEST_PLUGIN_TYPE,
+ .id = SHA256_32_DIGEST_ID,
+ .pops = NULL,
+ .label = "sha256_32",
+ .desc = "sha256_32 digest transform",
+ .linkage = {NULL, NULL}
+ },
+ .fipsize = sizeof(__u32),
+ .alloc = alloc_sha256,
+ .free = free_sha256
+ }
+};
+
+/*
+ Local variables:
+ c-indentation-style: "K&R"
+ mode-name: "LC"
+ c-basic-offset: 8
+ tab-width: 8
+ fill-column: 120
+ scroll-step: 1
+ End:
+*/
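
The plugin above only allocates and frees the transform; producing an actual fingerprint is left to the cryptcompress code. A hedged sketch of driving such a transform through the legacy crypto_hash interface (the one this file is written against; newer kernels replaced it with shash/ahash) is shown below. The helper name and surrounding buffer handling are assumptions; the 4-byte truncation mirrors the SHA256_32_DIGEST_ID fipsize.

/* Hedged sketch: compute a sha256 fingerprint with the legacy crypto_hash
 * API that digest.c is written against. Only the first fipsize bytes
 * (sizeof(__u32) for SHA256_32_DIGEST_ID) are kept. Helper name and callers
 * are illustrative. */
#include <linux/crypto.h>
#include <linux/scatterlist.h>
#include <linux/string.h>

static int sha256_32_fingerprint_example(struct crypto_hash *tfm,
					 const void *data, unsigned int len,
					 __u32 *fip)
{
	struct hash_desc desc = { .tfm = tfm, .flags = 0 };
	struct scatterlist sg;
	u8 out[32]; /* full sha256 digest */
	int ret;

	sg_init_one(&sg, data, len);
	ret = crypto_hash_init(&desc);
	if (ret)
		return ret;
	ret = crypto_hash_update(&desc, &sg, len);
	if (ret)
		return ret;
	ret = crypto_hash_final(&desc, out);
	if (ret)
		return ret;
	memcpy(fip, out, sizeof(*fip)); /* keep the 32-bit fingerprint */
	return 0;
}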
diff -urN --no-dereference linux-5.10.2.orig/fs/reiser4/plugin/dir/dir.h linux-5.10.2/fs/reiser4/plugin/dir/dir.h
--- linux-5.10.2.orig/fs/reiser4/plugin/dir/dir.h 1970-01-01 01:00:00.000000000 +0100
+++ linux-5.10.2/fs/reiser4/plugin/dir/dir.h 2020-12-23 16:07:46.121813173 +0100
@@ -0,0 +1,38 @@
+/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by
+ * reiser4/README */
+
+/* this file contains declarations of methods implementing directory plugins */
+
+#if !defined( __REISER4_DIR_H__ )
+#define __REISER4_DIR_H__
+
+/*#include "../../key.h"
+
+#include <linux/fs.h>*/
+
+long reiser4_ioctl_dir_common(struct file *file, unsigned int cmd, unsigned long arg);
+
+/* declarations of functions implementing HASHED_DIR_PLUGIN_ID dir plugin */
+
+/* "hashed" directory methods of dir plugin */
+void build_entry_key_hashed(const struct inode *, const struct qstr *,
+ reiser4_key *);
+
+/* declarations of functions implementing SEEKABLE_HASHED_DIR_PLUGIN_ID dir plugin */
+
+/* "seekable" directory methods of dir plugin */
+void build_entry_key_seekable(const struct inode *, const struct qstr *,
+ reiser4_key *);
+
+/* __REISER4_DIR_H__ */
+#endif
+
+/*
+ Local variables:
+ c-indentation-style: "K&R"
+ mode-name: "LC"
+ c-basic-offset: 8
+ tab-width: 8
+ fill-column: 120
+ End:
+*/
diff -urN --no-dereference linux-5.10.2.orig/fs/reiser4/plugin/dir/hashed_dir.c linux-5.10.2/fs/reiser4/plugin/dir/hashed_dir.c
--- linux-5.10.2.orig/fs/reiser4/plugin/dir/hashed_dir.c 1970-01-01 01:00:00.000000000 +0100
+++ linux-5.10.2/fs/reiser4/plugin/dir/hashed_dir.c 2020-12-23 16:07:46.121813173 +0100
@@ -0,0 +1,81 @@
+/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by
+ * reiser4/README */
+
+/* Directory plugin using hashes (see fs/reiser4/plugin/hash.c) to map file
+ names to the files. */
+
+/*
+ * A hashed directory logically consists of persistent directory
+ * entries. A directory entry is a pair of a file name and the key of the
+ * stat-data of the file that has this name in the given directory.
+ *
+ * Directory entries are stored in the tree in the form of directory
+ * items. Directory item should implement dir_entry_ops portion of item plugin
+ * interface (see plugin/item/item.h). Hashed directory interacts with
+ * directory item plugin exclusively through dir_entry_ops operations.
+ *
+ * Currently there are two implementations of directory items: "simple
+ * directory item" (plugin/item/sde.[ch]), and "compound directory item"
+ * (plugin/item/cde.[ch]) with the latter being the default.
+ *
+ * There is, however, one delicate way in which the directory code interferes
+ * with the item plugin: key assignment policy. A key for a directory item is
+ * chosen by the directory code and, as described in kassign.c, contains a
+ * portion of the file name. The directory item uses this knowledge to avoid
+ * storing that portion of the file name twice: in the key and in the item body.
+ *
+ */
+
+#include "../../inode.h"
+
+void complete_entry_key(const struct inode *, const char *name,
+ int len, reiser4_key * result);
+
+/* this is implementation of build_entry_key method of dir
+ plugin for HASHED_DIR_PLUGIN_ID
+ */
+void build_entry_key_hashed(const struct inode *dir, /* directory where entry is
+ * (or will be) in.*/
+ const struct qstr *qname, /* name of file referenced
+ * by this entry */
+ reiser4_key * result /* resulting key of directory
+ * entry */ )
+{
+ const char *name;
+ int len;
+
+ assert("nikita-1139", dir != NULL);
+ assert("nikita-1140", qname != NULL);
+ assert("nikita-1141", qname->name != NULL);
+ assert("nikita-1142", result != NULL);
+
+ name = qname->name;
+ len = qname->len;
+
+ assert("nikita-2867", strlen(name) == len);
+
+ reiser4_key_init(result);
+ /* locality of directory entry's key is objectid of parent
+ directory */
+ set_key_locality(result, get_inode_oid(dir));
+ /* minor packing locality is constant */
+ set_key_type(result, KEY_FILE_NAME_MINOR);
+	/* dot is a special case---we always want it to be the first entry
+	   in a directory. Actually, we just want it to be the smallest
+	   directory entry.
+ */
+ if (len == 1 && name[0] == '.')
+ return;
+
+ /* initialize part of entry key which depends on file name */
+ complete_entry_key(dir, name, len, result);
+}
+
+/* Local variables:
+ c-indentation-style: "K&R"
+ mode-name: "LC"
+ c-basic-offset: 8
+ tab-width: 8
+ fill-column: 120
+ End:
+*/
diff -urN --no-dereference linux-5.10.2.orig/fs/reiser4/plugin/dir/Makefile linux-5.10.2/fs/reiser4/plugin/dir/Makefile
--- linux-5.10.2.orig/fs/reiser4/plugin/dir/Makefile 1970-01-01 01:00:00.000000000 +0100
+++ linux-5.10.2/fs/reiser4/plugin/dir/Makefile 2020-12-23 16:07:46.121813173 +0100
@@ -0,0 +1,5 @@
+obj-$(CONFIG_REISER4_FS) += dir_plugins.o
+
+dir_plugins-objs := \
+ hashed_dir.o \
+ seekable_dir.o
diff -urN --no-dereference linux-5.10.2.orig/fs/reiser4/plugin/dir/seekable_dir.c linux-5.10.2/fs/reiser4/plugin/dir/seekable_dir.c
--- linux-5.10.2.orig/fs/reiser4/plugin/dir/seekable_dir.c 1970-01-01 01:00:00.000000000 +0100
+++ linux-5.10.2/fs/reiser4/plugin/dir/seekable_dir.c 2020-12-23 16:07:46.121813173 +0100
@@ -0,0 +1,46 @@
+/* Copyright 2005 by Hans Reiser, licensing governed by
+ * reiser4/README */
+
+#include "../../inode.h"
+
+/* this is the implementation of the build_entry_key method of the dir
+   plugin for SEEKABLE_HASHED_DIR_PLUGIN_ID.
+   This is for directories where we want repeatable and restartable readdir()
+   even in the case of a 32-bit user-level struct dirent (readdir(3)).
+*/
+void
+build_entry_key_seekable(const struct inode *dir, const struct qstr *name,
+ reiser4_key * result)
+{
+ oid_t objectid;
+
+ assert("nikita-2283", dir != NULL);
+ assert("nikita-2284", name != NULL);
+ assert("nikita-2285", name->name != NULL);
+ assert("nikita-2286", result != NULL);
+
+ reiser4_key_init(result);
+ /* locality of directory entry's key is objectid of parent
+ directory */
+ set_key_locality(result, get_inode_oid(dir));
+ /* minor packing locality is constant */
+ set_key_type(result, KEY_FILE_NAME_MINOR);
+	/* dot is a special case---we always want it to be the first entry
+	   in a directory. Actually, we just want it to be the smallest
+	   directory entry.
+ */
+ if ((name->len == 1) && (name->name[0] == '.'))
+ return;
+
+ /* objectid of key is 31 lowest bits of hash. */
+ objectid =
+ inode_hash_plugin(dir)->hash(name->name,
+ (int)name->len) & 0x7fffffff;
+
+ assert("nikita-2303", !(objectid & ~KEY_OBJECTID_MASK));
+ set_key_objectid(result, objectid);
+
+ /* offset is always 0. */
+ set_key_offset(result, (__u64) 0);
+ return;
+}
diff -urN --no-dereference linux-5.10.2.orig/fs/reiser4/plugin/dir_plugin_common.c linux-5.10.2/fs/reiser4/plugin/dir_plugin_common.c
--- linux-5.10.2.orig/fs/reiser4/plugin/dir_plugin_common.c 1970-01-01 01:00:00.000000000 +0100
+++ linux-5.10.2/fs/reiser4/plugin/dir_plugin_common.c 2020-12-23 16:07:46.121813173 +0100
@@ -0,0 +1,869 @@
+/* Copyright 2005 by Hans Reiser, licensing governed by
+ reiser4/README */
+
+/* this file contains typical implementations for most of methods of
+ directory plugin
+*/
+
+#include "../inode.h"
+
+int reiser4_find_entry(struct inode *dir, struct dentry *name,
+ lock_handle * , znode_lock_mode, reiser4_dir_entry_desc *);
+int reiser4_lookup_name(struct inode *parent, struct dentry *dentry,
+ reiser4_key * key);
+void check_light_weight(struct inode *inode, struct inode *parent);
+
+/* this is the common implementation of the get_parent method of dir plugin;
+   it is used by the NFS kernel server to "climb" up the directory tree to
+   check permissions
+ */
+struct dentry *get_parent_common(struct inode *child)
+{
+ struct super_block *s;
+ struct inode *parent;
+ struct dentry dotdot;
+ struct dentry *dentry;
+ reiser4_key key;
+ int result;
+
+ /*
+ * lookup dotdot entry.
+ */
+
+ s = child->i_sb;
+ memset(&dotdot, 0, sizeof(dotdot));
+ dotdot.d_name.name = "..";
+ dotdot.d_name.len = 2;
+ dotdot.d_op = &get_super_private(s)->ops.dentry;
+
+ result = reiser4_lookup_name(child, &dotdot, &key);
+ if (result != 0)
+ return ERR_PTR(result);
+
+ parent = reiser4_iget(s, &key, FIND_EXACT, 1);
+ if (!IS_ERR(parent)) {
+ /*
+ * FIXME-NIKITA dubious: attributes are inherited from @child
+ * to @parent. But:
+ *
+		 * (*) this is the only thing we can do
+ *
+ * (*) attributes of light-weight object are inherited
+ * from a parent through which object was looked up first,
+ * so it is ambiguous anyway.
+ *
+ */
+ check_light_weight(parent, child);
+ reiser4_iget_complete(parent);
+ dentry = d_obtain_alias(parent);
+ if (!IS_ERR(dentry))
+ dentry->d_op = &get_super_private(s)->ops.dentry;
+ } else if (PTR_ERR(parent) == -ENOENT)
+ dentry = ERR_PTR(RETERR(-ESTALE));
+ else
+ dentry = (void *)parent;
+ return dentry;
+}
+
+/* this is common implementation of is_name_acceptable method of dir
+ plugin
+ */
+int is_name_acceptable_common(const struct inode *inode, /* directory to check*/
+ const char *name UNUSED_ARG, /* name to check */
+ int len/* @name's length */)
+{
+ assert("nikita-733", inode != NULL);
+ assert("nikita-734", name != NULL);
+ assert("nikita-735", len > 0);
+
+ return len <= reiser4_max_filename_len(inode);
+}
+
+/* there is no common implementation of build_entry_key method of dir
+   plugin. See plugin/dir/hashed_dir.c:build_entry_key_hashed() or
+   plugin/dir/seekable_dir.c:build_entry_key_seekable() for example
+*/
+
+/* this is common implementation of build_readdir_key method of dir
+ plugin
+ see reiser4_readdir_common for more details
+*/
+int build_readdir_key_common(struct file *dir /* directory being read */ ,
+ reiser4_key * result/* where to store key */)
+{
+ reiser4_file_fsdata *fdata;
+ struct inode *inode;
+
+ assert("nikita-1361", dir != NULL);
+ assert("nikita-1362", result != NULL);
+ assert("nikita-1363", dir->f_path.dentry != NULL);
+ inode = file_inode(dir);
+ assert("nikita-1373", inode != NULL);
+
+ fdata = reiser4_get_file_fsdata(dir);
+ if (IS_ERR(fdata))
+ return PTR_ERR(fdata);
+ assert("nikita-1364", fdata != NULL);
+ return extract_key_from_de_id(get_inode_oid(inode),
+ &fdata->dir.readdir.position.dir_entry_key,
+ result);
+}
+
+void reiser4_adjust_dir_file(struct inode *, const struct dentry *, int offset,
+ int adj);
+
+/* this is common implementation of add_entry method of dir plugin
+*/
+int reiser4_add_entry_common(struct inode *object, /* directory to add new name
+ * in */
+ struct dentry *where, /* new name */
+ reiser4_object_create_data * data, /* parameters of
+ * new object */
+ reiser4_dir_entry_desc * entry /* parameters of
+ * new directory
+ * entry */)
+{
+ int result;
+ coord_t *coord;
+ lock_handle lh;
+ struct reiser4_dentry_fsdata *fsdata;
+ reiser4_block_nr reserve;
+
+ assert("nikita-1114", object != NULL);
+ assert("nikita-1250", where != NULL);
+
+ fsdata = reiser4_get_dentry_fsdata(where);
+ if (unlikely(IS_ERR(fsdata)))
+ return PTR_ERR(fsdata);
+
+ reserve = inode_dir_plugin(object)->estimate.add_entry(object);
+ if (reiser4_grab_space(reserve, BA_CAN_COMMIT, get_meta_subvol()))
+ return RETERR(-ENOSPC);
+
+ init_lh(&lh);
+ coord = &fsdata->dec.entry_coord;
+ coord_clear_iplug(coord);
+
+ /* check for this entry in a directory. This is plugin method. */
+ result = reiser4_find_entry(object, where, &lh, ZNODE_WRITE_LOCK,
+ entry);
+ if (likely(result == -ENOENT)) {
+ /* add new entry. Just pass control to the directory
+ item plugin. */
+ assert("nikita-1709", inode_dir_item_plugin(object));
+ assert("nikita-2230", coord->node == lh.node);
+ reiser4_seal_done(&fsdata->dec.entry_seal);
+ result =
+ inode_dir_item_plugin(object)->s.dir.add_entry(object,
+ coord, &lh,
+ where,
+ entry);
+ if (result == 0) {
+ reiser4_adjust_dir_file(object, where,
+ fsdata->dec.pos + 1, +1);
+ INODE_INC_FIELD(object, i_size);
+ }
+ } else if (result == 0) {
+ assert("nikita-2232", coord->node == lh.node);
+ result = RETERR(-EEXIST);
+ }
+ done_lh(&lh);
+
+ return result;
+}
+
+/**
+ * rem_entry - remove entry from directory item
+ * @dir: directory the entry is being removed from
+ * @dentry: name that is being removed
+ * @entry: description of the entry being removed
+ * @coord: coordinate of the directory entry in the tree
+ * @lh: lock handle for the node containing the entry
+ *
+ * Checks that coordinate @coord is set properly and calls item plugin
+ * method to cut entry.
+ */
+static int
+rem_entry(struct inode *dir, struct dentry *dentry,
+ reiser4_dir_entry_desc * entry, coord_t *coord, lock_handle * lh)
+{
+ item_plugin *iplug;
+ struct inode *child;
+
+ iplug = inode_dir_item_plugin(dir);
+ child = dentry->d_inode;
+ assert("nikita-3399", child != NULL);
+
+ /* check that we are really destroying an entry for @child */
+ if (REISER4_DEBUG) {
+ int result;
+ reiser4_key key;
+
+ result = iplug->s.dir.extract_key(coord, &key);
+ if (result != 0)
+ return result;
+ if (get_key_objectid(&key) != get_inode_oid(child)) {
+ warning("nikita-3397",
+ "rem_entry: %#llx != %#llx\n",
+ get_key_objectid(&key),
+ (unsigned long long)get_inode_oid(child));
+ return RETERR(-EIO);
+ }
+ }
+ return iplug->s.dir.rem_entry(dir, &dentry->d_name, coord, lh, entry);
+}
+
+/**
+ * reiser4_rem_entry_common - remove entry from a directory
+ * @dir: directory to remove entry from
+ * @where: name that is being removed
+ * @entry: description of entry being removed
+ *
+ * This is common implementation of rem_entry method of dir plugin.
+ */
+int reiser4_rem_entry_common(struct inode *dir,
+ struct dentry *dentry,
+ reiser4_dir_entry_desc * entry)
+{
+ int result;
+ coord_t *coord;
+ lock_handle lh;
+ struct reiser4_dentry_fsdata *fsdata;
+ __u64 tograb;
+
+ assert("nikita-1124", dir != NULL);
+ assert("nikita-1125", dentry != NULL);
+
+ tograb = inode_dir_plugin(dir)->estimate.rem_entry(dir);
+ result = reiser4_grab_space(tograb, BA_CAN_COMMIT | BA_RESERVED,
+ get_meta_subvol());
+ if (result != 0)
+ return RETERR(-ENOSPC);
+
+ init_lh(&lh);
+
+ /* check for this entry in a directory. This is plugin method. */
+ result = reiser4_find_entry(dir, dentry, &lh, ZNODE_WRITE_LOCK, entry);
+ fsdata = reiser4_get_dentry_fsdata(dentry);
+ if (IS_ERR(fsdata)) {
+ done_lh(&lh);
+ return PTR_ERR(fsdata);
+ }
+
+ coord = &fsdata->dec.entry_coord;
+
+ assert("nikita-3404",
+ get_inode_oid(dentry->d_inode) != get_inode_oid(dir) ||
+ dir->i_size <= 1);
+
+ coord_clear_iplug(coord);
+ if (result == 0) {
+ /* remove entry. Just pass control to the directory item
+ plugin. */
+ assert("vs-542", inode_dir_item_plugin(dir));
+ reiser4_seal_done(&fsdata->dec.entry_seal);
+ reiser4_adjust_dir_file(dir, dentry, fsdata->dec.pos, -1);
+ result =
+ WITH_COORD(coord,
+ rem_entry(dir, dentry, entry, coord, &lh));
+ if (result == 0) {
+ if (dir->i_size >= 1)
+ INODE_DEC_FIELD(dir, i_size);
+ else {
+ warning("nikita-2509", "Dir %llu is runt",
+ (unsigned long long)
+ get_inode_oid(dir));
+ result = RETERR(-EIO);
+ }
+
+ assert("nikita-3405", dentry->d_inode->i_nlink != 1 ||
+ dentry->d_inode->i_size != 2 ||
+ inode_dir_plugin(dentry->d_inode) == NULL);
+ }
+ }
+ done_lh(&lh);
+
+ return result;
+}
+
+static reiser4_block_nr estimate_init(struct inode *parent,
+ struct inode *object);
+static int create_dot_dotdot(struct inode *object, struct inode *parent);
+
+/* this is common implementation of init method of dir plugin
+ create "." and ".." entries
+*/
+int reiser4_dir_init_common(struct inode *object, /* new directory */
+ struct inode *parent, /* parent directory */
+ reiser4_object_create_data * data /* info passed
+ * to us, this
+ * is filled by
+ * reiser4()
+ * syscall in
+ * particular */)
+{
+ reiser4_block_nr reserve;
+
+ assert("nikita-680", object != NULL);
+ assert("nikita-681", S_ISDIR(object->i_mode));
+ assert("nikita-682", parent != NULL);
+ assert("nikita-684", data != NULL);
+ assert("nikita-686", data->id == DIRECTORY_FILE_PLUGIN_ID);
+ assert("nikita-687", object->i_mode & S_IFDIR);
+
+ reserve = estimate_init(parent, object);
+ if (reiser4_grab_space(reserve, BA_CAN_COMMIT, get_meta_subvol()))
+ return RETERR(-ENOSPC);
+
+ return create_dot_dotdot(object, parent);
+}
+
+/* this is common implementation of done method of dir plugin
+ remove "." entry
+*/
+int reiser4_dir_done_common(struct inode *object/* object being deleted */)
+{
+ int result;
+ reiser4_block_nr reserve;
+ struct dentry goodby_dots;
+ reiser4_dir_entry_desc entry;
+
+ assert("nikita-1449", object != NULL);
+
+ if (reiser4_inode_get_flag(object, REISER4_NO_SD))
+ return 0;
+
+ /* of course, this can be rewritten to sweep everything in one
+ reiser4_cut_tree(). */
+ memset(&entry, 0, sizeof entry);
+
+ /* FIXME: this done method is called from reiser4_delete_dir_common
+ * which reserved space already */
+ reserve = inode_dir_plugin(object)->estimate.rem_entry(object);
+ if (reiser4_grab_space(reserve, BA_CAN_COMMIT | BA_RESERVED,
+ get_meta_subvol()))
+ return RETERR(-ENOSPC);
+
+ memset(&goodby_dots, 0, sizeof goodby_dots);
+ entry.obj = goodby_dots.d_inode = object;
+ goodby_dots.d_name.name = ".";
+ goodby_dots.d_name.len = 1;
+ result = reiser4_rem_entry_common(object, &goodby_dots, &entry);
+ reiser4_free_dentry_fsdata(&goodby_dots);
+ if (unlikely(result != 0 && result != -ENOMEM && result != -ENOENT))
+ warning("nikita-2252", "Cannot remove dot of %lli: %i",
+ (unsigned long long)get_inode_oid(object), result);
+ return 0;
+}
+
+/* this is common implementation of attach method of dir plugin
+*/
+int reiser4_attach_common(struct inode *child UNUSED_ARG,
+ struct inode *parent UNUSED_ARG)
+{
+ assert("nikita-2647", child != NULL);
+ assert("nikita-2648", parent != NULL);
+
+ return 0;
+}
+
+/* this is common implementation of detach method of dir plugin
+ remove "..", decrease nlink on parent
+*/
+int reiser4_detach_common(struct inode *object, struct inode *parent)
+{
+ int result;
+ struct dentry goodby_dots;
+ reiser4_dir_entry_desc entry;
+
+ assert("nikita-2885", object != NULL);
+ assert("nikita-2886", !reiser4_inode_get_flag(object, REISER4_NO_SD));
+
+ memset(&entry, 0, sizeof entry);
+
+ /* NOTE-NIKITA this only works if @parent is -the- parent of
+ @object, viz. object whose key is stored in dotdot
+ entry. Wouldn't work with hard-links on directories. */
+ memset(&goodby_dots, 0, sizeof goodby_dots);
+ entry.obj = goodby_dots.d_inode = parent;
+ goodby_dots.d_name.name = "..";
+ goodby_dots.d_name.len = 2;
+ result = reiser4_rem_entry_common(object, &goodby_dots, &entry);
+ reiser4_free_dentry_fsdata(&goodby_dots);
+ if (result == 0) {
+ /* the dot should be the only entry remaining at this time... */
+ assert("nikita-3400",
+ object->i_size == 1 && object->i_nlink <= 2);
+#if 0
+		/* and, together with the only name a directory can have, they
+		 * provide for the last 2 remaining references. If we get
+		 * here as part of error handling during mkdir, @object
+		 * possibly has no name yet, so its nlink == 1. If we get here
+		 * from rename (targeting an empty directory), it already has
+		 * no name, so its nlink == 1. */
+ assert("nikita-3401",
+ object->i_nlink == 2 || object->i_nlink == 1);
+#endif
+
+ /* decrement nlink of directory removed ".." pointed
+ to */
+ reiser4_del_nlink(parent, NULL, 0);
+ }
+ return result;
+}
+
+/* this is the common implementation of the estimate.add_entry method of
+   dir plugin:
+   an estimation of adding an entry, which assumes that adding an entry
+   inserts a unit into an item
+*/
+reiser4_block_nr estimate_add_entry_common(const struct inode *inode)
+{
+ return estimate_one_insert_into_item(meta_subvol_tree());
+}
+
+/* this is common implementation of estimate.rem_entry method of dir
+ plugin
+*/
+reiser4_block_nr estimate_rem_entry_common(const struct inode *inode)
+{
+ return estimate_one_item_removal(meta_subvol_tree());
+}
+
+/* this is common implementation of estimate.unlink method of dir
+ plugin
+*/
+reiser4_block_nr
+dir_estimate_unlink_common(const struct inode *parent,
+ const struct inode *object)
+{
+ reiser4_block_nr res;
+
+ /* hashed_rem_entry(object) */
+ res = inode_dir_plugin(object)->estimate.rem_entry(object);
+ /* del_nlink(parent) */
+ res += 2 * inode_file_plugin(parent)->estimate.update(parent);
+
+ return res;
+}
+
+/*
+ * helper for inode_ops ->lookup() and dir plugin's ->get_parent()
+ * methods: if @inode is a light-weight file, set up its credentials
+ * that are not stored in the stat-data in this case
+ */
+void check_light_weight(struct inode *inode, struct inode *parent)
+{
+ if (reiser4_inode_get_flag(inode, REISER4_LIGHT_WEIGHT)) {
+ inode->i_uid = parent->i_uid;
+ inode->i_gid = parent->i_gid;
+ /* clear light-weight flag. If inode would be read by any
+ other name, [ug]id wouldn't change. */
+ reiser4_inode_clr_flag(inode, REISER4_LIGHT_WEIGHT);
+ }
+}
+
+/* looks for the name specified in @dentry in directory @parent; if the name
+   is found, the key of the object the found entry points to is stored in @key */
+int reiser4_lookup_name(struct inode *parent, /* inode of directory to lookup
+ * for name in */
+ struct dentry *dentry, /* name to look for */
+ reiser4_key * key/* place to store key */)
+{
+ int result;
+ coord_t *coord;
+ lock_handle lh;
+ const char *name;
+ int len;
+ reiser4_dir_entry_desc entry;
+ struct reiser4_dentry_fsdata *fsdata;
+
+ assert("nikita-1247", parent != NULL);
+ assert("nikita-1248", dentry != NULL);
+ assert("nikita-1123", dentry->d_name.name != NULL);
+ assert("vs-1486",
+ dentry->d_op == &get_super_private(parent->i_sb)->ops.dentry);
+
+ name = dentry->d_name.name;
+ len = dentry->d_name.len;
+
+ if (!inode_dir_plugin(parent)->is_name_acceptable(parent, name, len))
+ /* some arbitrary error code to return */
+ return RETERR(-ENAMETOOLONG);
+
+ fsdata = reiser4_get_dentry_fsdata(dentry);
+ if (IS_ERR(fsdata))
+ return PTR_ERR(fsdata);
+
+ coord = &fsdata->dec.entry_coord;
+ coord_clear_iplug(coord);
+ init_lh(&lh);
+
+ /* find entry in a directory. This is plugin method. */
+ result = reiser4_find_entry(parent, dentry, &lh, ZNODE_READ_LOCK,
+ &entry);
+ if (result == 0) {
+ /* entry was found, extract object key from it. */
+ result =
+ WITH_COORD(coord,
+ item_plugin_by_coord(coord)->s.dir.
+ extract_key(coord, key));
+ }
+ done_lh(&lh);
+ return result;
+
+}
+
+/* helper for reiser4_dir_init_common(): estimate number of blocks to reserve */
+static reiser4_block_nr
+estimate_init(struct inode *parent, struct inode *object)
+{
+ reiser4_block_nr res = 0;
+
+ assert("vpf-321", parent != NULL);
+ assert("vpf-322", object != NULL);
+
+ /* hashed_add_entry(object) */
+ res += inode_dir_plugin(object)->estimate.add_entry(object);
+ /* reiser4_add_nlink(object) */
+ res += inode_file_plugin(object)->estimate.update(object);
+ /* hashed_add_entry(object) */
+ res += inode_dir_plugin(object)->estimate.add_entry(object);
+ /* reiser4_add_nlink(parent) */
+ res += inode_file_plugin(parent)->estimate.update(parent);
+
+	return res;
+}
+
+/* helper function for reiser4_dir_init_common(). Create "." and ".." */
+static int create_dot_dotdot(struct inode *object/* object to create dot and
+ * dotdot for */ ,
+ struct inode *parent/* parent of @object */)
+{
+ int result;
+ struct dentry dots_entry;
+ reiser4_dir_entry_desc entry;
+
+ assert("nikita-688", object != NULL);
+ assert("nikita-689", S_ISDIR(object->i_mode));
+ assert("nikita-691", parent != NULL);
+
+	/* We store dot and dotdot as normal directory entries. This is
+	   not strictly necessary, because almost all information stored in
+	   them is already in the stat-data of the directory; the only thing
+	   missing is the objectid of the grand-parent directory, which could
+	   easily be added there as an extension.
+
+	   But it is done the way it is done, because not storing dot
+	   and dotdot would lead to the following complications:
+
+ . special case handling in ->lookup().
+ . addition of another extension to the sd.
+ . dependency on key allocation policy for stat data.
+
+ */
+
+ memset(&entry, 0, sizeof entry);
+ memset(&dots_entry, 0, sizeof dots_entry);
+ entry.obj = dots_entry.d_inode = object;
+ dots_entry.d_name.name = ".";
+ dots_entry.d_name.len = 1;
+ result = reiser4_add_entry_common(object, &dots_entry, NULL, &entry);
+ reiser4_free_dentry_fsdata(&dots_entry);
+
+ if (result == 0) {
+ result = reiser4_add_nlink(object, object, 0);
+ if (result == 0) {
+ entry.obj = dots_entry.d_inode = parent;
+ dots_entry.d_name.name = "..";
+ dots_entry.d_name.len = 2;
+ result = reiser4_add_entry_common(object,
+ &dots_entry, NULL, &entry);
+ reiser4_free_dentry_fsdata(&dots_entry);
+ /* if creation of ".." failed, iput() will delete
+ object with ".". */
+ if (result == 0) {
+ result = reiser4_add_nlink(parent, object, 0);
+ if (result != 0)
+ /*
+ * if we failed to bump i_nlink, try
+ * to remove ".."
+ */
+ reiser4_detach_common(object, parent);
+ }
+ }
+ }
+
+ if (result != 0) {
+ /*
+		 * in the case of error, at least update stat-data so that
+		 * ->i_nlink updates are not lingering.
+ */
+ reiser4_update_sd(object);
+ reiser4_update_sd(parent);
+ }
+
+ return result;
+}
+
+/*
+ * return 0 iff @coord contains a directory entry for the file with the name
+ * @name.
+ */
+static int
+check_item(const struct inode *dir, const coord_t *coord, const char *name)
+{
+ item_plugin *iplug;
+ char buf[DE_NAME_BUF_LEN];
+
+ iplug = item_plugin_by_coord(coord);
+ if (iplug == NULL) {
+ warning("nikita-1135", "Cannot get item plugin");
+ print_coord("coord", coord, 1);
+ return RETERR(-EIO);
+ } else if (item_id_by_coord(coord) !=
+ item_id_by_plugin(inode_dir_item_plugin(dir))) {
+		/* item id of the current item does not match the id of items
+		   the directory is built of */
+ warning("nikita-1136", "Wrong item plugin");
+ print_coord("coord", coord, 1);
+ return RETERR(-EIO);
+ }
+ assert("nikita-1137", iplug->s.dir.extract_name);
+
+ /* Compare name stored in this entry with name we are looking for.
+
+ NOTE-NIKITA Here should go code for support of something like
+ unicode, code tables, etc.
+ */
+ return !!strcmp(name, iplug->s.dir.extract_name(coord, buf));
+}
+
+static int
+check_entry(const struct inode *dir, coord_t *coord, const struct qstr *name)
+{
+ return WITH_COORD(coord, check_item(dir, coord, name->name));
+}
+
+/*
+ * argument package used by entry_actor to scan entries with identical keys.
+ */
+struct entry_actor_args {
+ /* name we are looking for */
+ const char *name;
+ /* key of directory entry. entry_actor() scans through sequence of
+ * items/units having the same key */
+ reiser4_key *key;
+	/* how many entries with duplicate keys have been scanned so far. */
+ int non_uniq;
+#if REISER4_USE_COLLISION_LIMIT
+ /* scan limit */
+ int max_non_uniq;
+#endif
+ /* return parameter: set to true, if ->name wasn't found */
+ int not_found;
+ /* what type of lock to take when moving to the next node during
+ * scan */
+ znode_lock_mode mode;
+
+ /* last coord that was visited during scan */
+ coord_t last_coord;
+ /* last node locked during scan */
+ lock_handle last_lh;
+ /* inode of directory */
+ const struct inode *inode;
+};
+
+/* Function called by reiser4_find_entry() to look for given name
+ in the directory. */
+static int entry_actor(reiser4_tree * tree UNUSED_ARG /* tree being scanned */ ,
+ coord_t *coord /* current coord */ ,
+ lock_handle * lh /* current lock handle */ ,
+ void *entry_actor_arg/* argument to scan */)
+{
+ reiser4_key unit_key;
+ struct entry_actor_args *args;
+
+ assert("nikita-1131", tree != NULL);
+ assert("nikita-1132", coord != NULL);
+ assert("nikita-1133", entry_actor_arg != NULL);
+
+ args = entry_actor_arg;
+ ++args->non_uniq;
+#if REISER4_USE_COLLISION_LIMIT
+ if (args->non_uniq > args->max_non_uniq) {
+ args->not_found = 1;
+ /* hash collision overflow. */
+ return RETERR(-EBUSY);
+ }
+#endif
+
+ /*
+ * did we just reach the end of the sequence of items/units with
+ * identical keys?
+ */
+ if (!keyeq(args->key, unit_key_by_coord(coord, &unit_key))) {
+ assert("nikita-1791",
+ keylt(args->key, unit_key_by_coord(coord, &unit_key)));
+ args->not_found = 1;
+ args->last_coord.between = AFTER_UNIT;
+ return 0;
+ }
+
+ coord_dup(&args->last_coord, coord);
+ /*
+	 * did the scan just move to the next node?
+ */
+ if (args->last_lh.node != lh->node) {
+ int lock_result;
+
+ /*
+ * if so, lock new node with the mode requested by the caller
+ */
+ done_lh(&args->last_lh);
+ assert("nikita-1896", znode_is_any_locked(lh->node));
+ lock_result = longterm_lock_znode(&args->last_lh, lh->node,
+ args->mode, ZNODE_LOCK_HIPRI);
+ if (lock_result != 0)
+ return lock_result;
+ }
+ return check_item(args->inode, coord, args->name);
+}
+
+/* Look for given @name within directory @dir.
+
+ This is called during lookup, creation and removal of directory
+ entries and on reiser4_rename_common
+
+   First calculate the key that the directory entry for @name would have.
+   Search for this key in the tree. If such a key is found, scan all items
+   with the same key, checking the name in each directory entry along the way.
+*/
+int reiser4_find_entry(struct inode *dir, /* directory to scan */
+ struct dentry *de, /* name to search for */
+ lock_handle * lh, /* resulting lock handle */
+ znode_lock_mode mode, /* required lock mode */
+ reiser4_dir_entry_desc * entry /* parameters of found
+ directory entry */)
+{
+ const struct qstr *name;
+ seal_t *seal;
+ coord_t *coord;
+ int result;
+ __u32 flags;
+ struct de_location *dec;
+ struct reiser4_dentry_fsdata *fsdata;
+
+ assert("nikita-1130", lh != NULL);
+ assert("nikita-1128", dir != NULL);
+
+ name = &de->d_name;
+ assert("nikita-1129", name != NULL);
+
+	/* dentry private data doesn't require a lock, because dentry
+	   manipulations are protected by i_mutex on the parent.
+
+	   This is not so for inodes, because there is no -the- parent in
+	   the inode case.
+ */
+ fsdata = reiser4_get_dentry_fsdata(de);
+ if (IS_ERR(fsdata))
+ return PTR_ERR(fsdata);
+ dec = &fsdata->dec;
+
+ coord = &dec->entry_coord;
+ coord_clear_iplug(coord);
+ seal = &dec->entry_seal;
+ /* compose key of directory entry for @name */
+ inode_dir_plugin(dir)->build_entry_key(dir, name, &entry->key);
+
+ if (reiser4_seal_is_set(seal)) {
+ /* check seal */
+ result = reiser4_seal_validate(seal,
+ meta_subvol_tree(),
+ coord, &entry->key,
+ lh, mode, ZNODE_LOCK_LOPRI);
+ if (result == 0) {
+ /* key was found. Check that it is really item we are
+ looking for. */
+ result = check_entry(dir, coord, name);
+ if (result == 0)
+ return 0;
+ }
+ }
+ flags = (mode == ZNODE_WRITE_LOCK) ? CBK_FOR_INSERT : 0;
+ /*
+ * find place in the tree where directory item should be located.
+ */
+ result = reiser4_object_lookup(meta_subvol_tree(),
+ dir, &entry->key, coord, lh, mode,
+ FIND_EXACT, LEAF_LEVEL, LEAF_LEVEL,
+ flags, NULL/*ra_info */);
+ if (result == CBK_COORD_FOUND) {
+ struct entry_actor_args arg;
+
+ /* fast path: no hash collisions */
+ result = check_entry(dir, coord, name);
+ if (result == 0) {
+ reiser4_seal_init(seal, coord, &entry->key);
+ dec->pos = 0;
+ } else if (result > 0) {
+ /* Iterate through all units with the same keys. */
+ arg.name = name->name;
+ arg.key = &entry->key;
+ arg.not_found = 0;
+ arg.non_uniq = 0;
+#if REISER4_USE_COLLISION_LIMIT
+ arg.max_non_uniq = max_hash_collisions(dir);
+ assert("nikita-2851", arg.max_non_uniq > 1);
+#endif
+ arg.mode = mode;
+ arg.inode = dir;
+ coord_init_zero(&arg.last_coord);
+ init_lh(&arg.last_lh);
+
+ result = reiser4_iterate_tree(meta_subvol_tree(),
+ coord, lh,
+ entry_actor,
+ &arg, mode, 1);
+ /* if end of the tree or extent was reached during
+ scanning. */
+ if (arg.not_found || (result == -E_NO_NEIGHBOR)) {
+ /* step back */
+ done_lh(lh);
+
+ result = zload(arg.last_coord.node);
+ if (result == 0) {
+ coord_clear_iplug(&arg.last_coord);
+ coord_dup(coord, &arg.last_coord);
+ move_lh(lh, &arg.last_lh);
+ result = RETERR(-ENOENT);
+ zrelse(arg.last_coord.node);
+ --arg.non_uniq;
+ }
+ }
+
+ done_lh(&arg.last_lh);
+ if (result == 0)
+ reiser4_seal_init(seal, coord, &entry->key);
+
+ if (result == 0 || result == -ENOENT) {
+ assert("nikita-2580", arg.non_uniq > 0);
+ dec->pos = arg.non_uniq - 1;
+ }
+ }
+ } else
+ dec->pos = -1;
+ return result;
+}
+
+/*
+ Local variables:
+ c-indentation-style: "K&R"
+ mode-name: "LC"
+ c-basic-offset: 8
+ tab-width: 8
+ fill-column: 120
+ scroll-step: 1
+ End:
+*/
diff -urN --no-dereference linux-5.10.2.orig/fs/reiser4/plugin/disk_format/disk_format40.c linux-5.10.2/fs/reiser4/plugin/disk_format/disk_format40.c
--- linux-5.10.2.orig/fs/reiser4/plugin/disk_format/disk_format40.c 1970-01-01 01:00:00.000000000 +0100
+++ linux-5.10.2/fs/reiser4/plugin/disk_format/disk_format40.c 2020-12-23 16:07:46.122813187 +0100
@@ -0,0 +1,884 @@
+/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
+
+#include "../../debug.h"
+#include "../../dformat.h"
+#include "../../key.h"
+#include "../node/node.h"
+#include "../space/space_allocator.h"
+#include "disk_format40.h"
+#include "../plugin.h"
+#include "../../txnmgr.h"
+#include "../../jnode.h"
+#include "../../tree.h"
+#include "../../super.h"
+#include "../../plugin/volume/volume.h"
+#include "../../wander.h"
+#include "../../inode.h"
+#include "../../ktxnmgrd.h"
+#include "../../status_flags.h"
+
+#include <linux/types.h>
+#include <linux/fs.h>
+#include <linux/buffer_head.h>
+
+/*
+ * Methods of standard disk layout for simple volumes (i.e. volumes
+ * associated with a single physical or logical (RAID, LVM) device).
+ */
+
+/*
+ * Amount of free blocks needed to perform release_format40 when fs gets
+ * mounted RW:
+ * 1 for SB,
+ * 1 for non-leaves in overwrite set,
+ * 2 for tx header & tx record
+ */
+#define RELEASE_RESERVED 4
+
+/*
+ * This flag indicates that backup should be updated by fsck
+ */
+#define FORMAT40_UPDATE_BACKUP (1 << 31)
+
+/*
+ * Functions to access fields of format40_disk_super_block
+ */
+static __u64 get_format40_block_count(const format40_disk_super_block * sb)
+{
+ return le64_to_cpu(get_unaligned(&sb->block_count));
+}
+
+static __u64 get_format40_free_blocks(const format40_disk_super_block * sb)
+{
+ return le64_to_cpu(get_unaligned(&sb->free_blocks));
+}
+
+static __u64 get_format40_root_block(const format40_disk_super_block * sb)
+{
+ return le64_to_cpu(get_unaligned(&sb->root_block));
+}
+
+static __u16 get_format40_tree_height(const format40_disk_super_block * sb)
+{
+ return le16_to_cpu(get_unaligned(&sb->tree_height));
+}
+
+static __u64 get_format40_file_count(const format40_disk_super_block * sb)
+{
+ return le64_to_cpu(get_unaligned(&sb->file_count));
+}
+
+static __u64 get_format40_oid(const format40_disk_super_block * sb)
+{
+ return le64_to_cpu(get_unaligned(&sb->oid));
+}
+
+static __u32 get_format40_mkfs_id(const format40_disk_super_block * sb)
+{
+ return le32_to_cpu(get_unaligned(&sb->mkfs_id));
+}
+
+static __u32 get_format40_node_plugin_id(const format40_disk_super_block * sb)
+{
+ return le32_to_cpu(get_unaligned(&sb->node_pid));
+}
+
+static __u64 get_format40_flags(const format40_disk_super_block * sb)
+{
+ return le64_to_cpu(get_unaligned(&sb->flags));
+}
+
+static __u64 get_format40_origin_id(const format40_disk_super_block * sb)
+{
+ return le64_to_cpu(get_unaligned(&sb->origin_id));
+}
+
+static __u64 get_format40_nr_origins(const format40_disk_super_block * sb)
+{
+ return le64_to_cpu(get_unaligned(&sb->nr_origins));
+}
+
+static int get_format40_num_sgs_bits(const format40_disk_super_block * sb)
+{
+ return sb->num_sgs_bits;
+}
+
+static __u64 get_format40_data_capacity(const format40_disk_super_block * sb)
+{
+ return le64_to_cpu(get_unaligned(&sb->data_capacity));
+}
+
+static __u64 get_format40_volinfo_loc(const format40_disk_super_block * sb)
+{
+ return le64_to_cpu(get_unaligned(&sb->volinfo_loc));
+}
+
+static __u64 get_format40_nr_mslots(const format40_disk_super_block * sb)
+{
+ return le64_to_cpu(get_unaligned(&sb->nr_mslots));
+}
+
+static __u64 get_format40_min_occup(const format40_disk_super_block * sb)
+{
+ return le64_to_cpu(get_unaligned(&sb->min_occup));
+}
+
+static __u32 format40_get_minor_version_nr(const format40_disk_super_block * sb)
+{
+ return le32_to_cpu(get_unaligned(&sb->version)) &
+ ~FORMAT40_UPDATE_BACKUP;
+}
+
+static int update_backup_version(const format40_disk_super_block * sb)
+{
+ return (le32_to_cpu(get_unaligned(&sb->version)) &
+ FORMAT40_UPDATE_BACKUP);
+}
+
+static int update_disk_version_minor(const format40_disk_super_block * sb)
+{
+ return format40_get_minor_version_nr(sb) < get_release_number_minor();
+}
+
+static int incomplete_compatibility(const format40_disk_super_block * sb)
+{
+ return format40_get_minor_version_nr(sb) > get_release_number_minor();
+}
+
+static int get_sb_format_jnode(reiser4_subvol *subv)
+{
+ int ret;
+ jnode *sb_jnode;
+
+ sb_jnode = reiser4_alloc_io_head(&subv->loc_super, subv);
+
+ ret = jload(sb_jnode);
+
+ if (ret) {
+ reiser4_drop_io_head(sb_jnode);
+ return ret;
+ }
+ pin_jnode_data(sb_jnode);
+ jrelse(sb_jnode);
+
+ subv->sb_jnode = sb_jnode;
+
+ return 0;
+}
+
+static void put_sb_format_jnode(reiser4_subvol *subv)
+{
+ if (subv->sb_jnode) {
+ unpin_jnode_data(subv->sb_jnode);
+ reiser4_drop_io_head(subv->sb_jnode);
+ subv->sb_jnode = NULL;
+ }
+}
+
+typedef enum format40_init_stage {
+ NONE_DONE = 0,
+ CONSULT_DISKMAP,
+ FIND_A_SUPER,
+ INIT_JOURNAL_INFO,
+ INIT_STATUS,
+ JOURNAL_REPLAY,
+ READ_SUPER,
+ KEY_CHECK,
+ INIT_OID,
+ INIT_TREE,
+ JOURNAL_RECOVER,
+ INIT_SA,
+ INIT_JNODE,
+ INIT_SYSTAB,
+ ALL_DONE
+} format40_init_stage;
+
+static int check_key_format(const format40_disk_super_block *sb_copy)
+{
+ if (!equi(REISER4_LARGE_KEY,
+ get_format40_flags(sb_copy) & (1 << FORMAT40_LARGE_KEYS))) {
+ warning("nikita-3228", "Key format mismatch. "
+ "Only %s keys are supported.",
+ REISER4_LARGE_KEY ? "large" : "small");
+ return RETERR(-EINVAL);
+ }
+ if (!equi(REISER4_PLANB_KEY_ALLOCATION,
+ get_format40_flags(sb_copy) & (1 << FORMAT40_PLANB_KEY_ALLOC))) {
+ warning("edward-2311", "Key allocation scheme mismatch. "
+ "Only %s key allocation is supported.",
+ REISER4_PLANB_KEY_ALLOCATION ? "Plan-B" : "Plan-A");
+ return RETERR(-EINVAL);
+ }
+ return 0;
+}
+
+/**
+ * Read on-disk system parameters, which define volume configuration.
+ * Perform sanity checks.
+ */
+int read_check_volume_params(reiser4_subvol *subv,
+ format40_disk_super_block *sb_format)
+{
+ reiser4_volume *vol;
+
+ if (subvol_is_set(subv, SUBVOL_IS_ORPHAN)) {
+ /*
+ * Don't check parameters of new brick
+ * as they are invalid (to be set later).
+		 * Set an invalid brick ID so as not to confuse
+		 * the new brick with the meta-data brick
+ */
+ subv->id = INVALID_SUBVOL_ID;
+ return 0;
+ }
+ vol = super_volume(subv->super);
+
+ if (is_meta_brick_id(subv->id)) {
+ u32 nr_mslots;
+ u32 nr_origins;
+
+ nr_origins = get_format40_nr_origins(sb_format);
+ if (nr_origins == 0)
+ /*
+ * This is a subvolume of format 4.0.Y
+ * We handle this special case for backward
+ * compatibility - guess number of subvolumes
+ */
+ nr_origins = 1;
+ atomic_set(&vol->nr_origins, nr_origins);
+ vol->num_sgs_bits = get_format40_num_sgs_bits(sb_format);
+
+ nr_mslots = get_format40_nr_mslots(sb_format);
+ if (nr_mslots == 0) {
+ /* ditto - guess number of mslots */
+ assert("edward-2228", nr_origins == 1);
+ nr_mslots = 1;
+ }
+ if (!vol->conf) {
+ vol->conf = alloc_lv_conf(nr_mslots);
+ if (!vol->conf)
+ return -ENOMEM;
+ } else if (vol->conf != NULL && nr_mslots > 1) {
+ /*
+			 * This is a temporary config created
+			 * for meta-data brick activation.
+			 * Replace it with the actual one.
+ */
+ lv_conf *new_conf;
+ int meta_subv_id;
+
+ meta_subv_id = vol->vol_plug->meta_subvol_id();
+
+ assert("edward-2304", vol->conf->nr_mslots == 1);
+ assert("edward-2305",
+ vol->conf->mslots[meta_subv_id][1] != NULL);
+ assert("edward-2306", vol->conf->tab == NULL);
+
+ new_conf = alloc_lv_conf(nr_mslots);
+ if (!new_conf)
+ return -ENOMEM;
+ /*
+ * copy actual info from temporary config
+ * to the new one
+ */
+ new_conf->mslots[meta_subv_id] =
+ vol->conf->mslots[meta_subv_id];
+ free_lv_conf(vol->conf);
+ vol->conf = new_conf;
+ }
+ }
+ assert("edward-2307", vol->conf != NULL);
+ if (subv->id >= vol->conf->nr_mslots) {
+ warning("edward-2308",
+ "brick %s (ID %llu) is inappropriate: too few mslots (%llu)",
+ subv->name, subv->id, vol->conf->nr_mslots);
+ return -EINVAL;
+ }
+ return 0;
+}
+
+/**
+ * Find the disk format super block at the specified location. Note that it
+ * may not be the most recent version if this is called before
+ * journal replay. In that case the caller must guarantee that the
+ * data it needs are actually up to date.
+ * Perform checks and initialisations in accordance with format40
+ * specifications.
+ *
+ * Pre-condition: @super contains valid block size
+ */
+static int find_format40(reiser4_subvol *subv,
+ format40_disk_super_block *disk_sb)
+{
+ int ret;
+ struct page *page;
+ reiser4_volume *vol;
+
+ assert("edward-1788", subv != NULL);
+ assert("edward-1789", subv->super != NULL);
+
+ vol = super_volume(subv->super);
+
+ page = read_cache_page_gfp(subv->bdev->bd_inode->i_mapping,
+ subv->loc_super,
+ GFP_NOFS);
+ if (IS_ERR_OR_NULL(page))
+ return RETERR(-EIO);
+
+ memcpy(disk_sb, kmap(page), sizeof (*disk_sb));
+ kunmap(page);
+ put_page(page);
+ if (strncmp(disk_sb->magic, FORMAT40_MAGIC, sizeof(FORMAT40_MAGIC)))
+ /*
+ * there is no reiser4 on this device
+ */
+ return RETERR(-EINVAL);
+ ret = read_check_volume_params(subv, disk_sb);
+ if (ret)
+ return RETERR(-EINVAL);
+ reiser4_subvol_set_block_count(subv,
+ get_format40_block_count(disk_sb));
+ reiser4_subvol_set_free_blocks(subv,
+ get_format40_free_blocks(disk_sb));
+ /*
+ * Set number of used blocks. The number of used blocks is stored
+	 * neither in the on-disk super block nor in the journal footer blocks.
+	 * Instead we maintain it along with the actual values of the total and
+	 * free block counters in the in-memory subvolume header.
+ */
+ reiser4_subvol_set_used_blocks(subv,
+ reiser4_subvol_block_count(subv) -
+ reiser4_subvol_free_blocks(subv));
+ return 0;
+}
+
+int extract_subvol_id_format40(struct block_device *bdev, u64 *subv_id)
+{
+ *subv_id = 0;
+ return 0;
+}
+
+/**
+ * Read the disk format super-block, retrieve the internal subvolume ID
+ * and store it in @subv_id
+ */
+int extract_subvol_id_format41(struct block_device *bdev, u64 *subv_id)
+{
+ struct page *page;
+ format40_disk_super_block *format_sb;
+
+ page = read_cache_page_gfp(bdev->bd_inode->i_mapping,
+ FORMAT40_OFFSET >> PAGE_SHIFT,
+ GFP_NOFS);
+ if (IS_ERR_OR_NULL(page))
+ return RETERR(-EIO);
+
+ format_sb = kmap(page);
+ if (strncmp(format_sb->magic,
+ FORMAT40_MAGIC, sizeof(FORMAT40_MAGIC))) {
+ /*
+ * format40 not found
+ */
+ kunmap(page);
+ put_page(page);
+ return RETERR(-EINVAL);
+ }
+ *subv_id = get_format40_origin_id(format_sb);
+ kunmap(page);
+ put_page(page);
+ return 0;
+}
+
+/**
+ * Initialize in-memory subvolume header.
+ * Pre-condition: we are sure that the subvolume is managed by the expected
+ * disk format plugin (that is, a format superblock with correct magic
+ * was found).
+ */
+static int try_init_format(struct super_block *super,
+ format40_init_stage *stage,
+ reiser4_subvol *subv, int major_version_nr)
+{
+ int result;
+ format40_disk_super_block sb_format;
+ tree_level height;
+ reiser4_block_nr root_block;
+ node_plugin *nplug;
+ u64 extended_status;
+ reiser4_volume *vol;
+
+ assert("vs-475", super != NULL);
+ assert("vs-474", get_super_private(super) != NULL);
+ assert("edward-1790", get_super_private(super)->vol != NULL);
+ assert("edward-1791", !is_replica(subv));
+
+ vol = get_super_private(super)->vol;
+
+ *stage = NONE_DONE;
+
+ subv->jloc.footer = FORMAT40_JOURNAL_FOOTER_BLOCKNR;
+ subv->jloc.header = FORMAT40_JOURNAL_HEADER_BLOCKNR;
+ subv->loc_super = FORMAT40_OFFSET / subv->super->s_blocksize;
+
+ result = reiser4_init_journal_info(subv);
+ if (result)
+ return result;
+ *stage = INIT_JOURNAL_INFO;
+
+ result = reiser4_status_init(subv, FORMAT40_STATUS_BLOCKNR);
+ if (result != 0 && result != -EINVAL)
+ /*
+ * -EINVAL means there is no magic, so probably just old fs
+ */
+ return result;
+ *stage = INIT_STATUS;
+
+ result = reiser4_status_query(subv, NULL, &extended_status);
+ if (result == REISER4_STATUS_MOUNT_WARN)
+ warning("vpf-1363", "Mounting %s with errors.",
+ super->s_id);
+
+ if (result == REISER4_STATUS_MOUNT_RO) {
+ warning("vpf-1364", "Mounting %s with fatal errors. "
+ "Forcing read-only mount.", super->s_id);
+ super->s_flags |= SB_RDONLY;
+ }
+ if (has_replicas(subv) &&
+ extended_status == REISER4_ESTATUS_MIRRORS_NOT_SYNCED) {
+ warning("edward-1792",
+			"Mounting %s with unsynced mirrors. "
+ "Forcing read-only mount.", super->s_id);
+ super->s_flags |= SB_RDONLY;
+ }
+ /*
+	 * Start from journal replay to make sure we are dealing
+	 * with actual (most recent) data. All replicas will get
+	 * the respective update.
+ */
+ result = reiser4_journal_replay(subv);
+ if (result)
+ return result;
+ *stage = JOURNAL_REPLAY;
+ /*
+ * Now read the most recent version of format superblock
+ * after journal replay
+ */
+ result = find_format40(subv, &sb_format);
+ if (result)
+ return result;
+ *stage = READ_SUPER;
+
+ printk("reiser4 (%s): found disk format %d.%d.%d.\n",
+ super->s_id,
+ get_format_number_principal(major_version_nr),
+ major_version_nr,
+ format40_get_minor_version_nr(&sb_format));
+
+ if (incomplete_compatibility(&sb_format))
+ printk("reiser4 (%s): format version number (%d.%d.%d) is "
+ "greater than release number (%d.%d.%d) of reiser4 "
+ "kernel module. Some objects of the subvolume can "
+ "be inaccessible.\n",
+ super->s_id,
+ get_format_number_principal(major_version_nr),
+ major_version_nr,
+ format40_get_minor_version_nr(&sb_format),
+ get_release_number_principal(),
+ get_release_number_major(),
+ get_release_number_minor());
+ /*
+ * make sure that key format of kernel and filesystem match
+ */
+ result = check_key_format(&sb_format);
+ if (result)
+ return result;
+
+ *stage = KEY_CHECK;
+ /*
+ * convert on-disk flags to on-line state
+ */
+ if (get_format40_flags(&sb_format) & (1 << FORMAT40_HAS_DATA_ROOM))
+ subv->flags |= (1 << SUBVOL_HAS_DATA_ROOM);
+
+ if (get_format40_flags(&sb_format) & (1 << FORMAT40_TO_BE_REMOVED)) {
+ subv->flags |= (1 << SUBVOL_TO_BE_REMOVED);
+ reiser4_volume_set_incomplete_removal(super);
+ }
+
+ if (get_format40_flags(&sb_format) & (1 << FORMAT40_IS_PROXY)) {
+ subv->flags |= (1 << SUBVOL_IS_PROXY);
+ if (reiser4_is_set(super, REISER4_PROXY_ENABLED)) {
+ warning("edward-2430",
+ "Found second proxy subvolume %s", subv->name);
+ return -EINVAL;
+ }
+ reiser4_volume_set_proxy_enabled(super);
+ reiser4_volume_set_proxy_io(super);
+ }
+ if (is_meta_brick_id(subv->id)) {
+ result = oid_init_allocator(super,
+ get_format40_file_count(&sb_format),
+ get_format40_oid(&sb_format));
+ if (result)
+ return result;
+
+ if (get_format40_flags(&sb_format) & (1 << FORMAT40_UNBALANCED_VOLUME))
+ reiser4_volume_set_unbalanced(super);
+ }
+ *stage = INIT_OID;
+
+ root_block = get_format40_root_block(&sb_format);
+ height = get_format40_tree_height(&sb_format);
+ nplug = node_plugin_by_id(get_format40_node_plugin_id(&sb_format));
+ /*
+ * initialize storage tree.
+ */
+ result = reiser4_subvol_init_tree(subv, &root_block, height, nplug);
+ if (result)
+ return result;
+ *stage = INIT_TREE;
+ /*
+ * set private subvolume parameters
+ */
+ subv->mkfs_id = get_format40_mkfs_id(&sb_format);
+ subv->version = format40_get_minor_version_nr(&sb_format);
+ subv->blocks_free_committed = subv->blocks_free;
+
+ subv->flush.relocate_threshold = FLUSH_RELOCATE_THRESHOLD;
+ subv->flush.relocate_distance = FLUSH_RELOCATE_DISTANCE;
+ subv->flush.written_threshold = FLUSH_WRITTEN_THRESHOLD;
+ subv->flush.scan_maxnodes = FLUSH_SCAN_MAXNODES;
+
+ if (update_backup_version(&sb_format))
+ printk("reiser4: %s: use 'fsck.reiser4 --fix' "
+ "to complete disk format upgrade.\n", super->s_id);
+ /*
+ * all formatted nodes in a subvolume managed by format40
+ * are of one plugin
+ */
+ subv->flags |= (1 << SUBVOL_ONE_NODE_PLUGIN);
+ /*
+ * Recover sb data which were logged separately from sb block
+ * NOTE-NIKITA: reiser4_journal_recover_sb_data() calls
+ * oid_init_allocator() and reiser4_set_free_blocks() with new
+ * data. What's the reason to call them above?
+ */
+ result = reiser4_journal_recover_sb_data(super, subv);
+ if (result)
+ return result;
+ *stage = JOURNAL_RECOVER;
+ /*
+	 * recover_sb_data() sets the actual number of free blocks,
+	 * so we need to update the number of used blocks.
+ */
+ reiser4_subvol_set_used_blocks(subv,
+ reiser4_subvol_block_count(subv) -
+ reiser4_subvol_free_blocks(subv));
+ reiser4_subvol_set_min_blocks_used(subv,
+ get_format40_min_occup(&sb_format));
+ /*
+ * init disk space allocator
+ */
+ result = sa_init_allocator(&subv->space_allocator, super, subv, NULL);
+ if (result)
+ return result;
+ *stage = INIT_SA;
+
+ result = get_sb_format_jnode(subv);
+ if (result)
+ return result;
+ *stage = INIT_JNODE;
+
+ reiser4_subvol_set_data_capacity(subv,
+ get_format40_data_capacity(&sb_format));
+ /*
+ * load addresses of volume configs
+ */
+ subv->volmap_loc[CUR_VOL_CONF] = get_format40_volinfo_loc(&sb_format);
+
+ if (vol->vol_plug->load_volume) {
+ result = vol->vol_plug->load_volume(subv);
+ if (result)
+ return result;
+ }
+ *stage = ALL_DONE;
+
+ printk("reiser4 (%s): using %s.\n", subv->name,
+ txmod_plugin_by_id(subv->txmod)->h.desc);
+ return 0;
+}
+
+static int init_format_generic(struct super_block *s,
+ reiser4_subvol *subv, int version)
+{
+ int result;
+ format40_init_stage stage;
+ reiser4_volume *vol;
+
+ vol = get_super_private(s)->vol;
+
+ result = try_init_format(s, &stage, subv, version);
+ switch (stage) {
+ case ALL_DONE:
+ assert("nikita-3458", result == 0);
+ break;
+ case INIT_SYSTAB:
+ case INIT_JNODE:
+ put_sb_format_jnode(subv);
+ /* fall through */
+ case INIT_SA:
+ sa_destroy_allocator(reiser4_get_space_allocator(subv),
+ s, subv);
+ /* fall through */
+ case JOURNAL_RECOVER:
+ case INIT_TREE:
+ reiser4_done_tree(&subv->tree);
+ /* fall through */
+ case INIT_OID:
+ case KEY_CHECK:
+ case READ_SUPER:
+ if (!sb_rdonly(s) &&
+ reiser4_subvol_free_blocks(subv) < RELEASE_RESERVED)
+ result = RETERR(-ENOSPC);
+ /* fall through */
+ case JOURNAL_REPLAY:
+ case INIT_STATUS:
+ reiser4_status_finish(subv);
+ /* fall through */
+ case INIT_JOURNAL_INFO:
+ reiser4_done_journal_info(subv);
+ /* fall through */
+ case NONE_DONE:
+ break;
+ default:
+ impossible("nikita-3457", "init stage: %i", stage);
+ }
+ return result;
+}
+
+int init_format_format40(struct super_block *s, reiser4_subvol *subv)
+{
+ return init_format_generic(s, subv, 0 /* version */);
+}
+
+int init_format_format41(struct super_block *s, reiser4_subvol *subv)
+{
+ return init_format_generic(s, subv, 1 /* version */);
+}
+
+static void pack_format40_super(const struct super_block *s,
+ reiser4_subvol *subv, char *data)
+{
+ format40_disk_super_block *format_sb =
+ (format40_disk_super_block *) data;
+ reiser4_volume *vol = super_volume(s);
+ lv_conf *conf = vol->conf;
+ u64 format_flags = get_format40_flags(format_sb);
+
+ assert("zam-591", data != NULL);
+
+ put_unaligned(cpu_to_le64(reiser4_subvol_free_committed_blocks(subv)),
+ &format_sb->free_blocks);
+
+ put_unaligned(cpu_to_le64(subv->tree.root_block),
+ &format_sb->root_block);
+
+ put_unaligned(cpu_to_le64(oid_next(s)), &format_sb->oid);
+
+ put_unaligned(cpu_to_le64(oids_used(s)), &format_sb->file_count);
+
+ put_unaligned(cpu_to_le16(subv->tree.height), &format_sb->tree_height);
+
+ put_unaligned(cpu_to_le64(subv->id), &format_sb->origin_id);
+
+ put_unaligned(cpu_to_le64(subv->data_capacity), &format_sb->data_capacity);
+
+ if (update_disk_version_minor(format_sb)) {
+ __u32 version = PLUGIN_LIBRARY_VERSION | FORMAT40_UPDATE_BACKUP;
+
+ put_unaligned(cpu_to_le32(version), &format_sb->version);
+ }
+ /*
+ * convert on-line state to on-disk flags
+ */
+ if (subv->flags & (1 << SUBVOL_TO_BE_REMOVED))
+ format_flags |= (1 << FORMAT40_TO_BE_REMOVED);
+ else
+ format_flags &= ~(1 << FORMAT40_TO_BE_REMOVED);
+
+ if (subv->flags & (1 << SUBVOL_IS_PROXY))
+ format_flags |= (1 << FORMAT40_IS_PROXY);
+ else
+ format_flags &= ~(1 << FORMAT40_IS_PROXY);
+
+ if (subv->flags & (1 << SUBVOL_HAS_DATA_ROOM))
+ format_flags |= (1 << FORMAT40_HAS_DATA_ROOM);
+ else
+ format_flags &= ~(1 << FORMAT40_HAS_DATA_ROOM);
+
+ if (is_meta_brick(subv)) {
+ if (reiser4_volume_is_unbalanced(s))
+ format_flags |= (1 << FORMAT40_UNBALANCED_VOLUME);
+ else
+ format_flags &= ~(1 << FORMAT40_UNBALANCED_VOLUME);
+
+ put_unaligned(cpu_to_le64(vol_nr_origins(vol)), &format_sb->nr_origins);
+
+ put_unaligned(cpu_to_le64(conf->nr_mslots), &format_sb->nr_mslots);
+
+ put_unaligned(cpu_to_le64(subv->volmap_loc[CUR_VOL_CONF]), &format_sb->volinfo_loc);
+
+ put_unaligned(vol->num_sgs_bits, &format_sb->num_sgs_bits);
+ }
+ put_unaligned(cpu_to_le64(format_flags), &format_sb->flags);
+}
+
+/**
+ * ->log_super() method of disk_format40 plugin.
+ * Return a jnode which should be added to a transaction when the super block
+ * gets logged
+ */
+jnode *log_super_format40(struct super_block *super, reiser4_subvol *subv)
+{
+ jload(subv->sb_jnode);
+ pack_format40_super(super, subv, jdata(subv->sb_jnode));
+ jrelse(subv->sb_jnode);
+
+ return subv->sb_jnode;
+}
+
+/**
+ * ->release() method of disk_format40 plugin
+ */
+int release_format40(struct super_block *s, reiser4_subvol *subv)
+{
+ sa_destroy_allocator(&subv->space_allocator, s, subv);
+ reiser4_done_journal_info(subv);
+ put_sb_format_jnode(subv);
+
+ rcu_barrier();
+ reiser4_done_tree(&subv->tree);
+ /*
+	 * call rcu_barrier(), because some znodes
+	 * were "released" in reiser4_done_tree()
+ */
+ rcu_barrier();
+
+ return 0;
+}
+
+#define FORMAT40_ROOT_LOCALITY 41
+#define FORMAT40_ROOT_OBJECTID 42
+
+/**
+ * ->root_dir_key() method of disk_format40 plugin
+ */
+const reiser4_key *root_dir_key_format40(const struct super_block *super
+ UNUSED_ARG)
+{
+ static const reiser4_key FORMAT40_ROOT_DIR_KEY = {
+ .el = {
+ __constant_cpu_to_le64((FORMAT40_ROOT_LOCALITY << 4) |
+ KEY_SD_MINOR),
+#if REISER4_LARGE_KEY
+ ON_LARGE_KEY(0ull,)
+#endif
+ __constant_cpu_to_le64(FORMAT40_ROOT_OBJECTID),
+ 0ull
+ }
+ };
+ return &FORMAT40_ROOT_DIR_KEY;
+}
+
+/**
+ * ->check_open() method of disk_format40 plugin
+ * Check the opened object for validity.
+ * For now it checks only for a valid oid & locality;
+ * this can be improved later, and its behavior may depend on
+ * the mount options.
+ */
+int check_open_format40(const struct inode *object)
+{
+ oid_t max, oid;
+
+ max = oid_next(object->i_sb) - 1;
+ /*
+ * Check the oid
+ */
+ oid = get_inode_oid(object);
+ if (oid > max) {
+ warning("vpf-1360", "The object with the oid %llu "
+			"greater than the max used oid %llu found.",
+ (unsigned long long)oid, (unsigned long long)max);
+
+ return RETERR(-EIO);
+ }
+ /*
+ * Check the locality
+ */
+ oid = reiser4_inode_data(object)->locality_id;
+ if (oid > max) {
+ warning("vpf-1361", "The object with the locality %llu "
+			"greater than the max used oid %llu found.",
+ (unsigned long long)oid, (unsigned long long)max);
+
+ return RETERR(-EIO);
+ }
+ return 0;
+}
+
+static int version_update_common(struct super_block *super,
+ reiser4_subvol *subv, int major)
+{
+ int ret;
+ lock_handle lh;
+
+ if (sb_rdonly(super) || subv->version >= get_release_number_minor())
+ return 0;
+
+ printk("reiser4 (%s): upgrading disk format to %d.%d.%d.\n",
+ subv->name,
+ get_format_number_principal(major),
+ major,
+ get_release_number_minor());
+
+ printk("reiser4 (%s): use 'fsck.reiser4 --fix' "
+ "to complete disk format upgrade.\n", subv->name);
+ /*
+ * Mark the uber znode dirty to call ->log_super() on write_logs
+ */
+ init_lh(&lh);
+ ret = get_uber_znode(&subv->tree, ZNODE_WRITE_LOCK,
+ ZNODE_LOCK_HIPRI, &lh);
+ if (ret) {
+ BUG_ON(ret > 0);
+ return ret;
+ }
+ znode_make_dirty(lh.node);
+ done_lh(&lh);
+ /*
+ * Backup blocks stuff in fsck makes me queasy - Edward.
+ */
+ return 1;
+}
+
+int version_update_format40(struct super_block *super, reiser4_subvol *subv)
+{
+ return version_update_common(super, subv, 0);
+}
+
+int version_update_format41(struct super_block *super, reiser4_subvol *subv)
+{
+ return version_update_common(super, subv, 1);
+}
+
+/*
+ * Local variables:
+ * c-indentation-style: "K&R"
+ * mode-name: "LC"
+ * c-basic-offset: 8
+ * tab-width: 8
+ * fill-column: 120
+ * scroll-step: 1
+ * End:
+ */
diff -urN --no-dereference linux-5.10.2.orig/fs/reiser4/plugin/disk_format/disk_format40.h linux-5.10.2/fs/reiser4/plugin/disk_format/disk_format40.h
--- linux-5.10.2.orig/fs/reiser4/plugin/disk_format/disk_format40.h 1970-01-01 01:00:00.000000000 +0100
+++ linux-5.10.2/fs/reiser4/plugin/disk_format/disk_format40.h 2020-12-23 16:07:46.122813187 +0100
@@ -0,0 +1,123 @@
+/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
+
+/*
+ * Objects of Standard Disk Layout for simple volumes (i.e. volumes
+ * associated with a single physical or logical (RAID, LVM) device).
+ */
+
+#ifndef __DISK_FORMAT40_H__
+#define __DISK_FORMAT40_H__
+
+/* magic for default reiser4 layout */
+#define FORMAT40_MAGIC "ReIsEr40FoRmAt"
+#define FORMAT40_OFFSET (REISER4_MASTER_OFFSET + PAGE_SIZE)
+
+#include "../../dformat.h"
+#include <linux/fs.h>
+
+typedef enum {
+ FORMAT40_LARGE_KEYS,
+ FORMAT40_UNBALANCED_VOLUME,
+ FORMAT40_HAS_DATA_ROOM,
+ FORMAT40_TO_BE_REMOVED,
+ FORMAT40_PLANB_KEY_ALLOC,
+ FORMAT40_IS_PROXY,
+} format40_flags;
+
+/* ondisk super block for format 40. It is 512 bytes long */
+typedef struct format40_disk_super_block {
+ /* 0 */ d64 block_count;
+	/* number of blocks in a filesystem */
+ /* 8 */ d64 free_blocks;
+ /* number of free blocks */
+ /* 16 */ d64 root_block;
+ /* filesystem tree root block */
+ /* 24 */ d64 oid;
+ /* smallest free objectid */
+ /* 32 */ d64 file_count;
+ /* number of files in a filesystem */
+ /* 40 */ d64 flushes;
+ /* number of times super block was
+ flushed. Needed if format 40
+	   ever has more than one super block */
+ /* 48 */ d32 mkfs_id;
+ /* unique identifier of fs */
+ /* 52 */ char magic[16];
+ /* magic string ReIsEr40FoRmAt */
+ /* 68 */ d16 tree_height;
+ /* height of filesystem tree */
+ /* 70 */ d16 formatting_policy;
+ /* not used anymore */
+ /* 72 */ d64 flags;
+ /* 80 */ d32 version;
+ /* on-disk format version number
+ initially assigned by mkfs as the greatest format40
+ version number supported by reiser4progs and updated
+	   at mount time in accordance with the greatest format40
+	   version number supported by the kernel.
+	   It is used by fsck to catch possible corruption and
+ for various compatibility issues */
+ /* 84 */ d32 node_pid; /* formatted node plugin id */
+
+ /* Reiser5 fields */
+ /* 88 */ d64 origin_id; /* internal ID of the subvolume. It gets assigned
+ once and never changes */
+ /* 96 */ d64 nr_origins; /* total number of original subvolumes in LV */
+ /* 104 */ d64 data_capacity;/* weight of the brick in data storage array */
+ /* 112 */ d64 volinfo_loc; /* location of the first block of system LV info */
+ /* 120 */ d8 num_sgs_bits; /* logarithm of total number of the hash-space
+ segments */
+ /* 121 */ d64 nr_mslots; /* number of mslots (== maximal brick ID + 1) */
+	/* 129 */ d64 min_occup; /* minimal possible number of occupied blocks on
+ * the partition (reserved area at the beginning
+ * of the partition + 2 super-blocks + 1 journal
+ * footer + 1 journal header + backup blocks that
+ * kernel is not aware of, etc). This is set by
+ * mkfs.reiser4 utility and never gets changed */
+ char not_used[375];
+} __attribute__((packed)) format40_disk_super_block;
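+
+/*
+ * A minimal compile-time sanity check of the layout above (a sketch;
+ * it assumes <linux/build_bug.h>, which provides static_assert() in this
+ * kernel series): 137 bytes of fields plus 375 bytes of padding must add
+ * up to the advertised 512 bytes.
+ */
+#include <linux/build_bug.h>
+static_assert(sizeof(format40_disk_super_block) == 512,
+	      "format40 on-disk super block must be 512 bytes");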
+
+/* Defines for journal header and footer respectively. */
+#define FORMAT40_JOURNAL_HEADER_BLOCKNR \
+ ((REISER4_MASTER_OFFSET / PAGE_SIZE) + 3)
+
+#define FORMAT40_JOURNAL_FOOTER_BLOCKNR \
+ ((REISER4_MASTER_OFFSET / PAGE_SIZE) + 4)
+
+#define FORMAT40_STATUS_BLOCKNR \
+ ((REISER4_MASTER_OFFSET / PAGE_SIZE) + 5)
+
+/* Diskmap declarations */
+#define FORMAT40_PLUGIN_DISKMAP_ID ((REISER4_FORMAT_PLUGIN_TYPE<<16) | (FORMAT40_ID))
+#define FORMAT40_SUPER 1
+#define FORMAT40_JH 2
+#define FORMAT40_JF 3
+
+/*
+ * declarations of functions implementing methods of layout plugin
+ * for format40. The functions themselves are in disk_format40.c
+ */
+extern int extract_subvol_id_format40(struct block_device *bdev, u64 *subv_id);
+extern int extract_subvol_id_format41(struct block_device *bdev, u64 *subv_id);
+extern int init_format_format40(struct super_block *, reiser4_subvol *);
+extern int init_format_format41(struct super_block *, reiser4_subvol *);
+extern const reiser4_key *root_dir_key_format40(const struct super_block *);
+extern int release_format40(struct super_block *s, reiser4_subvol *);
+extern jnode *log_super_format40(struct super_block *s, reiser4_subvol *);
+extern int check_open_format40(const struct inode *object);
+extern int version_update_format40(struct super_block *super, reiser4_subvol *);
+extern int version_update_format41(struct super_block *super, reiser4_subvol *);
+
+/* __DISK_FORMAT40_H__ */
+#endif
+
+/* Make Linus happy.
+ Local variables:
+ c-indentation-style: "K&R"
+ mode-name: "LC"
+ c-basic-offset: 8
+ tab-width: 8
+ fill-column: 120
+ scroll-step: 1
+ End:
+*/
diff -urN --no-dereference linux-5.10.2.orig/fs/reiser4/plugin/disk_format/disk_format41.c linux-5.10.2/fs/reiser4/plugin/disk_format/disk_format41.c
--- linux-5.10.2.orig/fs/reiser4/plugin/disk_format/disk_format41.c 1970-01-01 01:00:00.000000000 +0100
+++ linux-5.10.2/fs/reiser4/plugin/disk_format/disk_format41.c 2020-12-23 16:07:46.122813187 +0100
@@ -0,0 +1,10 @@
+/*
+ Local variables:
+ c-indentation-style: "K&R"
+ mode-name: "LC"
+ c-basic-offset: 8
+ tab-width: 8
+ fill-column: 80
+ scroll-step: 1
+ End:
+*/
diff -urN --no-dereference linux-5.10.2.orig/fs/reiser4/plugin/disk_format/disk_format41.h linux-5.10.2/fs/reiser4/plugin/disk_format/disk_format41.h
--- linux-5.10.2.orig/fs/reiser4/plugin/disk_format/disk_format41.h 1970-01-01 01:00:00.000000000 +0100
+++ linux-5.10.2/fs/reiser4/plugin/disk_format/disk_format41.h 2020-12-23 16:07:46.122813187 +0100
@@ -0,0 +1,15 @@
+#ifndef __DISK_FORMAT41_H__
+#define __DISK_FORMAT41_H__
+
+#endif /* __DISK_FORMAT41_H__ */
+
+/*
+ Local variables:
+ c-indentation-style: "K&R"
+ mode-name: "LC"
+ c-basic-offset: 8
+ tab-width: 8
+ fill-column: 80
+ scroll-step: 1
+ End:
+*/
diff -urN --no-dereference linux-5.10.2.orig/fs/reiser4/plugin/disk_format/disk_format.c linux-5.10.2/fs/reiser4/plugin/disk_format/disk_format.c
--- linux-5.10.2.orig/fs/reiser4/plugin/disk_format/disk_format.c 1970-01-01 01:00:00.000000000 +0100
+++ linux-5.10.2/fs/reiser4/plugin/disk_format/disk_format.c 2020-12-23 16:07:46.122813187 +0100
@@ -0,0 +1,56 @@
+/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
+
+#include "../../debug.h"
+#include "../plugin_header.h"
+#include "disk_format40.h"
+#include "disk_format.h"
+#include "../plugin.h"
+
+/* initialization of disk layout plugins */
+disk_format_plugin format_plugins[LAST_FORMAT_ID] = {
+ [FORMAT40_ID] = {
+ .h = {
+ .type_id = REISER4_FORMAT_PLUGIN_TYPE,
+ .id = FORMAT40_ID,
+ .pops = NULL,
+ .label = "format40",
+ .desc = "standard disk layout for simple volumes",
+ .linkage = {NULL, NULL}
+ },
+ .extract_subvol_id = extract_subvol_id_format40,
+ .init_format = init_format_format40,
+ .root_dir_key = root_dir_key_format40,
+ .release_format = release_format40,
+ .log_super = log_super_format40,
+ .check_open = check_open_format40,
+ .version_update = version_update_format40,
+ },
+ [FORMAT41_ID] = {
+ .h = {
+ .type_id = REISER4_FORMAT_PLUGIN_TYPE,
+ .id = FORMAT41_ID,
+ .pops = NULL,
+ .label = "format41",
+ .desc = "standard disk layout for compound volumes",
+ .linkage = {NULL, NULL}
+ },
+ .extract_subvol_id = extract_subvol_id_format41,
+ .init_format = init_format_format41,
+ .root_dir_key = root_dir_key_format40,
+ .release_format = release_format40,
+ .log_super = log_super_format40,
+ .check_open = check_open_format40,
+ .version_update = version_update_format41,
+ }
+};
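+
+/*
+ * Note that format41 reuses the format40 implementations of ->root_dir_key(),
+ * ->release_format(), ->log_super() and ->check_open(); only subvolume-ID
+ * extraction, format initialization and version update differ, and those
+ * reduce to the version number passed to init_format_generic() and
+ * version_update_common() in disk_format40.c.
+ */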
+
+/* Make Linus happy.
+ Local variables:
+ c-indentation-style: "K&R"
+ mode-name: "LC"
+ c-basic-offset: 8
+ tab-width: 8
+ fill-column: 120
+ scroll-step: 1
+ End:
+*/
diff -urN --no-dereference linux-5.10.2.orig/fs/reiser4/plugin/disk_format/disk_format.h linux-5.10.2/fs/reiser4/plugin/disk_format/disk_format.h
--- linux-5.10.2.orig/fs/reiser4/plugin/disk_format/disk_format.h 1970-01-01 01:00:00.000000000 +0100
+++ linux-5.10.2/fs/reiser4/plugin/disk_format/disk_format.h 2020-12-23 16:07:46.122813187 +0100
@@ -0,0 +1,28 @@
+/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
+
+/* identifiers for disk layouts; they are also used as indexes in the array
+   of disk format plugins */
+
+#if !defined( __REISER4_DISK_FORMAT_H__ )
+#define __REISER4_DISK_FORMAT_H__
+
+typedef enum {
+ /* standard reiser4 disk layout plugin id */
+ FORMAT40_ID,
+ FORMAT41_ID,
+ LAST_FORMAT_ID
+} disk_format_id;
+
+/* __REISER4_DISK_FORMAT_H__ */
+#endif
+
+/* Make Linus happy.
+ Local variables:
+ c-indentation-style: "K&R"
+ mode-name: "LC"
+ c-basic-offset: 8
+ tab-width: 8
+ fill-column: 120
+ scroll-step: 1
+ End:
+*/
diff -urN --no-dereference linux-5.10.2.orig/fs/reiser4/plugin/disk_format/Makefile linux-5.10.2/fs/reiser4/plugin/disk_format/Makefile
--- linux-5.10.2.orig/fs/reiser4/plugin/disk_format/Makefile 1970-01-01 01:00:00.000000000 +0100
+++ linux-5.10.2/fs/reiser4/plugin/disk_format/Makefile 2020-12-23 16:07:46.122813187 +0100
@@ -0,0 +1,5 @@
+obj-$(CONFIG_REISER4_FS) += df_plugins.o
+
+df_plugins-objs := \
+ disk_format40.o \
+ disk_format.o
diff -urN --no-dereference linux-5.10.2.orig/fs/reiser4/plugin/dst/dst.c linux-5.10.2/fs/reiser4/plugin/dst/dst.c
--- linux-5.10.2.orig/fs/reiser4/plugin/dst/dst.c 1970-01-01 01:00:00.000000000 +0100
+++ linux-5.10.2/fs/reiser4/plugin/dst/dst.c 2020-12-23 16:07:46.122813187 +0100
@@ -0,0 +1,90 @@
+/*
+ Copyright (c) 2014-2020 Eduard O. Shishkin
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+#include <linux/kernel.h>
+#include <asm/types.h>
+#include "../../debug.h"
+#include "../../inode.h"
+#include "../plugin.h"
+#include "../volume/volume.h"
+#include "dst.h"
+
+static u64 lookup_triv(reiser4_dcx *rdcx, const struct inode *inode,
+ const char *str, int len, u32 seed, void *tab)
+{
+ return METADATA_SUBVOL_ID;
+}
+
+distribution_plugin distribution_plugins[LAST_DISTRIB_ID] = {
+ [TRIV_DISTRIB_ID] = {
+ .h = {
+ .type_id = REISER4_DISTRIBUTION_PLUGIN_TYPE,
+ .id = TRIV_DISTRIB_ID,
+ .pops = NULL,
+ .label = "triv",
+ .desc = "Trivial Distribution",
+ .linkage = {NULL, NULL}
+ },
+ .seg_bits = 0,
+ .r = {
+ .init = NULL,
+ .lookup = lookup_triv,
+ .done = NULL,
+ },
+ .v = {
+ .init = NULL,
+ .done = NULL,
+ .inc = NULL,
+ .dec = NULL,
+ .spl = NULL,
+ .pack = NULL,
+ .unpack = NULL,
+ .dump = NULL,
+ }
+ },
+ [FSX32M_DISTRIB_ID] = {
+ .h = {
+ .type_id = REISER4_DISTRIBUTION_PLUGIN_TYPE,
+ .id = FSX32M_DISTRIB_ID,
+ .pops = NULL,
+ .label = "fsx32m",
+ .desc = "Fiber-Striping over 32-bit Murmur hash",
+ .linkage = {NULL, NULL}
+ },
+		.seg_bits = 2, /* log2(sizeof(u32)) */
+ .r = {
+ .init = initr_fsx32,
+ .lookup = lookup_fsx32m,
+ .replace = replace_fsx32,
+ .free = free_fsx32,
+ .done = doner_fsx32
+ },
+ .v = {
+ .init = initv_fsx32,
+ .done = donev_fsx32,
+ .inc = inc_fsx32,
+ .dec = dec_fsx32,
+ .spl = spl_fsx32,
+ .pack = pack_fsx32,
+ .unpack = unpack_fsx32,
+ .dump = dump_fsx32,
+ }
+ },
+};
+
+/*
+ Local variables:
+ c-indentation-style: "K&R"
+ mode-name: "LC"
+ c-basic-offset: 8
+ tab-width: 8
+ fill-column: 80
+ scroll-step: 1
+ End:
+*/
diff -urN --no-dereference linux-5.10.2.orig/fs/reiser4/plugin/dst/dst.h linux-5.10.2/fs/reiser4/plugin/dst/dst.h
--- linux-5.10.2.orig/fs/reiser4/plugin/dst/dst.h 1970-01-01 01:00:00.000000000 +0100
+++ linux-5.10.2/fs/reiser4/plugin/dst/dst.h 2020-12-23 16:07:46.122813187 +0100
@@ -0,0 +1,21 @@
+#ifndef DST_H
+#define DST_H
+
+#include "fsx32.h"
+
+union reiser4_dcx {
+ struct fsx32_dcx fsx32;
+};
+
+#endif /* DST_H */
+
+/*
+ Local variables:
+ c-indentation-style: "K&R"
+ mode-name: "LC"
+ c-basic-offset: 8
+ tab-width: 8
+ fill-column: 80
+ scroll-step: 1
+ End:
+*/
diff -urN --no-dereference linux-5.10.2.orig/fs/reiser4/plugin/dst/fsx32.c linux-5.10.2/fs/reiser4/plugin/dst/fsx32.c
--- linux-5.10.2.orig/fs/reiser4/plugin/dst/fsx32.c 1970-01-01 01:00:00.000000000 +0100
+++ linux-5.10.2/fs/reiser4/plugin/dst/fsx32.c 2020-12-23 16:07:46.122813187 +0100
@@ -0,0 +1,975 @@
+/*
+ Balanced Fiber-Striped eXtendable array with Weights.
+ Inventor, Author: Eduard O. Shishkin
+ Implementation over 32-bit hash.
+ Adapted for use in Reiser4.
+
+ Copyright (c) 2014-2020 Eduard O. Shishkin
+*/
+
+#include <linux/kernel.h>
+#include <asm/types.h>
+#include "../../debug.h"
+#include "../../inode.h"
+#include "../plugin.h"
+#include "dst.h"
+
+#define MIN_SGS_BITS 10
+#define MAX_SGS_BITS 20
+#define MAX_BUCKETS (1u << MAX_SGS_BITS)
+#define MAX_DIFFER_BITS 19
+#define MAX_DATA_CAPACITY 0xffffffffffffffffull
+
+static inline void *fsx32_alloc(u64 len)
+{
+ void *result = reiser4_vmalloc(len * sizeof(u32));
+ if (result)
+ memset(result, 0, len * sizeof(u32));
+ return result;
+}
+
+static inline void *fsx64_alloc(u64 len)
+{
+ void *result = reiser4_vmalloc(len * sizeof(u64));
+ if (result)
+ memset(result, 0, len * sizeof(u64));
+ return result;
+}
+
+static inline void fsx_free(void *p)
+{
+ vfree(p);
+}
+
+static inline struct fsx32_dcx *fsx32_private(reiser4_dcx *dcx)
+{
+ return &dcx->fsx32;
+}
+
+static void init_apxs_by_tab(u32 numb,
+ u32 nums_bits,
+ u32 *tab,
+ bucket_t *vec,
+ void *(*apx_at)(bucket_t *vec, u64 idx),
+ u32 (*id2idx)(u64 id),
+ u32 *weights)
+{
+ u32 i;
+ u32 nums = 1 << nums_bits;
+
+ for(i = 0; i < numb; i++)
+ weights[i] = 0;
+
+ for(i = 0; i < nums; i++) {
+ u32 *apx;
+
+ apx = apx_at(vec, id2idx(tab[i]));
+ apx[(weights[id2idx(tab[i])])++] = i;
+ }
+}
+
+static void init_tab_by_apxs(u32 numb,
+ u32 *tab,
+ bucket_t *vec,
+ void *(*apx_at)(bucket_t *vec, u64 idx),
+ u64 (*idx2id)(u32 idx),
+ u32 *weights)
+{
+ u32 i, j;
+
+ for(i = 0; i < numb; i++)
+ for (j = 0; j < weights[i]; j++) {
+ u32 *apx;
+ apx = apx_at(vec, i);
+ tab[apx[j]] = idx2id(i);
+ }
+}
+
+u32 *init_tab_from_scratch(u32 *weights, u32 numb, u32 nums_bits,
+ u64 (*idx2id)(u32 idx))
+{
+ u32 i, j, k;
+ u32 *tab;
+ u32 nums = 1 << nums_bits;
+
+ tab = fsx32_alloc(nums);
+ if (!tab)
+ return NULL;
+ for (i = 0, k = 0; i < numb; i++)
+ for (j = 0; j < weights[i]; j++)
+ tab[k++] = idx2id(i);
+ return tab;
+}
+
+static void calibrate(u64 num, u64 val,
+ bucket_t *vec, u64 (*vec_el_get)(bucket_t *vec, u64 idx),
+ void *ret, u64 (*ret_el_get)(void *ret, u64 idx),
+ void (ret_el_set)(void *ret, u64 idx, u64 value))
+{
+ u64 i;
+ u64 rest;
+ u64 sum_scaled = 0;
+ u64 sum_not_scaled = 0;
+
+ for (i = 0; i < num; i++)
+ sum_not_scaled += vec_el_get(vec, i);
+ for (i = 0; i < num; i++) {
+ u64 q;
+ u64 result;
+
+ q = val * vec_el_get(vec, i);
+ result = div64_u64(q, sum_not_scaled);
+ ret_el_set(ret, i, result);
+ sum_scaled += result;
+ }
+ rest = val - sum_scaled;
+
+ for (i = 0; i < rest; i++)
+ ret_el_set(ret, i, ret_el_get(ret, i) + 1);
+ return;
+}
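+
+/*
+ * A worked example of the proportional split above (illustrative only):
+ * distributing val = 10 over capacities {1, 1, 1} yields floor shares
+ * {3, 3, 3} with sum_scaled = 9, so rest = 1 and the first element is
+ * bumped, giving {4, 3, 3}.  The resulting shares always sum to exactly
+ * @val.
+ */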
+
+static u64 array32_el_get(void *array, u64 idx)
+{
+ return ((u32 *)array)[idx];
+}
+
+static void array32_el_set(void *array, u64 idx, u64 val)
+{
+ ((u32 *)array)[idx] = val;
+}
+
+static u64 array64_el_get(void *array, u64 idx)
+{
+ return ((u64 *)array)[idx];
+}
+
+static void array64_el_set(void *array, u64 idx, u64 val)
+{
+ ((u64 *)array)[idx] = val;
+}
+
+static void calibrate32(u32 num, u32 val, bucket_t *vec,
+ u64 (*vec_el_get)(bucket_t *vec, u64 idx),
+ u32 *ret)
+{
+ calibrate(num, val, vec, vec_el_get,
+ ret, array32_el_get, array32_el_set);
+}
+
+static void calibrate64(u64 num, u64 val, bucket_t *vec,
+ u64 (*vec_el_get)(bucket_t *vec, u64 idx),
+ u64 *ret)
+{
+ calibrate(num, val, vec, vec_el_get,
+ ret, array64_el_get, array64_el_set);
+}
+
+int create_systab(u32 nums_bits, u32 **tab,
+ u32 numb, u32 *weights, bucket_t *vec,
+ void *(*apx_at)(bucket_t *vec, u64 idx),
+ u64 (*idx2id)(u32 idx))
+{
+ u32 nums = 1 << nums_bits;
+
+ *tab = fsx32_alloc(nums);
+	if (!*tab)
+ return -ENOMEM;
+
+ init_tab_by_apxs(numb, *tab, vec, apx_at, idx2id, weights);
+ return 0;
+}
+
+static int clone_systab(struct fsx32_dcx *dcx, const void *tab)
+{
+ assert("edward-2169", dcx != NULL);
+ assert("edward-2170", tab != NULL);
+ assert("edward-2171", dcx->tab == NULL);
+
+ dcx->tab = fsx32_alloc(1 << dcx->nums_bits);
+ if (!dcx->tab)
+ return -ENOMEM;
+ memcpy(dcx->tab, tab, (1 << dcx->nums_bits) * sizeof(u32));
+ return 0;
+}
+
+static void free_cloned_systab(struct fsx32_dcx *dcx)
+{
+ if (dcx->tab) {
+ fsx_free(dcx->tab);
+ dcx->tab = NULL;
+ }
+}
+
+static int create_apxs(u32 nums_bits, u32 *tab,
+ u32 numb, u32 *weights, bucket_t *vec,
+ void *(*apx_at)(bucket_t *vec, u64 idx),
+ void (*apx_set_at)(bucket_t *vec, u64 idx, void *apx),
+ u64 *(*apx_lenp_at)(bucket_t *vec, u64 idx),
+ u32 (*id2idx)(u64 id))
+{
+ u32 i;
+ for(i = 0; i < numb; i++) {
+ u32 *apx;
+ u64 *apx_lenp;
+
+ apx = fsx32_alloc(weights[i]);
+ if (!apx)
+ return RETERR(-ENOMEM);
+ apx_set_at(vec, i, apx);
+ apx_lenp = apx_lenp_at(vec, i);
+ *apx_lenp = weights[i];
+ }
+ init_apxs_by_tab(numb,
+ nums_bits, tab, vec, apx_at, id2idx, weights);
+
+ for (i = 0; i < numb; i++)
+ assert("edward-1901",
+ weights[i] == *(apx_lenp_at(vec, i)));
+ return 0;
+}
+
+#if REISER4_DEBUG
+void print_apx(u32 id, bucket_t *vec,
+ void *(*apx_at)(bucket_t *vec, u64 idx),
+ u64 *(*apx_lenp_at)(bucket_t *vec, u64 idx))
+{
+ u32 i;
+ u32 *apx = apx_at(vec, id);
+ u32 apx_len = *apx_lenp_at(vec, id);
+
+ printk("apx %d (len %d):", id, apx_len);
+ for (i = 0; i < apx_len; i++)
+ printk("%d", apx[i]);
+ printk("end of apx %d", id);
+ return;
+}
+#endif
+
+static void release_apxs(u32 numb, bucket_t *vec,
+ void *(*apx_at)(bucket_t *vec, u64 idx),
+ void (*apx_set_at)(bucket_t *vec, u64 idx,
+ void *apx))
+{
+ u32 i;
+
+ for(i = 0; i < numb; i++) {
+ u32 *apx;
+ apx = apx_at(vec, i);
+ fsx_free(apx);
+ apx_set_at(vec, i, NULL);
+ }
+}
+
+static int replace_apxs(u32 nums_bits, u32 *tab,
+ u32 old_numb, u32 new_numb,
+ u32 *weights, bucket_t *vec,
+ void *(*apx_at)(bucket_t *vec, u64 idx),
+ void (*apx_set_at)(bucket_t *vec, u64 idx, void *apx),
+ u64 *(*apx_lenp_at)(bucket_t *vec, u64 idx),
+ u32 (*id2idx)(u64 id))
+{
+ release_apxs(old_numb, vec, apx_at, apx_set_at);
+ return create_apxs(nums_bits, tab, new_numb, weights, vec,
+ apx_at, apx_set_at, apx_lenp_at, id2idx);
+}
+
+/**
+ * @vec: new array of abstract buckets
+ * @new: a bucket to be added
+ * @target_pos: index of @new in the @vec
+ */
+static int balance_inc(struct fsx32_dcx *dcx,
+ u32 new_numb, u32 *tab,
+ u32 *old_weights, u32 *new_weights,
+ u32 target_pos,
+ bucket_t *vec,
+ void *(*apx_at)(bucket_t *vec, u64 idx),
+ u64 (*idx2id)(u32 idx),
+ bucket_t new)
+{
+ int ret = 0;
+ u32 i, j;
+ u32 *exc = NULL;
+
+ exc = fsx32_alloc(new_numb);
+ if (!exc) {
+ ret = -ENOMEM;
+ goto exit;
+ }
+ dcx->exc = exc;
+
+ for (i = 0; i < target_pos; i++)
+ exc[i] = old_weights[i] - new_weights[i];
+
+ for(i = target_pos + 1; i < new_numb; i++) {
+ if (new)
+ exc[i] = old_weights[i-1] - new_weights[i];
+ else
+ exc[i] = old_weights[i] - new_weights[i];
+ }
+ assert("edward-1910", exc[target_pos] == 0);
+
+ for(i = 0; i < target_pos; i++)
+ for(j = 0; j < exc[i]; j++) {
+ u32 *apx;
+ apx = apx_at(vec, i);
+
+ assert("edward-1902",
+ tab[apx[new_weights[i] + j]] == idx2id(i));
+
+ tab[apx[new_weights[i] + j]] = idx2id(target_pos);
+ }
+
+ for(i = target_pos + 1; i < new_numb; i++) {
+ for(j = 0; j < new_weights[i]; j++) {
+ u32 *apx;
+ apx = apx_at(vec, i);
+ assert("edward-1913", tab[apx[j]] == idx2id(i));
+ }
+ for(j = 0; j < exc[i]; j++) {
+ u32 *apx;
+ apx = apx_at(vec, i);
+ assert("edward-1914",
+ tab[apx[new_weights[i] + j]] == idx2id(i));
+ tab[apx[new_weights[i] + j]] = idx2id(target_pos);
+ }
+ }
+ exit:
+ if (exc)
+ fsx_free(exc);
+ return ret;
+}
+
+/**
+ * @vec: new array of abstract buckets
+ * @removeme: bucket to be removed
+ * @target_pos: index (in @vec) of @removeme
+ */
+static int balance_dec(struct fsx32_dcx *dcx,
+ u32 new_numb, u32 *tab,
+ u32 *old_weights, u32 *new_weights,
+ u32 target_pos,
+ bucket_t *vec,
+ void *(*apx_at)(bucket_t *vec, u64 idx),
+ void *(*apx_of)(bucket_t bucket),
+ u64 (*idx2id)(u32 idx),
+ bucket_t removeme)
+{
+ int ret = 0;
+ u32 i, j;
+ u32 off_in_target = 0;
+ u32 *sho;
+ u32 *target;
+
+ sho = fsx32_alloc(new_numb);
+ if (!sho) {
+ ret = -ENOMEM;
+ goto exit;
+ }
+ dcx->sho = sho;
+
+ for(i = 0; i < target_pos; i++)
+ sho[i] = new_weights[i] - old_weights[i];
+
+ for(i = target_pos; i < new_numb; i++) {
+ if (removeme)
+ sho[i] = new_weights[i] - old_weights[i+1];
+ else
+ sho[i] = new_weights[i] - old_weights[i];
+ }
+
+ if (removeme) {
+ target = apx_of(removeme);
+ off_in_target = 0;
+ } else {
+ target = apx_at(vec, target_pos);
+ off_in_target = new_weights[target_pos];
+ }
+ /*
+ * distribute segments among all apxs to the left of target_pos
+ */
+ for(i = 0; i < target_pos; i++)
+ for(j = 0; j < sho[i]; j++) {
+ tab[target[off_in_target ++]] = idx2id(i);
+ }
+ /*
+ * distribute segments among all apxs to the right of target_pos
+ */
+ if (removeme)
+ for(i = target_pos; i < new_numb; i++) {
+ for(j = 0; j < sho[i]; j++) {
+ tab[target[off_in_target ++]] = idx2id(i);
+ }
+ }
+ else
+ for(i = target_pos + 1; i < new_numb; i++) {
+ for(j = 0; j < sho[i]; j++) {
+ tab[target[off_in_target ++]] = idx2id(i);
+ }
+ }
+ exit:
+ if (sho)
+ fsx_free(sho);
+ return ret;
+}
+
+static int balance_spl(u32 numb, u32 nums_bits,
+ const u32 *old_tab,
+ u32 **result,
+ u32 *old_weights, u32 *new_weights,
+ u32 fact_bits,
+ void *vec,
+ void *(*apx_at)(bucket_t *vec, u64 idx),
+ void (*apx_set_at)(bucket_t *vec,
+ u64 idx, void *apx),
+ u64 *(*apx_lenp_at)(bucket_t *vec, u64 idx),
+ u32 (*id2idx)(u64 id), u64 (*idx2id)(u32 idx))
+{
+ u32 ret = 0;
+	int ret = 0;
+ u32 nums;
+
+ u32 *tab = NULL;
+ u32 *exc = NULL;
+ u32 num_exc;
+ u32 *sho = NULL;
+ u32 num_sho;
+ u32 *reloc = NULL;
+ u32 num_reloc;
+ u32 factor;
+
+ assert("edward-1904", numb <= MAX_BUCKETS);
+
+ if (nums_bits + fact_bits > MAX_SGS_BITS) {
+ warning("edward-2399",
+ "Scale factor %u is too large", 1 << fact_bits);
+ return -EINVAL;
+ }
+
+ nums = 1 << nums_bits;
+ factor = 1 << fact_bits;
+
+ num_exc = (nums * factor) % numb;
+ num_sho = numb - num_exc;
+
+ if (num_exc) {
+		exc = fsx32_alloc(numb);
+		if (!exc) {
+			ret = -ENOMEM;
+			goto error;
+		}
+
+ sho = exc + num_exc;
+
+ for(i = 0; i < num_exc; i++) {
+ assert("edward-2400",
+ factor * old_weights[i] >= new_weights[i]);
+
+ exc[i] = factor * old_weights[i] - new_weights[i];
+ }
+ for(i = 0; i < num_sho; i++) {
+ assert("edward-2401",
+ new_weights[i + num_exc] >=
+ factor * old_weights[i + num_exc]);
+
+ sho[i] = new_weights[i + num_exc] -
+ factor * old_weights[i + num_exc];
+ }
+ }
+ tab = fsx32_alloc(nums * factor);
+ if (!tab) {
+ ret = -ENOMEM;
+ goto error;
+ }
+ for(i = 0; i < nums; i++)
+ for(j = 0; j < factor; j++)
+ tab[i * factor + j] = old_tab[i];
+ if (!num_exc)
+ goto release;
+
+ for (i = 0; i < numb; i++)
+ old_weights[i] *= factor;
+
+ ret = replace_apxs(nums_bits + fact_bits, tab,
+ numb, numb, old_weights, vec,
+ apx_at, apx_set_at, apx_lenp_at, id2idx);
+ if (ret)
+ goto error;
+
+ for (i = 0, num_reloc = 0; i < num_exc; i++)
+ num_reloc += exc[i];
+
+ if (num_reloc == 0)
+ goto release;
+
+ reloc = fsx32_alloc(num_reloc);
+ if (!reloc) {
+		ret = RETERR(-ENOMEM);
+ goto error;
+ }
+ for (i = 0, k = 0; i < num_exc; i++)
+ for (j = 0; j < exc[i]; j++) {
+ u32 *apx;
+ apx = apx_at(vec, i);
+ reloc[k++] = apx[new_weights[i] + j];
+ }
+ for (i = 0, k = 0; i < num_sho; i++)
+ for (j = 0; j < sho[i]; j++)
+ tab[reloc[k++]] = idx2id(num_exc + i);
+ release:
+ release_apxs(numb, vec, apx_at, apx_set_at);
+ *result = tab;
+ goto exit;
+ error:
+ if (tab)
+ fsx_free(tab);
+ exit:
+ if (exc)
+ fsx_free(exc);
+ if (reloc)
+ fsx_free(reloc);
+ return ret;
+}
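+
+/*
+ * Rough illustration of the split above: with fact_bits = 1 every hash
+ * segment is split in two, so a 1024-entry table becomes a 2048-entry one
+ * in which entries 2*i and 2*i + 1 initially inherit the brick id of old
+ * entry i; any excess segments are then relocated to the under-weighted
+ * bricks via the reloc[] array.
+ */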
+
+void donev_fsx32(reiser4_dcx *rdcx)
+{
+ struct fsx32_dcx *dcx;
+
+ dcx = fsx32_private(rdcx);
+
+ if (dcx->weights != NULL) {
+ fsx_free(dcx->weights);
+ dcx->weights = NULL;
+ }
+}
+
+/**
+ * Set newly created distribution table to @target
+ */
+void replace_fsx32(reiser4_dcx *rdcx, void **target)
+{
+ struct fsx32_dcx *dcx = fsx32_private(rdcx);
+
+ assert("edward-2236", target != NULL);
+ assert("edward-2237", *target == NULL);
+
+ *target = dcx->tab;
+ dcx->tab = NULL;
+}
+
+void free_fsx32(void *tab)
+{
+ assert("edward-2238", tab != NULL);
+ fsx_free(tab);
+}
+
+/**
+ * Initialize distribution context for regular file operations
+ */
+int initr_fsx32(reiser4_dcx *rdcx, void **tab, int nums_bits)
+{
+ struct fsx32_dcx *dcx = fsx32_private(rdcx);
+
+ if (*tab != NULL)
+ return 0;
+
+ if (nums_bits < MIN_SGS_BITS) {
+ warning("edward-1953",
+ "Bad number of hash space segments (%llu). "
+			"It should not be less than %llu",
+ 1ull << nums_bits, 1ull << MIN_SGS_BITS);
+ return -EINVAL;
+ }
+ *tab = fsx32_alloc(1 << nums_bits);
+ if (*tab == NULL)
+ return -ENOMEM;
+
+ dcx->nums_bits = nums_bits;
+ return 0;
+}
+
+void doner_fsx32(void **tab)
+{
+ assert("edward-2260", tab != NULL);
+
+ if (*tab) {
+ fsx_free(*tab);
+ *tab = NULL;
+ }
+}
+
+/**
+ * Initialize distribution context for volume operations
+ *
+ * @tab: system (distribution) table; allocated here if not yet present;
+ * @numb: number of abstract buckets (the buckets themselves come from
+ * current_buckets());
+ * @nums_bits: logarithm of the number of hash space segments;
+ * @rdcx: distribution context to be initialized.
+ */
+int initv_fsx32(void **tab, u64 numb, int nums_bits,
+ reiser4_dcx *rdcx)
+{
+ int ret = -ENOMEM;
+ u32 nums;
+ struct fsx32_dcx *dcx;
+ struct bucket_ops *ops = current_bucket_ops();
+
+ if (numb == 0 || nums_bits >= MAX_SGS_BITS)
+ return -EINVAL;
+
+ nums = 1 << nums_bits;
+ if (numb >= nums)
+ return -EINVAL;
+
+ dcx = fsx32_private(rdcx);
+
+ assert("edward-2172", dcx->tab == NULL);
+ assert("edward-1922", dcx->weights == NULL);
+ assert("edward-2261", tab != NULL);
+ assert("edward-2336", current_buckets() != NULL);
+
+ dcx->numb = numb;
+ dcx->weights = fsx32_alloc(numb);
+ if (!dcx->weights)
+ goto error;
+
+ calibrate32(numb, nums, current_buckets(),
+ ops->cap_at, dcx->weights);
+
+ if (*tab == NULL) {
+ u32 i;
+ assert("edward-2201", numb == 1);
+
+ ret = initr_fsx32(rdcx, tab, nums_bits);
+ if (ret)
+ goto error;
+ for (i = 0; i < nums; i++)
+ (*(u32 **)tab)[i] = ops->idx2id(0);
+ }
+ assert("edward-2173", *tab != NULL);
+
+ ret = create_apxs(nums_bits, *tab,
+ numb, dcx->weights, current_buckets(),
+ ops->apx_at,
+ ops->apx_set_at,
+ ops->apx_lenp_at,
+ ops->id2idx);
+ if (ret)
+ goto error;
+ return 0;
+ error:
+ doner_fsx32(tab);
+ donev_fsx32(rdcx);
+ return ret;
+}
+
+u64 lookup_fsx32m(reiser4_dcx *rdcx, const struct inode *inode,
+ const char *str, int len, u32 seed, void *tab)
+{
+ u32 hash;
+ struct fsx32_dcx *dcx = fsx32_private(rdcx);
+
+ hash = murmur3_x86_32(str, len, seed);
+ return ((u32 *)tab)[hash >> (32 - dcx->nums_bits)];
+}
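+
+/*
+ * For example, with nums_bits = 10 (the minimum, see MIN_SGS_BITS) the top
+ * ten bits of the 32-bit hash select one of 1024 hash-space segments, and
+ * the system table maps that segment to a brick (subvolume) id.
+ */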
+
+static int check_maxdiff(reiser4_dcx *rdcx, u64 numb)
+{
+ u64 i;
+ u64 min = MAX_DATA_CAPACITY;
+ u64 max = 0;
+ bucket_t *vec = current_buckets();
+ struct bucket_ops *ops = current_bucket_ops();
+
+ assert("edward-2390", numb >= 1);
+
+ for (i = 0; i < numb; i++) {
+ if (min > ops->cap_at(vec, i))
+ min = ops->cap_at(vec, i);
+ if (max < ops->cap_at(vec, i))
+ max = ops->cap_at(vec, i);
+ }
+ assert("edward-2391", min != 0);
+
+ if ((div64_u64(max, min)) >> MAX_DIFFER_BITS != 0) {
+ warning("edward-2392",
+ "Capacities %llu and %llu differ too much",
+ min, max);
+ return RETERR(-EINVAL);
+ }
+ return 0;
+}
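+
+/*
+ * In other words, the ratio between the largest and the smallest brick
+ * capacity must stay below 2^MAX_DIFFER_BITS (524288).  For instance,
+ * mixing a 1 MiB brick with a 1 TiB brick (ratio 2^20) would be rejected,
+ * while a 1 GiB brick next to a 100 TiB one (ratio 102400) is still
+ * accepted.
+ */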
+
+int inc_fsx32(reiser4_dcx *rdcx, const void *tab, u64 target_pos, bucket_t new)
+{
+ int ret = 0;
+ u32 *new_weights;
+ u32 old_numb, new_numb, nums;
+ struct fsx32_dcx *dcx = fsx32_private(rdcx);
+ struct bucket_ops *ops = current_bucket_ops();
+
+ new_numb = old_numb = dcx->numb;
+ if (new) {
+ if (old_numb == MAX_BUCKETS)
+ return -EINVAL;
+ new_numb ++;
+ }
+ nums = 1 << dcx->nums_bits;
+ if (new_numb > nums) {
+ warning("edward-2337",
+ "Can not add bucket: current limit (%u) reached",
+ nums);
+ return -EINVAL;
+ }
+ ret = check_maxdiff(rdcx, new_numb);
+ if (ret)
+ return ret;
+ new_weights = fsx32_alloc(new_numb);
+ if (!new_weights) {
+ ret = -ENOMEM;
+ goto error;
+ }
+ dcx->new_weights = new_weights;
+
+ ret = clone_systab(dcx, tab);
+ if (ret)
+ goto error;
+
+ calibrate32(new_numb, nums,
+ current_buckets(), ops->cap_at, new_weights);
+ ret = balance_inc(dcx,
+ new_numb, dcx->tab,
+ dcx->weights, new_weights, target_pos,
+ current_buckets(), ops->apx_at,
+ ops->idx2id, new);
+ if (ret)
+ goto error;
+
+ release_apxs(new_numb, current_buckets(),
+ ops->apx_at, ops->apx_set_at);
+
+ fsx_free(dcx->weights);
+ dcx->weights = new_weights;
+ dcx->numb = new_numb;
+
+ return 0;
+ error:
+ if (new_weights)
+ fsx_free(new_weights);
+ free_cloned_systab(dcx);
+ return ret;
+}
+
+/**
+ * Check if there is enough capacity on abstract buckets
+ * for successful completion of an operation.
+ *
+ * @numb: number of buckets upon successful completion.
+ * @occ: total amount of space occupied on all buckets
+ */
+static int check_leftovers(reiser4_dcx *rdcx, u64 numb, u64 occ)
+{
+ u64 i;
+ int ret = 0;
+ u64 *vec_new_occ;
+ bucket_t *vec = current_buckets();
+ struct bucket_ops *ops = current_bucket_ops();
+ /*
+ * For each bucket calculate how much space will be
+ * occupied on that bucket after successful completion
+ * of the volume operation and compare it with the
+ * bucket's capacity
+ */
+ vec_new_occ = fsx64_alloc(numb);
+ if (!vec_new_occ)
+ return -ENOMEM;
+
+ calibrate64(numb, occ, vec, ops->cap_at, vec_new_occ);
+
+ for (i = 0; i < numb; i++) {
+ u64 cap;
+ ON_DEBUG(notice("edward-2145",
+ "Brick %llu: data capacity: %llu, min required: %llu",
+ i, ops->cap_at(vec, i), vec_new_occ[i]));
+
+ cap = ops->cap_at(vec, i);
+ cap -= (cap * 5)/100; /* deduct 5% reservation */
+ if (cap < vec_new_occ[i]) {
+ warning("edward-2070",
+ "Not enough data capacity (%llu) of brick %llu (required %llu)",
+ cap,
+ i,
+ vec_new_occ[i]);
+ ret = -ENOSPC;
+ break;
+ } else {
+ ON_DEBUG(notice("edward-2145",
+ "Brick %llu: data capacity: %llu, min required: %llu",
+ i, cap, vec_new_occ[i]));
+ }
+ }
+ fsx_free(vec_new_occ);
+ return ret;
+}
+
+int dec_fsx32(reiser4_dcx *rdcx, const void *tab, u64 target_pos,
+ bucket_t removeme)
+{
+ int ret = 0;
+ u32 nums;
+ u32 new_numb;
+ u32 *new_weights = NULL;
+ struct fsx32_dcx *dcx = fsx32_private(rdcx);
+ struct bucket_ops *ops = current_bucket_ops();
+
+ assert("edward-1908", dcx->numb >= 1);
+ assert("edward-1909", dcx->numb <= MAX_BUCKETS);
+ assert("edward-1927", dcx->numb > 1);
+
+ new_numb = dcx->numb;
+ if (removeme)
+ new_numb --;
+ else {
+ ret = check_maxdiff(rdcx, new_numb);
+ if (ret)
+ return ret;
+ }
+ ret = check_leftovers(rdcx, new_numb, ops->space_occupied());
+ if (ret)
+ return ret;
+
+ nums = 1 << dcx->nums_bits;
+ new_weights = fsx32_alloc(new_numb);
+ if (!new_weights) {
+ ret = -ENOMEM;
+ goto error;
+ }
+ dcx->new_weights = new_weights;
+
+ ret = clone_systab(dcx, tab);
+ if (ret)
+ goto error;
+
+ calibrate32(new_numb, nums,
+ current_buckets(), ops->cap_at, new_weights);
+
+ ret = balance_dec(dcx,
+ new_numb, dcx->tab,
+ dcx->weights, new_weights, target_pos,
+ current_buckets(), ops->apx_at,
+ ops->apx_of, ops->idx2id,
+ removeme);
+ if (ret)
+ goto error;
+
+ release_apxs(new_numb,
+ current_buckets(), ops->apx_at,
+ ops->apx_set_at);
+ if (removeme)
+ release_apxs(1,
+ &removeme, ops->apx_at,
+ ops->apx_set_at);
+ fsx_free(dcx->weights);
+ dcx->weights = new_weights;
+ dcx->numb = new_numb;
+ return 0;
+ error:
+ if (new_weights)
+ fsx_free(new_weights);
+ free_cloned_systab(dcx);
+ return ret;
+}
+
+int spl_fsx32(reiser4_dcx *rdcx, const void *tab, u32 fact_bits)
+{
+ int ret = 0;
+ u32 *new_weights;
+ u32 new_nums;
+ struct fsx32_dcx *dcx = fsx32_private(rdcx);
+ struct bucket_ops *ops = current_bucket_ops();
+
+ if (dcx->nums_bits + fact_bits > MAX_SGS_BITS)
+ return -EINVAL;
+
+ new_nums = 1 << (dcx->nums_bits + fact_bits);
+
+ new_weights = fsx32_alloc(dcx->numb);
+ if (!new_weights) {
+ ret = -ENOMEM;
+ goto error;
+ }
+ calibrate32(dcx->numb, new_nums,
+ current_buckets(), ops->cap_at, new_weights);
+ ret = balance_spl(dcx->numb, dcx->nums_bits,
+ tab,
+ &dcx->tab,
+ dcx->weights,
+ new_weights,
+ fact_bits,
+ current_buckets(),
+ ops->apx_at,
+ ops->apx_set_at,
+ ops->apx_lenp_at,
+ ops->id2idx,
+ ops->idx2id);
+ if (ret)
+ goto error;
+ fsx_free(dcx->weights);
+ dcx->weights = new_weights;
+ dcx->nums_bits += fact_bits;
+ return 0;
+ error:
+ if (new_weights)
+ fsx_free(new_weights);
+ return ret;
+}
+
+void pack_fsx32(reiser4_dcx *rdcx, char *to, u64 src_off, u64 count)
+{
+ u64 i;
+ u32 *src;
+ struct fsx32_dcx *dcx = fsx32_private(rdcx);
+
+ assert("edward-1923", to != NULL);
+ assert("edward-1924", dcx->tab != NULL);
+
+ src = dcx->tab + src_off;
+
+ for (i = 0; i < count; i++) {
+ put_unaligned(cpu_to_le32(*src), (d32 *)to);
+ to += sizeof(u32);
+ src ++;
+ }
+}
+
+void unpack_fsx32(reiser4_dcx *rdcx, void *tab,
+ char *from, u64 dst_off, u64 count)
+{
+ u64 i;
+ u32 *dst;
+
+ assert("edward-1925", from != NULL);
+ assert("edward-1926", tab != NULL);
+
+ dst = (u32 *)tab + dst_off;
+
+ for (i = 0; i < count; i++) {
+ *dst = le32_to_cpu(get_unaligned((d32 *)from));
+ from += sizeof(u32);
+ dst ++;
+ }
+}
+
+void dump_fsx32(reiser4_dcx *rdcx, void *tab, char *to, u64 offset, u32 size)
+{
+ memcpy(to, (u32 *)tab + offset, size);
+}
+
+/*
+ Local variables:
+ c-indentation-style: "K&R"
+ mode-name: "LC"
+ c-basic-offset: 8
+ tab-width: 8
+ fill-column: 80
+ scroll-step: 1
+ End:
+*/
diff -urN --no-dereference linux-5.10.2.orig/fs/reiser4/plugin/dst/fsx32.h linux-5.10.2/fs/reiser4/plugin/dst/fsx32.h
--- linux-5.10.2.orig/fs/reiser4/plugin/dst/fsx32.h 1970-01-01 01:00:00.000000000 +0100
+++ linux-5.10.2/fs/reiser4/plugin/dst/fsx32.h 2020-12-23 16:07:46.122813187 +0100
@@ -0,0 +1,53 @@
+/*
+ Copyright (c) 2014-2020 Eduard O. Shishkin
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+#ifndef FSX32_H
+#define FSX32_H
+
+struct fsx32_dcx {
+ u64 numb; /* number of abstract buckets */
+ u32 nums_bits; /* logarithm of number of hash space segments */
+ u32 *tab; /* system table */
+ u32 *weights; /* array of weights */
+ u32 *new_weights;
+ u32 *sho;
+ u32 *exc;
+};
+
+extern u32 murmur3_x86_32(const void *data, int len, u32 seed);
+
+extern int initr_fsx32(reiser4_dcx *rdcx, void **tab, int nums_bits);
+extern reiser4_subvol *dst_builtin(const struct inode *inode, loff_t offset);
+extern void replace_fsx32(reiser4_dcx *rdcx, void **target);
+extern void free_fsx32(void *tab);
+extern void doner_fsx32(void **tab);
+extern int initv_fsx32(void **tab, u64 numb, int nums_bits, reiser4_dcx *rdcx);
+extern void donev_fsx32(reiser4_dcx *rdcx);
+extern u64 lookup_fsx32m(reiser4_dcx *rdcx, const struct inode *inode,
+ const char *str, int len, u32 seed, void *tab);
+extern int inc_fsx32(reiser4_dcx *rdcx, const void *tab, u64 pos, bucket_t new);
+extern int dec_fsx32(reiser4_dcx *rdcx, const void *tab, u64 pos, bucket_t victim);
+extern int spl_fsx32(reiser4_dcx *rdcx, const void *tab, u32 fact_bits);
+extern void pack_fsx32(reiser4_dcx *rdcx, char *to, u64 src_off, u64 count);
+extern void unpack_fsx32(reiser4_dcx *rdcx, void *tab,
+ char *from, u64 dst_off, u64 count);
+extern void dump_fsx32(reiser4_dcx *rdcx, void *tab,
+ char *to, u64 offset, u32 size);
+#endif /* FSX32_H */
+
+/*
+ Local variables:
+ c-indentation-style: "K&R"
+ mode-name: "LC"
+ c-basic-offset: 8
+ tab-width: 8
+ fill-column: 80
+ scroll-step: 1
+ End:
+*/
diff -urN --no-dereference linux-5.10.2.orig/fs/reiser4/plugin/dst/hash.c linux-5.10.2/fs/reiser4/plugin/dst/hash.c
--- linux-5.10.2.orig/fs/reiser4/plugin/dst/hash.c 1970-01-01 01:00:00.000000000 +0100
+++ linux-5.10.2/fs/reiser4/plugin/dst/hash.c 2020-12-23 16:07:46.122813187 +0100
@@ -0,0 +1,96 @@
+/*
+ * Adapted for use by the Reiser4 distribution plugin
+ *
+ * MurmurHash3 was written by Austin Appleby, and is placed in the public
+ * domain. The author hereby disclaims copyright to this source code.
+ */
+
+#include <asm/types.h>
+
+static inline u32 rotl32(u32 x, s8 r)
+{
+ return (x << r) | (x >> (32 - r));
+}
+
+static inline u64 rotl64(u64 x, s8 r)
+{
+ return (x << r) | (x >> (64 - r));
+}
+
+#define ROTL32(x,y) rotl32(x,y)
+#define ROTL64(x,y) rotl64(x,y)
+
+/* Finalization mix - force all bits of a hash block to avalanche */
+
+static inline u32 fmix(u32 h)
+{
+ h ^= h >> 16;
+ h *= 0x85ebca6b;
+ h ^= h >> 13;
+ h *= 0xc2b2ae35;
+ h ^= h >> 16;
+
+ return h;
+}
+
+u32 murmur3_x86_32(const void * key, int len, u32 seed)
+{
+ const u8 * data = (const u8*)key;
+ const int nblocks = len / 4;
+
+ u32 h1 = seed;
+
+ u32 c1 = 0xcc9e2d51;
+ u32 c2 = 0x1b873593;
+
+ /* body */
+
+ const u8 * tail;
+ u32 k1;
+ const u32 * blocks = (const u32 *)(data + nblocks*4);
+ int i;
+
+ for(i = -nblocks; i; i++) {
+ u32 k = blocks[i];
+
+ k *= c1;
+ k = ROTL32(k,15);
+ k *= c2;
+
+ h1 ^= k;
+ h1 = ROTL32(h1,13);
+ h1 = h1*5+0xe6546b64;
+ }
+
+ /* tail */
+
+ tail = (const u8*)(data + nblocks*4);
+
+ k1 = 0;
+
+ switch(len & 3) {
+ case 3:
+ k1 ^= tail[2] << 16;
+ /* fall through */
+ case 2:
+ k1 ^= tail[1] << 8;
+ /* fall through */
+ case 1:
+ k1 ^= tail[0];
+ k1 *= c1;
+ k1 = ROTL32(k1,15);
+ k1 *= c2;
+ h1 ^= k1;
+ };
+
+ /* finalization */
+
+ h1 ^= len;
+
+ h1 = fmix(h1);
+
+ return h1;
+}
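+
+/*
+ * Illustrative sketch only (not part of the plugin interface): the prototype
+ * exported in fsx32.h suggests that the distribution code can derive a
+ * hash-space position from a file name roughly like this, where name, len,
+ * seed and nums_bits are assumed to come from the caller:
+ *
+ *	u32 h = murmur3_x86_32(name, len, seed);
+ *	u32 segment = h >> (32 - nums_bits);
+ *
+ * Taking the upper bits is one plausible way to select one of 2^nums_bits
+ * hash-space segments; the real mapping lives in fsx32.c and may differ.
+ */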
diff -urN --no-dereference linux-5.10.2.orig/fs/reiser4/plugin/dst/Makefile linux-5.10.2/fs/reiser4/plugin/dst/Makefile
--- linux-5.10.2.orig/fs/reiser4/plugin/dst/Makefile 1970-01-01 01:00:00.000000000 +0100
+++ linux-5.10.2/fs/reiser4/plugin/dst/Makefile 2020-12-23 16:07:46.122813187 +0100
@@ -0,0 +1,5 @@
+obj-$(CONFIG_REISER4_FS) += distribution_plugins.o
+
+distribution_plugins-objs := \
+ hash.o \
+ fsx32.o
diff -urN --no-dereference linux-5.10.2.orig/fs/reiser4/plugin/fibration.c linux-5.10.2/fs/reiser4/plugin/fibration.c
--- linux-5.10.2.orig/fs/reiser4/plugin/fibration.c 1970-01-01 01:00:00.000000000 +0100
+++ linux-5.10.2/fs/reiser4/plugin/fibration.c 2020-12-23 16:07:46.122813187 +0100
@@ -0,0 +1,175 @@
+/* Copyright 2004 by Hans Reiser, licensing governed by
+ * reiser4/README */
+
+/* Directory fibrations */
+
+/*
+ * Suppose we have a directory tree with sources of some project. During
+ * compilation .o files are created within this tree. This makes access
+ * to the original source files less efficient, because source files are
+ * now "diluted" by object files: the default directory plugin uses a prefix
+ * of the file name as a part of the key of the directory entry (and this
+ * part is also inherited by the key of the file body). This means that
+ * foo.o will be located close to foo.c and foo.h in the tree.
+ *
+ * To avoid this effect the directory plugin fills the highest 7
+ * (originally unused) bits of the second component of the directory
+ * entry key with a bit-pattern depending on the file name (see
+ * fs/reiser4/kassign.c:build_entry_key_common()). These bits are called
+ * the "fibre". The fibre of the file name key is inherited by the key of
+ * stat data and the keys of the file body (in the case of REISER4_LARGE_KEY).
+ *
+ * Fibre for a given file is chosen by per-directory fibration
+ * plugin. Names within given fibre are ordered lexicographically.
+ */
+
+#include "../debug.h"
+#include "plugin_header.h"
+#include "plugin.h"
+#include "../super.h"
+#include "../inode.h"
+
+#include <linux/types.h>
+
+static const int fibre_shift = 57;
+
+#define FIBRE_NO(n) (((__u64)(n)) << fibre_shift)
+
+/*
+ * Trivial fibration: all files of directory are just ordered
+ * lexicographically.
+ */
+static __u64 fibre_trivial(const struct inode *dir, const char *name, int len)
+{
+ return FIBRE_NO(0);
+}
+
+/*
+ * dot-o fibration: place .o files after all others.
+ */
+static __u64 fibre_dot_o(const struct inode *dir, const char *name, int len)
+{
+ /* special treatment for .*\.o */
+ if (len > 2 && name[len - 1] == 'o' && name[len - 2] == '.')
+ return FIBRE_NO(1);
+ else
+ return FIBRE_NO(0);
+}
+
+/*
+ * ext.1 fibration: subdivide directory into 128 fibres, one for each
+ * 7-bit extension character (file "foo.h" goes into fibre "h"), plus a
+ * default fibre for the rest.
+ */
+static __u64 fibre_ext_1(const struct inode *dir, const char *name, int len)
+{
+ if (len > 2 && name[len - 2] == '.')
+ return FIBRE_NO(name[len - 1]);
+ else
+ return FIBRE_NO(0);
+}
+
+/*
+ * ext.3 fibration: try to separate files with different 3-character
+ * extensions from each other.
+ */
+static __u64 fibre_ext_3(const struct inode *dir, const char *name, int len)
+{
+ if (len > 4 && name[len - 4] == '.')
+ return FIBRE_NO(name[len - 3] + name[len - 2] + name[len - 1]);
+ else
+ return FIBRE_NO(0);
+}
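+
+/*
+ * Examples of the fibrations above (the values follow directly from the
+ * functions; the file names are arbitrary):
+ *
+ *	fibre_dot_o(dir, "main.o", 6) == FIBRE_NO(1)
+ *	fibre_dot_o(dir, "main.c", 6) == FIBRE_NO(0)
+ *	fibre_ext_1(dir, "foo.h", 5)  == FIBRE_NO('h')
+ *	fibre_ext_3(dir, "a.tgz", 5)  == FIBRE_NO('t' + 'g' + 'z')
+ */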
+
+static int change_fibration(struct inode *inode,
+ reiser4_plugin * plugin,
+ pset_member memb)
+{
+ int result;
+
+ assert("nikita-3503", inode != NULL);
+ assert("nikita-3504", plugin != NULL);
+
+ assert("nikita-3505", is_reiser4_inode(inode));
+ assert("nikita-3506", inode_dir_plugin(inode) != NULL);
+ assert("nikita-3507",
+ plugin->h.type_id == REISER4_FIBRATION_PLUGIN_TYPE);
+
+ result = 0;
+ if (inode_fibration_plugin(inode) == NULL ||
+ inode_fibration_plugin(inode)->h.id != plugin->h.id) {
+ if (is_dir_empty(inode) == 0)
+ result = aset_set_unsafe(&reiser4_inode_data(inode)->pset,
+ PSET_FIBRATION, plugin);
+ else
+ result = RETERR(-ENOTEMPTY);
+
+ }
+ return result;
+}
+
+static reiser4_plugin_ops fibration_plugin_ops = {
+ .init = NULL,
+ .load = NULL,
+ .save_len = NULL,
+ .save = NULL,
+ .change = change_fibration
+};
+
+/* fibration plugins */
+fibration_plugin fibration_plugins[LAST_FIBRATION_ID] = {
+ [FIBRATION_LEXICOGRAPHIC] = {
+ .h = {
+ .type_id = REISER4_FIBRATION_PLUGIN_TYPE,
+ .id = FIBRATION_LEXICOGRAPHIC,
+ .pops = &fibration_plugin_ops,
+ .label = "lexicographic",
+ .desc = "no fibration",
+ .linkage = {NULL, NULL}
+ },
+ .fibre = fibre_trivial
+ },
+ [FIBRATION_DOT_O] = {
+ .h = {
+ .type_id = REISER4_FIBRATION_PLUGIN_TYPE,
+ .id = FIBRATION_DOT_O,
+ .pops = &fibration_plugin_ops,
+ .label = "dot-o",
+ .desc = "fibrate .o files separately",
+ .linkage = {NULL, NULL}
+ },
+ .fibre = fibre_dot_o
+ },
+ [FIBRATION_EXT_1] = {
+ .h = {
+ .type_id = REISER4_FIBRATION_PLUGIN_TYPE,
+ .id = FIBRATION_EXT_1,
+ .pops = &fibration_plugin_ops,
+ .label = "ext-1",
+ .desc = "fibrate file by single character extension",
+ .linkage = {NULL, NULL}
+ },
+ .fibre = fibre_ext_1
+ },
+ [FIBRATION_EXT_3] = {
+ .h = {
+ .type_id = REISER4_FIBRATION_PLUGIN_TYPE,
+ .id = FIBRATION_EXT_3,
+ .pops = &fibration_plugin_ops,
+ .label = "ext-3",
+ .desc = "fibrate file by three character extension",
+ .linkage = {NULL, NULL}
+ },
+ .fibre = fibre_ext_3
+ }
+};
+
+/*
+ * Local variables:
+ * c-indentation-style: "K&R"
+ * mode-name: "LC"
+ * c-basic-offset: 8
+ * tab-width: 8
+ * fill-column: 79
+ * End:
+ */
diff -urN --no-dereference linux-5.10.2.orig/fs/reiser4/plugin/fibration.h linux-5.10.2/fs/reiser4/plugin/fibration.h
--- linux-5.10.2.orig/fs/reiser4/plugin/fibration.h 1970-01-01 01:00:00.000000000 +0100
+++ linux-5.10.2/fs/reiser4/plugin/fibration.h 2020-12-23 16:07:46.122813187 +0100
@@ -0,0 +1,37 @@
+/* Copyright 2004 by Hans Reiser, licensing governed by reiser4/README */
+
+/* Fibration plugin used by hashed directory plugin to segment content
+ * of directory. See fs/reiser4/plugin/fibration.c for more on this. */
+
+#if !defined(__FS_REISER4_PLUGIN_FIBRATION_H__)
+#define __FS_REISER4_PLUGIN_FIBRATION_H__
+
+#include "plugin_header.h"
+
+typedef struct fibration_plugin {
+ /* generic fields */
+ plugin_header h;
+
+ __u64(*fibre) (const struct inode *dir, const char *name, int len);
+} fibration_plugin;
+
+typedef enum {
+ FIBRATION_LEXICOGRAPHIC,
+ FIBRATION_DOT_O,
+ FIBRATION_EXT_1,
+ FIBRATION_EXT_3,
+ LAST_FIBRATION_ID
+} reiser4_fibration_id;
+
+/* __FS_REISER4_PLUGIN_FIBRATION_H__ */
+#endif
+
+/* Make Linus happy.
+ Local variables:
+ c-indentation-style: "K&R"
+ mode-name: "LC"
+ c-basic-offset: 8
+ tab-width: 8
+ fill-column: 120
+ End:
+*/
diff -urN --no-dereference linux-5.10.2.orig/fs/reiser4/plugin/file/cryptcompress.c linux-5.10.2/fs/reiser4/plugin/file/cryptcompress.c
--- linux-5.10.2.orig/fs/reiser4/plugin/file/cryptcompress.c 1970-01-01 01:00:00.000000000 +0100
+++ linux-5.10.2/fs/reiser4/plugin/file/cryptcompress.c 2020-12-23 16:07:46.123813202 +0100
@@ -0,0 +1,3821 @@
+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
+ reiser4/README */
+/*
+ * Written by Edward Shishkin.
+ *
+ * Implementations of inode/file/address_space operations
+ * specific for cryptcompress file plugin which manages
+ * regular files built of compressed and(or) encrypted bodies.
+ * See http://dev.namesys.com/CryptcompressPlugin for details.
+ */
+
+#include "../../inode.h"
+#include "../cluster.h"
+#include "../object.h"
+#include "../../tree_walk.h"
+#include "cryptcompress.h"
+
+#include <linux/pagevec.h>
+#include <asm/uaccess.h>
+#include <linux/swap.h>
+#include <linux/writeback.h>
+#include <linux/random.h>
+#include <linux/scatterlist.h>
+
+/*
+ Managing primary and secondary caches by Reiser4
+ cryptcompress file plugin. Synchronization scheme.
+
+
+ +------------------+
+ +------------------->| tfm stream |
+ | | (compressed data)|
+ flush | +------------------+
+ +-----------------+ |
+ |(->)longterm lock| V
+--+ writepages() | | +-***-+ reiser4 +---+
+ | | +--+ | *** | storage tree | |
+ | | | +-***-+ (primary cache)| |
+u | write() (secondary| cache) V / | \ | |
+s | ----> +----+ +----+ +----+ +----+ +-***** ******* **----+ ----> | d |
+e | | | |page cluster | | | **disk cluster** | | i |
+r | <---- +----+ +----+ +----+ +----+ +-***** **********----+ <---- | s |
+ | read() ^ ^ | | k |
+ | | (->)longterm lock| | page_io()| |
+ | | +------+ | |
+--+ readpages() | | +---+
+ | V
+ | +------------------+
+ +--------------------| tfm stream |
+ | (plain text) |
+ +------------------+
+*/
+
+/* get cryptcompress specific portion of inode */
+struct cryptcompress_info *cryptcompress_inode_data(const struct inode *inode)
+{
+ return &reiser4_inode_data(inode)->file_plugin_data.cryptcompress_info;
+}
+
+void init_inode_data_cryptcompress(struct inode *inode,
+ reiser4_object_create_data * crd,
+ const reiser4_key *sd_key, int create)
+{
+ struct cryptcompress_info *data;
+
+ data = cryptcompress_inode_data(inode);
+ assert("edward-685", data != NULL);
+
+ memset(data, 0, sizeof(*data));
+
+ mutex_init(&data->checkin_mutex);
+ data->trunc_index = ULONG_MAX;
+ turn_on_compression(data);
+ set_lattice_factor(data, MIN_LATTICE_FACTOR);
+ init_inode_ordering(inode, crd, sd_key, create);
+}
+
+/* The following is a part of reiser4 cipher key manager
+ which is called when opening/creating a cryptcompress file */
+
+/* get/set cipher key info */
+struct reiser4_crypto_info * inode_crypto_info (struct inode * inode)
+{
+ assert("edward-90", inode != NULL);
+ assert("edward-91", reiser4_inode_data(inode) != NULL);
+ return cryptcompress_inode_data(inode)->crypt;
+}
+
+static void set_inode_crypto_info (struct inode * inode,
+ struct reiser4_crypto_info * info)
+{
+ cryptcompress_inode_data(inode)->crypt = info;
+}
+
+/* allocate a cipher key info */
+struct reiser4_crypto_info * reiser4_alloc_crypto_info (struct inode * inode)
+{
+ struct reiser4_crypto_info *info;
+ int fipsize;
+
+ info = kzalloc(sizeof(*info), reiser4_ctx_gfp_mask_get());
+ if (!info)
+ return ERR_PTR(-ENOMEM);
+
+ fipsize = inode_digest_plugin(inode)->fipsize;
+ info->keyid = kmalloc(fipsize, reiser4_ctx_gfp_mask_get());
+ if (!info->keyid) {
+ kfree(info);
+ return ERR_PTR(-ENOMEM);
+ }
+ info->host = inode;
+ return info;
+}
+
+#if 0
+/* allocate/free low-level info for cipher and digest
+ transforms */
+static int alloc_crypto_tfms(struct reiser4_crypto_info * info)
+{
+ struct crypto_blkcipher * ctfm = NULL;
+ struct crypto_hash * dtfm = NULL;
+ cipher_plugin * cplug = inode_cipher_plugin(info->host);
+ digest_plugin * dplug = inode_digest_plugin(info->host);
+
+ if (cplug->alloc) {
+ ctfm = cplug->alloc();
+ if (IS_ERR(ctfm)) {
+ warning("edward-1364",
+ "Can not allocate info for %s\n",
+ cplug->h.desc);
+ return RETERR(PTR_ERR(ctfm));
+ }
+ }
+ info_set_cipher(info, ctfm);
+ if (dplug->alloc) {
+ dtfm = dplug->alloc();
+ if (IS_ERR(dtfm)) {
+ warning("edward-1365",
+ "Can not allocate info for %s\n",
+ dplug->h.desc);
+ goto unhappy_with_digest;
+ }
+ }
+ info_set_digest(info, dtfm);
+ return 0;
+ unhappy_with_digest:
+ if (cplug->free) {
+ cplug->free(ctfm);
+ info_set_cipher(info, NULL);
+ }
+ return RETERR(PTR_ERR(dtfm));
+}
+#endif
+
+static void
+free_crypto_tfms(struct reiser4_crypto_info * info)
+{
+ assert("edward-1366", info != NULL);
+ if (!info_get_cipher(info)) {
+ assert("edward-1601", !info_get_digest(info));
+ return;
+ }
+ inode_cipher_plugin(info->host)->free(info_get_cipher(info));
+ info_set_cipher(info, NULL);
+ inode_digest_plugin(info->host)->free(info_get_digest(info));
+ info_set_digest(info, NULL);
+ return;
+}
+
+#if 0
+/* create a key fingerprint for disk stat-data */
+static int create_keyid (struct reiser4_crypto_info * info,
+ struct reiser4_crypto_data * data)
+{
+ int ret = -ENOMEM;
+ size_t blk, pad;
+ __u8 * dmem;
+ __u8 * cmem;
+ struct hash_desc ddesc;
+ struct blkcipher_desc cdesc;
+ struct scatterlist sg;
+
+ assert("edward-1367", info != NULL);
+ assert("edward-1368", info->keyid != NULL);
+
+ ddesc.tfm = info_get_digest(info);
+ ddesc.flags = 0;
+ cdesc.tfm = info_get_cipher(info);
+ cdesc.flags = 0;
+
+ dmem = kmalloc((size_t)crypto_hash_digestsize(ddesc.tfm),
+ reiser4_ctx_gfp_mask_get());
+ if (!dmem)
+ goto exit1;
+
+ blk = crypto_blkcipher_blocksize(cdesc.tfm);
+
+ pad = data->keyid_size % blk;
+ pad = (pad ? blk - pad : 0);
+
+ cmem = kmalloc((size_t)data->keyid_size + pad,
+ reiser4_ctx_gfp_mask_get());
+ if (!cmem)
+ goto exit2;
+ memcpy(cmem, data->keyid, data->keyid_size);
+ memset(cmem + data->keyid_size, 0, pad);
+
+ sg_init_one(&sg, cmem, data->keyid_size + pad);
+
+ ret = crypto_blkcipher_encrypt(&cdesc, &sg, &sg,
+ data->keyid_size + pad);
+ if (ret) {
+ warning("edward-1369",
+ "encryption failed flags=%x\n", cdesc.flags);
+ goto exit3;
+ }
+ ret = crypto_hash_digest(&ddesc, &sg, sg.length, dmem);
+ if (ret) {
+ warning("edward-1602",
+ "digest failed flags=%x\n", ddesc.flags);
+ goto exit3;
+ }
+ memcpy(info->keyid, dmem, inode_digest_plugin(info->host)->fipsize);
+ exit3:
+ kfree(cmem);
+ exit2:
+ kfree(dmem);
+ exit1:
+ return ret;
+}
+#endif
+
+static void destroy_keyid(struct reiser4_crypto_info * info)
+{
+ assert("edward-1370", info != NULL);
+ assert("edward-1371", info->keyid != NULL);
+ kfree(info->keyid);
+ return;
+}
+
+static void __free_crypto_info (struct inode * inode)
+{
+ struct reiser4_crypto_info * info = inode_crypto_info(inode);
+ assert("edward-1372", info != NULL);
+
+ free_crypto_tfms(info);
+ destroy_keyid(info);
+ kfree(info);
+}
+
+#if 0
+static void instantiate_crypto_info(struct reiser4_crypto_info * info)
+{
+ assert("edward-1373", info != NULL);
+ assert("edward-1374", info->inst == 0);
+ info->inst = 1;
+}
+#endif
+
+static void uninstantiate_crypto_info(struct reiser4_crypto_info * info)
+{
+ assert("edward-1375", info != NULL);
+ info->inst = 0;
+}
+
+#if 0
+static int is_crypto_info_instantiated(struct reiser4_crypto_info * info)
+{
+ return info->inst;
+}
+
+static int inode_has_cipher_key(struct inode * inode)
+{
+ assert("edward-1376", inode != NULL);
+ return inode_crypto_info(inode) &&
+ is_crypto_info_instantiated(inode_crypto_info(inode));
+}
+#endif
+
+static void free_crypto_info (struct inode * inode)
+{
+ uninstantiate_crypto_info(inode_crypto_info(inode));
+ __free_crypto_info(inode);
+}
+
+static int need_cipher(struct inode * inode)
+{
+ return inode_cipher_plugin(inode) !=
+ cipher_plugin_by_id(NONE_CIPHER_ID);
+}
+
+/* Parse @data which contains an (uninstantiated) cipher key imported
+   from user space, create a low-level cipher info and attach it to
+   the @object. On success the info contains an instantiated key */
+#if 0
+struct reiser4_crypto_info * create_crypto_info(struct inode * object,
+ struct reiser4_crypto_data * data)
+{
+ int ret;
+ struct reiser4_crypto_info * info;
+
+ assert("edward-1377", data != NULL);
+ assert("edward-1378", need_cipher(object));
+
+ if (inode_file_plugin(object) !=
+ file_plugin_by_id(DIRECTORY_FILE_PLUGIN_ID))
+ return ERR_PTR(-EINVAL);
+
+ info = reiser4_alloc_crypto_info(object);
+ if (IS_ERR(info))
+ return info;
+ ret = alloc_crypto_tfms(info);
+ if (ret)
+ goto err;
+ /* instantiating a key */
+ ret = crypto_blkcipher_setkey(info_get_cipher(info),
+ data->key,
+ data->keysize);
+ if (ret) {
+ warning("edward-1379",
+ "setkey failed flags=%x",
+ crypto_blkcipher_get_flags(info_get_cipher(info)));
+ goto err;
+ }
+ info->keysize = data->keysize;
+ ret = create_keyid(info, data);
+ if (ret)
+ goto err;
+ instantiate_crypto_info(info);
+ return info;
+ err:
+ __free_crypto_info(object);
+ return ERR_PTR(ret);
+}
+#endif
+
+/* increment/decrement a load counter when
+ attaching/detaching the crypto-stat to any object */
+static void load_crypto_info(struct reiser4_crypto_info * info)
+{
+ assert("edward-1380", info != NULL);
+ inc_keyload_count(info);
+}
+
+static void unload_crypto_info(struct inode * inode)
+{
+ struct reiser4_crypto_info * info = inode_crypto_info(inode);
+ assert("edward-1381", info->keyload_count > 0);
+
+ dec_keyload_count(inode_crypto_info(inode));
+ if (info->keyload_count == 0)
+ /* final release */
+ free_crypto_info(inode);
+}
+
+/* attach/detach an existing crypto-stat */
+void reiser4_attach_crypto_info(struct inode * inode,
+ struct reiser4_crypto_info * info)
+{
+ assert("edward-1382", inode != NULL);
+ assert("edward-1383", info != NULL);
+ assert("edward-1384", inode_crypto_info(inode) == NULL);
+
+ set_inode_crypto_info(inode, info);
+ load_crypto_info(info);
+}
+
+/* returns true, if crypto stat can be attached to the @host */
+#if REISER4_DEBUG
+static int host_allows_crypto_info(struct inode * host)
+{
+ int ret;
+ file_plugin * fplug = inode_file_plugin(host);
+
+ switch (fplug->h.id) {
+ case CRYPTCOMPRESS_FILE_PLUGIN_ID:
+ ret = 1;
+ break;
+ default:
+ ret = 0;
+ }
+ return ret;
+}
+#endif /* REISER4_DEBUG */
+
+static void reiser4_detach_crypto_info(struct inode * inode)
+{
+ assert("edward-1385", inode != NULL);
+ assert("edward-1386", host_allows_crypto_info(inode));
+
+ if (inode_crypto_info(inode))
+ unload_crypto_info(inode);
+ set_inode_crypto_info(inode, NULL);
+}
+
+#if 0
+
+/* compare fingerprints of @child and @parent */
+static int keyid_eq(struct reiser4_crypto_info * child,
+ struct reiser4_crypto_info * parent)
+{
+ return !memcmp(child->keyid,
+ parent->keyid,
+ info_digest_plugin(parent)->fipsize);
+}
+
+/* check if a crypto-stat (which is bound to @parent) can be inherited */
+int can_inherit_crypto_cryptcompress(struct inode *child, struct inode *parent)
+{
+ if (!need_cipher(child))
+ return 0;
+ /* the child is created */
+ if (!inode_crypto_info(child))
+ return 1;
+ /* the child is looked up */
+ if (!inode_crypto_info(parent))
+ return 0;
+ return (inode_cipher_plugin(child) == inode_cipher_plugin(parent) &&
+ inode_digest_plugin(child) == inode_digest_plugin(parent) &&
+ inode_crypto_info(child)->keysize ==
+ inode_crypto_info(parent)->keysize &&
+ keyid_eq(inode_crypto_info(child), inode_crypto_info(parent)));
+}
+#endif
+
+/* helper functions for ->create() method of the cryptcompress plugin */
+static int inode_set_crypto(struct inode * object)
+{
+ reiser4_inode * info;
+ if (!inode_crypto_info(object)) {
+ if (need_cipher(object))
+ return RETERR(-EINVAL);
+ /* the file is not to be encrypted */
+ return 0;
+ }
+ info = reiser4_inode_data(object);
+ info->extmask |= (1 << CRYPTO_STAT);
+ return 0;
+}
+
+static int inode_init_compression(struct inode * object)
+{
+ int result = 0;
+ assert("edward-1461", object != NULL);
+ if (inode_compression_plugin(object)->init)
+ result = inode_compression_plugin(object)->init();
+ return result;
+}
+
+static int inode_check_cluster(struct inode * object)
+{
+ assert("edward-696", object != NULL);
+
+ if (unlikely(inode_cluster_size(object) < PAGE_SIZE)) {
+		warning("edward-1320", "Can not support '%s' "
+			"logical clusters (less than page size)",
+ inode_cluster_plugin(object)->h.label);
+ return RETERR(-EINVAL);
+ }
+	if (unlikely(inode_cluster_shift(object) >= BITS_PER_BYTE * sizeof(int))) {
+ warning("edward-1463", "Can not support '%s' "
+ "logical clusters (too big for transform)",
+ inode_cluster_plugin(object)->h.label);
+ return RETERR(-EINVAL);
+ }
+ return 0;
+}
+
+/* plugin->destroy_inode() */
+void destroy_inode_cryptcompress(struct inode * inode)
+{
+ assert("edward-1464", INODE_PGCOUNT(inode) == 0);
+ reiser4_detach_crypto_info(inode);
+ return;
+}
+
+/* plugin->create_object():
+. install plugins
+. attach crypto info if specified
+. attach compression info if specified
+. attach cluster info
+*/
+int create_object_cryptcompress(struct inode *object, struct inode *parent,
+ reiser4_object_create_data *data, oid_t *oid)
+{
+ int result;
+ reiser4_inode *info;
+
+ assert("edward-23", object != NULL);
+ assert("edward-24", parent != NULL);
+ assert("edward-30", data != NULL);
+ assert("edward-26", reiser4_inode_get_flag(object, REISER4_NO_SD));
+ assert("edward-27", data->id == CRYPTCOMPRESS_FILE_PLUGIN_ID);
+
+ info = reiser4_inode_data(object);
+
+ assert("edward-29", info != NULL);
+
+ /* set file bit */
+ info->plugin_mask |= (1 << PSET_FILE);
+
+ /* set crypto */
+ result = inode_set_crypto(object);
+ if (result)
+ goto error;
+ /* set compression */
+ result = inode_init_compression(object);
+ if (result)
+ goto error;
+ /* set cluster */
+ result = inode_check_cluster(object);
+ if (result)
+ goto error;
+
+ /* save everything in disk stat-data */
+ result = write_sd_by_inode_common(object, oid);
+ if (!result)
+ return 0;
+ error:
+ reiser4_detach_crypto_info(object);
+ return result;
+}
+
+/* plugin->open() */
+int open_cryptcompress(struct inode * inode, struct file * file)
+{
+ return 0;
+}
+
+#if REISER4_CRYPTO
+/* returns a blocksize, the attribute of a cipher algorithm */
+static unsigned int
+cipher_blocksize(struct inode * inode)
+{
+ assert("edward-758", need_cipher(inode));
+ assert("edward-1400", inode_crypto_info(inode) != NULL);
+ return crypto_blkcipher_blocksize
+ (info_get_cipher(inode_crypto_info(inode)));
+}
+
+/* returns offset translated by scale factor of the crypto-algorithm */
+static loff_t inode_scaled_offset (struct inode * inode,
+ const loff_t src_off /* input offset */)
+{
+ assert("edward-97", inode != NULL);
+
+ if (!need_cipher(inode) ||
+ src_off == get_key_offset(reiser4_min_key()) ||
+ src_off == get_key_offset(reiser4_max_key()))
+ return src_off;
+
+ return inode_cipher_plugin(inode)->scale(inode,
+ cipher_blocksize(inode),
+ src_off);
+}
+#else
+#define inode_scaled_offset(__inode, __off) __off
+#endif
+
+/* returns disk cluster size */
+size_t inode_scaled_cluster_size(struct inode * inode)
+{
+ assert("edward-110", inode != NULL);
+
+ return inode_scaled_offset(inode, inode_cluster_size(inode));
+}
+
+/* set number of cluster pages */
+static void set_cluster_nrpages(struct cluster_handle * clust,
+ struct inode *inode)
+{
+ struct reiser4_slide * win;
+
+ assert("edward-180", clust != NULL);
+ assert("edward-1040", inode != NULL);
+
+ clust->old_nrpages = size_in_pages(lbytes(clust->index, inode));
+ win = clust->win;
+ if (!win) {
+ clust->nr_pages = size_in_pages(lbytes(clust->index, inode));
+ return;
+ }
+ assert("edward-1176", clust->op != LC_INVAL);
+ assert("edward-1064", win->off + win->count + win->delta != 0);
+
+ if (win->stat == HOLE_WINDOW &&
+ win->off == 0 && win->count == inode_cluster_size(inode)) {
+ /* special case: writing a "fake" logical cluster */
+ clust->nr_pages = 0;
+ return;
+ }
+ clust->nr_pages = size_in_pages(max(win->off + win->count + win->delta,
+ lbytes(clust->index, inode)));
+ return;
+}
+
+/**
+ * build key of a disk cluster (item group)
+ */
+int build_body_key_cryptcompress(struct inode *inode, loff_t off,
+ reiser4_key *key)
+{
+ assert("edward-64", inode != 0);
+
+ if (likely(off != get_key_offset(reiser4_max_key())))
+ off = off_to_clust_to_off(off, inode);
+ if (inode_crypto_info(inode))
+ off = inode_scaled_offset(inode, off);
+
+ build_body_key_unix_file(inode, off, key);
+ return 0;
+}
+
+int flow_by_inode_cryptcompress(struct inode *inode, const char __user * buf,
+ int user, /* 1: @buf is of user space,
+ 0: kernel space */
+ loff_t size, /* @buf size */
+ loff_t off, /* offset to start io from */
+ rw_op op, /* READ or WRITE */
+ flow_t * f /* resulting flow */)
+{
+ assert("edward-436", f != NULL);
+ assert("edward-149", inode != NULL);
+ assert("edward-150", inode_file_plugin(inode) != NULL);
+ assert("edward-1465", user == 0); /* we use flow to read/write
+ disk clusters located in
+ kernel space */
+ f->length = size;
+ memcpy(&f->data, &buf, sizeof(buf));
+ f->user = user;
+ f->op = op;
+ return build_body_key_cryptcompress(inode, off, &f->key);
+}
+
+static int cryptcompress_hint_validate(hint_t *hint, reiser4_tree *tree,
+ const reiser4_key * key,
+ znode_lock_mode lock_mode)
+{
+ coord_t *coord;
+
+ assert("edward-704", hint != NULL);
+ assert("edward-1089", !hint_is_valid(hint));
+ assert("edward-706", hint->lh.owner == NULL);
+
+ coord = &hint->ext_coord.coord;
+
+ if (!hint || !hint_is_set(hint) || hint->mode != lock_mode)
+ /* hint either not set or set by different operation */
+ return RETERR(-E_REPEAT);
+
+ if (get_key_offset(key) != hint->offset)
+ /* hint is set for different key */
+ return RETERR(-E_REPEAT);
+
+ assert("edward-707", reiser4_schedulable());
+
+ return reiser4_seal_validate(&hint->seal, tree, &hint->ext_coord.coord,
+ key, &hint->lh, lock_mode,
+ ZNODE_LOCK_LOPRI);
+}
+
+/* reserve disk space when writing a logical cluster */
+static int reserve4cluster(struct inode *inode, struct cluster_handle *clust)
+{
+ int result = 0;
+ reiser4_subvol *subv = get_meta_subvol();
+
+ assert("edward-965", reiser4_schedulable());
+ assert("edward-439", inode != NULL);
+ assert("edward-440", clust != NULL);
+ assert("edward-441", clust->pages != NULL);
+
+ if (clust->nr_pages == 0) {
+ assert("edward-1152", clust->win != NULL);
+ assert("edward-1153", clust->win->stat == HOLE_WINDOW);
+ /* don't reserve disk space for fake logical cluster */
+ return 0;
+ }
+ assert("edward-442", jprivate(clust->pages[0]) != NULL);
+
+ result = reiser4_grab_space_force(estimate_insert_cluster(inode) +
+ estimate_update_cluster(inode),
+ BA_CAN_COMMIT, subv);
+ if (result)
+ return result;
+ clust->reserved = 1;
+ grabbed2cluster_reserved(estimate_insert_cluster(inode) +
+ estimate_update_cluster(inode), subv);
+#if REISER4_DEBUG
+ clust->reserved_prepped = estimate_update_cluster(inode);
+ clust->reserved_unprepped = estimate_insert_cluster(inode);
+#endif
+ /* there can be space grabbed by txnmgr_force_commit_all */
+ return 0;
+}
+
+/* free reserved disk space if writing a logical cluster fails */
+static void free_reserved4cluster(struct inode *inode,
+ struct cluster_handle *ch, int count)
+{
+ assert("edward-967", ch->reserved == 1);
+
+ cluster_reserved2free(count, get_meta_subvol());
+ ch->reserved = 0;
+}
+
+/*
+ * The core search procedure of the cryptcompress plugin.
+ * If returned value is not cbk_errored, then current position
+ * is locked.
+ */
+static int find_cluster_item(hint_t * hint,
+ const reiser4_key * key, /* key of the item we are
+ looking for */
+ znode_lock_mode lock_mode /* which lock */ ,
+ ra_info_t * ra_info, lookup_bias bias, __u32 flags,
+ reiser4_subvol *subv)
+{
+ int result;
+ reiser4_key ikey;
+ coord_t *coord = &hint->ext_coord.coord;
+ coord_t orig = *coord;
+
+ assert("edward-152", hint != NULL);
+
+ if (!hint_is_valid(hint)) {
+ result = cryptcompress_hint_validate(hint, &subv->tree,
+ key, lock_mode);
+ if (result == -E_REPEAT)
+ goto traverse_tree;
+ else if (result) {
+ assert("edward-1216", 0);
+ return result;
+ }
+ hint_set_valid(hint);
+ }
+ assert("edward-709", znode_is_any_locked(coord->node));
+ /*
+ * Hint is valid, so we perform in-place lookup.
+ * It means we just need to check if the next item in
+ * the tree (relative to the current position @coord)
+ * has key @key.
+ *
+	 * A valid hint means, in particular, that the node is not
+	 * empty and at least one of its items has been processed
+ */
+ if (equal_to_rdk(coord->node, key)) {
+ /*
+ * Look for the item in the right neighbor
+ */
+ lock_handle lh_right;
+
+ init_lh(&lh_right);
+ result = reiser4_get_right_neighbor(&lh_right, coord->node,
+ znode_is_wlocked(coord->node) ?
+ ZNODE_WRITE_LOCK : ZNODE_READ_LOCK,
+ GN_CAN_USE_UPPER_LEVELS);
+ if (result) {
+ done_lh(&lh_right);
+ reiser4_unset_hint(hint);
+ if (result == -E_NO_NEIGHBOR)
+ return RETERR(-EIO);
+ return result;
+ }
+ assert("edward-1218",
+ equal_to_ldk(lh_right.node, key));
+ result = zload(lh_right.node);
+ if (result) {
+ done_lh(&lh_right);
+ reiser4_unset_hint(hint);
+ return result;
+ }
+ coord_init_first_unit_nocheck(coord, lh_right.node);
+
+ if (!coord_is_existing_item(coord)) {
+ zrelse(lh_right.node);
+ done_lh(&lh_right);
+ goto traverse_tree;
+ }
+ item_key_by_coord(coord, &ikey);
+ zrelse(coord->node);
+ if (unlikely(!keyeq(key, &ikey))) {
+ warning("edward-1608",
+ "Expected item not found. Fsck?");
+ done_lh(&lh_right);
+ goto not_found;
+ }
+ /*
+ * item has been found in the right neighbor;
+ * move lock to the right
+ */
+ done_lh(&hint->lh);
+ move_lh(&hint->lh, &lh_right);
+
+ dclust_inc_extension_ncount(hint);
+
+ return CBK_COORD_FOUND;
+ } else {
+ /*
+ * Look for the item in the current node
+ */
+ coord->item_pos++;
+ coord->unit_pos = 0;
+ coord->between = AT_UNIT;
+
+ result = zload(coord->node);
+ if (result) {
+ done_lh(&hint->lh);
+ return result;
+ }
+ if (!coord_is_existing_item(coord)) {
+ zrelse(coord->node);
+ goto not_found;
+ }
+ item_key_by_coord(coord, &ikey);
+ zrelse(coord->node);
+ if (!keyeq(key, &ikey))
+ goto not_found;
+ /*
+ * item has been found in the current node
+ */
+ dclust_inc_extension_ncount(hint);
+
+ return CBK_COORD_FOUND;
+ }
+ not_found:
+ /*
+ * The tree doesn't contain an item with @key;
+ * roll back the coord
+ */
+ *coord = orig;
+ ON_DEBUG(coord_update_v(coord));
+ return CBK_COORD_NOTFOUND;
+
+ traverse_tree:
+
+ reiser4_unset_hint(hint);
+ dclust_init_extension(hint);
+ coord_init_zero(coord);
+
+ assert("edward-713", hint->lh.owner == NULL);
+ assert("edward-714", reiser4_schedulable());
+
+ result = coord_by_key(&subv->tree,
+ key, coord, &hint->lh,
+ lock_mode, bias, LEAF_LEVEL, LEAF_LEVEL,
+ CBK_UNIQUE | flags, ra_info);
+ if (cbk_errored(result))
+ return result;
+ if(result == CBK_COORD_FOUND)
+ dclust_inc_extension_ncount(hint);
+ hint_set_valid(hint);
+ return result;
+}
+
+#if REISER4_CRYPTO
+
+/* This function is called by deflate[inflate] manager when
+ creating a transformed/plain stream to check if we should
+ create/cut some overhead. If this returns true, then @oh
+ contains the size of this overhead.
+ */
+static int need_cut_or_align(struct inode * inode,
+ struct cluster_handle * ch, rw_op rw, int * oh)
+{
+ struct tfm_cluster * tc = &ch->tc;
+ switch (rw) {
+ case WRITE_OP: /* estimate align */
+ *oh = tc->len % cipher_blocksize(inode);
+ if (*oh != 0)
+ return 1;
+ break;
+ case READ_OP: /* estimate cut */
+ *oh = *(tfm_output_data(ch) + tc->len - 1);
+ break;
+ default:
+ impossible("edward-1401", "bad option");
+ }
+ return (tc->len != tc->lsize);
+}
+
+/* create/cut an overhead of transformed/plain stream */
+static void align_or_cut_overhead(struct inode * inode,
+ struct cluster_handle * ch, rw_op rw)
+{
+ unsigned int oh;
+ cipher_plugin * cplug = inode_cipher_plugin(inode);
+
+ assert("edward-1402", need_cipher(inode));
+
+ if (!need_cut_or_align(inode, ch, rw, &oh))
+ return;
+ switch (rw) {
+ case WRITE_OP: /* do align */
+ ch->tc.len +=
+ cplug->align_stream(tfm_input_data(ch) +
+ ch->tc.len, ch->tc.len,
+ cipher_blocksize(inode));
+ *(tfm_input_data(ch) + ch->tc.len - 1) =
+ cipher_blocksize(inode) - oh;
+ break;
+ case READ_OP: /* do cut */
+ assert("edward-1403", oh <= cipher_blocksize(inode));
+ ch->tc.len -= oh;
+ break;
+ default:
+ impossible("edward-1404", "bad option");
+ }
+ return;
+}
+
+static unsigned max_cipher_overhead(struct inode * inode)
+{
+ if (!need_cipher(inode) || !inode_cipher_plugin(inode)->align_stream)
+ return 0;
+ return cipher_blocksize(inode);
+}
+#else
+#define max_cipher_overhead(_inode) 0
+#endif
+
+static int deflate_overhead(struct inode *inode)
+{
+ return (inode_compression_plugin(inode)->
+ checksum ? DC_CHECKSUM_SIZE : 0);
+}
+
+static unsigned deflate_overrun(struct inode * inode, int ilen)
+{
+ return coa_overrun(inode_compression_plugin(inode), ilen);
+}
+
+static bool is_all_zero(char const* mem, size_t size)
+{
+ while (size-- > 0)
+ if (*mem++)
+ return false;
+ return true;
+}
+
+static inline bool should_punch_hole(struct tfm_cluster *tc)
+{
+ if (0 &&
+ !reiser4_is_set(reiser4_get_current_sb(), REISER4_DONT_PUNCH_HOLES)
+ && is_all_zero(tfm_stream_data(tc, INPUT_STREAM), tc->lsize)) {
+
+ tc->hole = 1;
+ return true;
+ }
+ return false;
+}
+
+/* Estimating compressibility of a logical cluster by various
+ policies represented by compression mode plugin.
+ If this returns false, then compressor won't be called for
+ the cluster of index @index.
+*/
+static int should_compress(struct tfm_cluster *tc, cloff_t index,
+ struct inode *inode)
+{
+ compression_plugin *cplug = inode_compression_plugin(inode);
+ compression_mode_plugin *mplug = inode_compression_mode_plugin(inode);
+
+ assert("edward-1321", tc->len != 0);
+ assert("edward-1322", cplug != NULL);
+ assert("edward-1323", mplug != NULL);
+
+ if (should_punch_hole(tc))
+ /*
+ * we are about to punch a hole,
+ * so don't compress data
+ */
+ return 0;
+ return /* estimate by size */
+ (cplug->min_size_deflate ?
+ tc->len >= cplug->min_size_deflate() :
+ 1) &&
+ /* estimate by compression mode plugin */
+ (mplug->should_deflate ?
+ mplug->should_deflate(inode, index) :
+ 1);
+}
+
+/* Evaluating results of compression transform.
+ Returns true, if we need to accept this results */
+static int save_compressed(int size_before, int size_after, struct inode *inode)
+{
+ return (size_after + deflate_overhead(inode) +
+ max_cipher_overhead(inode) < size_before);
+}
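+
+/*
+ * Worked example (the numbers are illustrative): with the 4-byte checksum
+ * overhead and no cipher overhead, a 4096-byte logical cluster compressed
+ * to 4093 bytes is discarded, since 4093 + 4 is not smaller than 4096;
+ * compressed to 4091 bytes it is accepted.
+ */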
+
+/* Guess result of the evaluation above */
+static int need_inflate(struct cluster_handle * ch, struct inode * inode,
+ int encrypted /* is cluster encrypted */ )
+{
+ struct tfm_cluster * tc = &ch->tc;
+
+ assert("edward-142", tc != 0);
+ assert("edward-143", inode != NULL);
+
+ return tc->len <
+ (encrypted ?
+ inode_scaled_offset(inode, tc->lsize) :
+ tc->lsize);
+}
+
+/* If results of compression were accepted, then we add
+ a checksum to catch possible disk cluster corruption.
+ The following is a format of the data stored in disk clusters:
+
+ data This is (transformed) logical cluster.
+ cipher_overhead This is created by ->align() method
+ of cipher plugin. May be absent.
+ checksum (4) This is created by ->checksum method
+ of compression plugin to check
+ integrity. May be absent.
+
+ Crypto overhead format:
+
+ data
+ control_byte (1) contains aligned overhead size:
+ 1 <= overhead <= cipher_blksize
+*/
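+
+/* Worked example (illustrative numbers, assuming ->align_stream() pads the
+   stream to the next multiple of the cipher blocksize and returns the number
+   of bytes added): with a 16-byte blocksize a 100-byte stream is padded to
+   112 bytes and the control byte is set to 16 - 100 % 16 = 12; on read,
+   align_or_cut_overhead() cuts those 12 bytes off again. */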
+/* Append a checksum at the end of a transformed stream */
+static void dc_set_checksum(compression_plugin * cplug, struct tfm_cluster * tc)
+{
+ __u32 checksum;
+
+ assert("edward-1309", tc != NULL);
+ assert("edward-1310", tc->len > 0);
+ assert("edward-1311", cplug->checksum != NULL);
+
+ checksum = cplug->checksum(tfm_stream_data(tc, OUTPUT_STREAM), tc->len);
+ put_unaligned(cpu_to_le32(checksum),
+ (d32 *)(tfm_stream_data(tc, OUTPUT_STREAM) + tc->len));
+ tc->len += (int)DC_CHECKSUM_SIZE;
+}
+
+/* Check a disk cluster checksum.
+ Returns 0 if checksum is correct, otherwise returns 1 */
+static int dc_check_checksum(compression_plugin * cplug, struct tfm_cluster * tc)
+{
+ assert("edward-1312", tc != NULL);
+ assert("edward-1313", tc->len > (int)DC_CHECKSUM_SIZE);
+ assert("edward-1314", cplug->checksum != NULL);
+
+ if (cplug->checksum(tfm_stream_data(tc, INPUT_STREAM),
+ tc->len - (int)DC_CHECKSUM_SIZE) !=
+ le32_to_cpu(get_unaligned((d32 *)
+ (tfm_stream_data(tc, INPUT_STREAM)
+ + tc->len - (int)DC_CHECKSUM_SIZE)))) {
+ warning("edward-156",
+ "Bad disk cluster checksum %d, (should be %d) Fsck?\n",
+ (int)le32_to_cpu
+ (get_unaligned((d32 *)
+ (tfm_stream_data(tc, INPUT_STREAM) +
+ tc->len - (int)DC_CHECKSUM_SIZE))),
+ (int)cplug->checksum
+ (tfm_stream_data(tc, INPUT_STREAM),
+ tc->len - (int)DC_CHECKSUM_SIZE));
+ return 1;
+ }
+ tc->len -= (int)DC_CHECKSUM_SIZE;
+ return 0;
+}
+
+/* get input/output stream for some transform action */
+int grab_tfm_stream(struct inode * inode, struct tfm_cluster * tc,
+ tfm_stream_id id)
+{
+ size_t size = inode_scaled_cluster_size(inode);
+
+ assert("edward-901", tc != NULL);
+ assert("edward-1027", inode_compression_plugin(inode) != NULL);
+
+ if (cluster_get_tfm_act(tc) == TFMA_WRITE)
+ size += deflate_overrun(inode, inode_cluster_size(inode));
+
+ if (!get_tfm_stream(tc, id) && id == INPUT_STREAM)
+ alternate_streams(tc);
+ if (!get_tfm_stream(tc, id))
+ return alloc_tfm_stream(tc, size, id);
+
+ assert("edward-902", tfm_stream_is_set(tc, id));
+
+ if (tfm_stream_size(tc, id) < size)
+ return realloc_tfm_stream(tc, size, id);
+ return 0;
+}
+
+/* Common deflate manager */
+int reiser4_deflate_cluster(struct cluster_handle * clust, struct inode * inode)
+{
+ int result = 0;
+ int compressed = 0;
+ int encrypted = 0;
+ struct tfm_cluster * tc = &clust->tc;
+ compression_plugin * coplug;
+
+ assert("edward-401", inode != NULL);
+ assert("edward-903", tfm_stream_is_set(tc, INPUT_STREAM));
+ assert("edward-1348", cluster_get_tfm_act(tc) == TFMA_WRITE);
+ assert("edward-498", !tfm_cluster_is_uptodate(tc));
+
+ coplug = inode_compression_plugin(inode);
+ if (should_compress(tc, clust->index, inode)) {
+ /* try to compress, discard bad results */
+ size_t dst_len;
+ compression_mode_plugin * mplug =
+ inode_compression_mode_plugin(inode);
+ assert("edward-602", coplug != NULL);
+ assert("edward-1423", coplug->compress != NULL);
+
+ result = grab_coa(tc, coplug);
+ if (result)
+ /*
+ * can not allocate memory to perform
+ * compression, leave data uncompressed
+ */
+ goto cipher;
+ result = grab_tfm_stream(inode, tc, OUTPUT_STREAM);
+ if (result) {
+ warning("edward-1425",
+ "alloc stream failed with ret=%d, skipped compression",
+ result);
+ goto cipher;
+ }
+ dst_len = tfm_stream_size(tc, OUTPUT_STREAM);
+ coplug->compress(get_coa(tc, coplug->h.id, tc->act),
+ tfm_input_data(clust), tc->len,
+ tfm_output_data(clust), &dst_len);
+ /* make sure we didn't overwrite extra bytes */
+ assert("edward-603",
+ dst_len <= tfm_stream_size(tc, OUTPUT_STREAM));
+
+ /* evaluate results of compression transform */
+ if (save_compressed(tc->len, dst_len, inode)) {
+ /* good result, accept */
+ tc->len = dst_len;
+ if (mplug->accept_hook != NULL) {
+ result = mplug->accept_hook(inode, clust->index);
+ if (result)
+ warning("edward-1426",
+ "accept_hook failed with ret=%d",
+ result);
+ }
+ compressed = 1;
+ }
+ else {
+ /* bad result, discard */
+#if 0
+ if (cluster_is_complete(clust, inode))
+ warning("edward-1496",
+ "incompressible cluster %lu (inode %llu)",
+ clust->index,
+ (unsigned long long)get_inode_oid(inode));
+#endif
+ if (mplug->discard_hook != NULL &&
+ cluster_is_complete(clust, inode)) {
+ result = mplug->discard_hook(inode,
+ clust->index);
+ if (result)
+ warning("edward-1427",
+ "discard_hook failed with ret=%d",
+ result);
+ }
+ }
+ }
+ cipher:
+#if REISER4_CRYPTO
+ if (need_cipher(inode)) {
+ cipher_plugin * ciplug;
+ struct blkcipher_desc desc;
+ struct scatterlist src;
+ struct scatterlist dst;
+
+ ciplug = inode_cipher_plugin(inode);
+ desc.tfm = info_get_cipher(inode_crypto_info(inode));
+ desc.flags = 0;
+ if (compressed)
+ alternate_streams(tc);
+ result = grab_tfm_stream(inode, tc, OUTPUT_STREAM);
+ if (result)
+ return result;
+
+ align_or_cut_overhead(inode, clust, WRITE_OP);
+ sg_init_one(&src, tfm_input_data(clust), tc->len);
+ sg_init_one(&dst, tfm_output_data(clust), tc->len);
+
+ result = crypto_blkcipher_encrypt(&desc, &dst, &src, tc->len);
+ if (result) {
+ warning("edward-1405",
+ "encryption failed flags=%x\n", desc.flags);
+ return result;
+ }
+ encrypted = 1;
+ }
+#endif
+ if (compressed && coplug->checksum != NULL)
+ dc_set_checksum(coplug, tc);
+ if (!compressed && !encrypted)
+ alternate_streams(tc);
+ return result;
+}
+
+/* Common inflate manager. */
+int reiser4_inflate_cluster(struct cluster_handle * clust, struct inode * inode)
+{
+ int result = 0;
+ int transformed = 0;
+ struct tfm_cluster * tc = &clust->tc;
+ compression_plugin * coplug;
+
+ assert("edward-905", inode != NULL);
+ assert("edward-1178", clust->dstat == PREP_DISK_CLUSTER);
+ assert("edward-906", tfm_stream_is_set(&clust->tc, INPUT_STREAM));
+ assert("edward-1349", tc->act == TFMA_READ);
+ assert("edward-907", !tfm_cluster_is_uptodate(tc));
+
+ /* Handle a checksum (if any) */
+ coplug = inode_compression_plugin(inode);
+ if (need_inflate(clust, inode, need_cipher(inode)) &&
+ coplug->checksum != NULL) {
+ result = dc_check_checksum(coplug, tc);
+ if (unlikely(result)) {
+ warning("edward-1460",
+ "Inode %llu: disk cluster %lu looks corrupted",
+ (unsigned long long)get_inode_oid(inode),
+ clust->index);
+ return RETERR(-EIO);
+ }
+ }
+#if REISER4_CRYPTO
+ if (need_cipher(inode)) {
+ cipher_plugin * ciplug;
+ struct blkcipher_desc desc;
+ struct scatterlist src;
+ struct scatterlist dst;
+
+ ciplug = inode_cipher_plugin(inode);
+ desc.tfm = info_get_cipher(inode_crypto_info(inode));
+ desc.flags = 0;
+ result = grab_tfm_stream(inode, tc, OUTPUT_STREAM);
+ if (result)
+ return result;
+ assert("edward-909", tfm_cluster_is_set(tc));
+
+ sg_init_one(&src, tfm_input_data(clust), tc->len);
+ sg_init_one(&dst, tfm_output_data(clust), tc->len);
+
+ result = crypto_blkcipher_decrypt(&desc, &dst, &src, tc->len);
+ if (result) {
+ warning("edward-1600", "decrypt failed flags=%x\n",
+ desc.flags);
+ return result;
+ }
+ align_or_cut_overhead(inode, clust, READ_OP);
+ transformed = 1;
+ }
+#endif
+ if (need_inflate(clust, inode, 0)) {
+ size_t dst_len = inode_cluster_size(inode);
+ if(transformed)
+ alternate_streams(tc);
+
+ result = grab_tfm_stream(inode, tc, OUTPUT_STREAM);
+ if (result)
+ return result;
+ assert("edward-1305", coplug->decompress != NULL);
+ assert("edward-910", tfm_cluster_is_set(tc));
+
+ coplug->decompress(get_coa(tc, coplug->h.id, tc->act),
+ tfm_input_data(clust), tc->len,
+ tfm_output_data(clust), &dst_len);
+ /* check length */
+ tc->len = dst_len;
+ assert("edward-157", dst_len == tc->lsize);
+ transformed = 1;
+ }
+ if (!transformed)
+ alternate_streams(tc);
+ return result;
+}
+
+/* This is implementation of readpage method of struct
+ address_space_operations for cryptcompress plugin. */
+int readpage_cryptcompress(struct file *file, struct page *page)
+{
+ reiser4_context *ctx;
+ struct cluster_handle clust;
+ int result;
+
+ assert("edward-88", PageLocked(page));
+ assert("vs-976", !PageUptodate(page));
+ assert("edward-89", page->mapping && page->mapping->host);
+
+ ctx = reiser4_init_context(page->mapping->host->i_sb);
+ if (IS_ERR(ctx)) {
+ unlock_page(page);
+ return PTR_ERR(ctx);
+ }
+ assert("edward-113",
+ ergo(file != NULL,
+ page->mapping == file_inode(file)->i_mapping));
+
+ if (PageUptodate(page)) {
+ warning("edward-1338", "page is already uptodate\n");
+ unlock_page(page);
+ reiser4_exit_context(ctx);
+ return 0;
+ }
+ cluster_init_read(&clust, NULL);
+ clust.file = file;
+
+ result = readpage_ctail(&clust, page);
+
+ put_cluster_handle(&clust);
+ reiser4_txn_restart(ctx);
+ reiser4_exit_context(ctx);
+ return result;
+}
+
+/* number of pages to check in */
+static int get_new_nrpages(struct cluster_handle * clust)
+{
+ switch (clust->op) {
+ case LC_APPOV:
+ case LC_EXPAND:
+ return clust->nr_pages;
+ case LC_SHRINK:
+ assert("edward-1179", clust->win != NULL);
+ return size_in_pages(clust->win->off + clust->win->count);
+ default:
+ impossible("edward-1180", "bad page cluster option");
+ return 0;
+ }
+}
+
+static void set_cluster_pages_dirty(struct cluster_handle * clust,
+ struct inode * inode)
+{
+ int i;
+ struct page *pg;
+ int nrpages = get_new_nrpages(clust);
+
+ for (i = 0; i < nrpages; i++) {
+
+ pg = clust->pages[i];
+ assert("edward-968", pg != NULL);
+ lock_page(pg);
+ assert("edward-1065", PageUptodate(pg));
+ set_page_dirty_notag(pg);
+ unlock_page(pg);
+ mark_page_accessed(pg);
+ }
+}
+
+/* Grab a page cluster for read/write operations.
+ Attach a jnode for write operations (when preparing for modifications, which
+ are supposed to be committed).
+
+   We allocate only one jnode per page cluster; this jnode is bound to the
+   first page of this cluster, so we have an extra reference that will be put
+   as soon as the jnode is evicted from memory; other references will be cleaned
+   up at flush time (assuming that check-in of the page cluster was successful).
+*/
+int grab_page_cluster(struct inode * inode,
+ struct cluster_handle * clust, rw_op rw)
+{
+ int i;
+ int result = 0;
+ jnode *node = NULL;
+
+ assert("edward-182", clust != NULL);
+ assert("edward-183", clust->pages != NULL);
+ assert("edward-1466", clust->node == NULL);
+ assert("edward-1428", inode != NULL);
+ assert("edward-1429", inode->i_mapping != NULL);
+ assert("edward-184", clust->nr_pages <= cluster_nrpages(inode));
+
+ if (clust->nr_pages == 0)
+ return 0;
+
+ for (i = 0; i < clust->nr_pages; i++) {
+
+ assert("edward-1044", clust->pages[i] == NULL);
+
+ clust->pages[i] =
+ find_or_create_page(inode->i_mapping,
+ clust_to_pg(clust->index, inode) + i,
+ reiser4_ctx_gfp_mask_get());
+ if (!clust->pages[i]) {
+ result = RETERR(-ENOMEM);
+ break;
+ }
+ if (i == 0 && rw == WRITE_OP) {
+ node = jnode_of_page(clust->pages[i]);
+ if (IS_ERR(node)) {
+ result = PTR_ERR(node);
+ unlock_page(clust->pages[i]);
+ break;
+ }
+ JF_SET(node, JNODE_CLUSTER_PAGE);
+ assert("edward-920", jprivate(clust->pages[0]));
+ /*
+ * this jnode doesn't participate in IO -
+ * we need it only to track transactions.
+ * We'll make it dirty and respectively
+ * need to reserve disk space for that.
+ * By design we reserve space on meta-data
+ * subvolume. So we set meta-data subvolume
+ * for reservation issues.
+ */
+ if (node->subvol == NULL)
+ node->subvol = get_meta_subvol();
+ else
+ assert("edward-2225",
+ node->subvol == get_meta_subvol());
+ }
+ INODE_PGCOUNT_INC(inode);
+ unlock_page(clust->pages[i]);
+ }
+ if (unlikely(result)) {
+ while (i) {
+ put_cluster_page(clust->pages[--i]);
+ INODE_PGCOUNT_DEC(inode);
+ }
+ if (node && !IS_ERR(node))
+ jput(node);
+ return result;
+ }
+ clust->node = node;
+ return 0;
+}
+
+static void truncate_page_cluster_range(struct inode * inode,
+ struct page ** pages,
+ cloff_t index,
+ int from, int count,
+ int even_cows)
+{
+ assert("edward-1467", count > 0);
+ reiser4_invalidate_pages(inode->i_mapping,
+ clust_to_pg(index, inode) + from,
+ count, even_cows);
+}
+
+/* Put @count pages starting from @from offset */
+void __put_page_cluster(int from, int count,
+ struct page ** pages, struct inode * inode)
+{
+ int i;
+ assert("edward-1468", pages != NULL);
+ assert("edward-1469", inode != NULL);
+ assert("edward-1470", from >= 0 && count >= 0);
+
+ for (i = 0; i < count; i++) {
+ assert("edward-1471", pages[from + i] != NULL);
+ assert("edward-1472",
+ pages[from + i]->index == pages[from]->index + i);
+
+ put_cluster_page(pages[from + i]);
+ INODE_PGCOUNT_DEC(inode);
+ }
+}
+
+/*
+ * This is dual to grab_page_cluster,
+ * however if @rw == WRITE_OP, then we call this function
+ * only if something is failed before checkin page cluster.
+ */
+void put_page_cluster(struct cluster_handle * clust,
+ struct inode * inode, rw_op rw)
+{
+ assert("edward-445", clust != NULL);
+ assert("edward-922", clust->pages != NULL);
+ assert("edward-446",
+ ergo(clust->nr_pages != 0, clust->pages[0] != NULL));
+
+ __put_page_cluster(0, clust->nr_pages, clust->pages, inode);
+ if (rw == WRITE_OP) {
+ if (unlikely(clust->node)) {
+ assert("edward-447",
+ clust->node == jprivate(clust->pages[0]));
+ jput(clust->node);
+ clust->node = NULL;
+ }
+ }
+}
+
+#if REISER4_DEBUG
+int cryptcompress_inode_ok(struct inode *inode)
+{
+ if (!(reiser4_inode_data(inode)->plugin_mask & (1 << PSET_FILE)))
+ return 0;
+ if (!cluster_shift_ok(inode_cluster_shift(inode)))
+ return 0;
+ return 1;
+}
+
+static int window_ok(struct reiser4_slide * win, struct inode *inode)
+{
+ assert("edward-1115", win != NULL);
+ assert("edward-1116", ergo(win->delta, win->stat == HOLE_WINDOW));
+
+ return (win->off != inode_cluster_size(inode)) &&
+ (win->off + win->count + win->delta <= inode_cluster_size(inode));
+}
+
+static int cluster_ok(struct cluster_handle * clust, struct inode *inode)
+{
+ assert("edward-279", clust != NULL);
+
+ if (!clust->pages)
+ return 0;
+ return (clust->win ? window_ok(clust->win, inode) : 1);
+}
+#if 0
+static int pages_truncate_ok(struct inode *inode, pgoff_t start)
+{
+ int found;
+ struct page * page;
+
+ found = find_get_pages(inode->i_mapping, &start, 1, &page);
+ if (found)
+ put_cluster_page(page);
+ return !found;
+}
+#else
+#define pages_truncate_ok(inode, start) 1
+#endif
+
+static int jnode_truncate_ok(struct inode *inode, cloff_t index)
+{
+ jnode *node;
+ node = jlookup(get_inode_oid(inode), clust_to_pg(index, inode));
+ if (likely(!node))
+ return 1;
+ jput(node);
+ return 0;
+}
+#endif
+
+/* guess next window stat */
+static inline window_stat next_window_stat(struct reiser4_slide * win)
+{
+ assert("edward-1130", win != NULL);
+ return ((win->stat == HOLE_WINDOW && win->delta == 0) ?
+ HOLE_WINDOW : DATA_WINDOW);
+}
+
+/* guess and set next cluster index and window params */
+static void move_update_window(struct inode * inode,
+ struct cluster_handle * clust,
+ loff_t file_off, loff_t to_file)
+{
+ struct reiser4_slide * win;
+
+ assert("edward-185", clust != NULL);
+ assert("edward-438", clust->pages != NULL);
+ assert("edward-281", cluster_ok(clust, inode));
+
+ win = clust->win;
+ if (!win)
+ return;
+
+ switch (win->stat) {
+ case DATA_WINDOW:
+ /* increment */
+ clust->index++;
+ win->stat = DATA_WINDOW;
+ win->off = 0;
+ win->count = min((loff_t)inode_cluster_size(inode), to_file);
+ break;
+ case HOLE_WINDOW:
+ switch (next_window_stat(win)) {
+ case HOLE_WINDOW:
+ /* skip */
+ clust->index = off_to_clust(file_off, inode);
+ win->stat = HOLE_WINDOW;
+ win->off = 0;
+ win->count = off_to_cloff(file_off, inode);
+ win->delta = min((loff_t)(inode_cluster_size(inode) -
+ win->count), to_file);
+ break;
+ case DATA_WINDOW:
+ /* stay */
+ win->stat = DATA_WINDOW;
+ /* off+count+delta=inv */
+ win->off = win->off + win->count;
+ win->count = win->delta;
+ win->delta = 0;
+ break;
+ default:
+ impossible("edward-282", "wrong next window state");
+ }
+ break;
+ default:
+ impossible("edward-283", "wrong current window state");
+ }
+ assert("edward-1068", cluster_ok(clust, inode));
+}
+
+static int update_sd_cryptcompress(struct inode *inode)
+{
+ int result = 0;
+
+ assert("edward-978", reiser4_schedulable());
+ /*
+ * Reserve space for stat-data update
+ */
+ result = reiser4_grab_space_force(estimate_update_common(inode),
+ BA_CAN_COMMIT,
+ get_meta_subvol());
+ if (result)
+ return result;
+ if (!IS_NOCMTIME(inode))
+ inode->i_ctime = inode->i_mtime = current_time(inode);
+
+ result = reiser4_update_sd(inode);
+
+ if (unlikely(result != 0))
+ warning("edward-1573",
+ "Can not update stat-data: %i. FSCK?",
+ result);
+ return result;
+}
+
+static void uncapture_cluster_jnode(jnode * node)
+{
+ txn_atom *atom;
+
+ assert_spin_locked(&(node->guard));
+
+ atom = jnode_get_atom(node);
+ if (atom == NULL) {
+ assert("jmacd-7111", !JF_ISSET(node, JNODE_DIRTY));
+ spin_unlock_jnode(node);
+ return;
+ }
+ reiser4_uncapture_block(node);
+ spin_unlock_atom(atom);
+ jput(node);
+}
+
+static void put_found_pages(struct page **pages, int nr)
+{
+ int i;
+ for (i = 0; i < nr; i++) {
+ assert("edward-1045", pages[i] != NULL);
+ put_cluster_page(pages[i]);
+ }
+}
+
+/* Lifecycle of a logical cluster in the system.
+ *
+ *
+ * Logical cluster of a cryptcompress file is represented in the system by
+ * . page cluster (in memory, primary cache, contains plain text);
+ * . disk cluster (in memory, secondary cache, contains transformed text).
+ * The primary cache reduces the number of transform operations (compression,
+ * encryption), i.e. it implements a transform-caching strategy. The
+ * secondary cache reduces the number of I/O operations, i.e. it implements
+ * the usual write-caching strategy. A page cluster is a set of pages, i.e. the
+ * mapping of a logical cluster to the primary cache. A disk cluster is a set
+ * of items of the same type defined by some reiser4 item plugin id.
+ *
+ * 1. Performing modifications
+ *
+ * Every modification of a cryptcompress file is considered as a set of
+ * operations performed on the file's logical clusters. Every such "atomic"
+ * modification truncates, appends and/or overwrites some bytes of a
+ * logical cluster in the primary cache, with subsequent synchronization
+ * with the secondary cache (at flush time). Disk clusters, which live in
+ * the secondary cache, are supposed to be synchronized with disk. The
+ * mechanism of synchronization of the primary and secondary caches includes
+ * the so-called checkin/checkout technique described below.
+ *
+ * 2. Submitting modifications
+ *
+ * Each page cluster has an associated jnode (a special in-memory header to
+ * keep track of transactions in reiser4), which is attached to its first
+ * page when grabbing the page cluster for modifications (see grab_page_cluster).
+ * Submitting modifications (see checkin_logical_cluster) goes per logical
+ * cluster and includes:
+ * . checkin_cluster_size;
+ * . checkin_page_cluster.
+ * checkin_cluster_size() is resolved to a file size update, which completely
+ * defines the new size of the logical cluster (the number of file bytes in
+ * the logical cluster).
+ * checkin_page_cluster() captures the jnode of a page cluster and sets the
+ * jnode's dirty flag (if needed) to indicate that the modifications were
+ * successfully checked in.
+ *
+ * 3. Checking out modifications
+ *
+ * This goes per logical cluster at flush time (see checkout_logical_cluster).
+ * This is the time when the primary and secondary caches are synchronized.
+ * checkout_logical_cluster() includes:
+ * . checkout_page_cluster (retrieving checked in pages);
+ * . uncapturing the jnode (including clearing the dirty flag and unlocking).
+ *
+ * 4. Committing modifications
+ *
+ * This completes the synchronization of the primary and secondary caches.
+ * When checking out a page cluster (the phase above), its pages are locked,
+ * flushed to a contiguous stream and unlocked one-by-one in ascending order
+ * of their indexes; the stream is then transformed (compressed, encrypted),
+ * chopped up into items and committed to disk as a disk cluster.
+ *
+ * 5. Managing page references
+ *
+ * Every checked in page has a special additional "control" reference,
+ * which is dropped at checkout. We need this to avoid unexpected eviction
+ * of pages from memory before checkout. Control references are managed so
+ * that they do not accumulate with every checkin:
+ *
+ * 0
+ * checkin -> 1
+ * 0 -> checkout
+ * checkin -> 1
+ * checkin -> 1
+ * checkin -> 1
+ * 0 -> checkout
+ * ...
+ *
+ * Every page cluster has its own unique "cluster lock". Update/drop
+ * references are serialized via this lock. Number of checked in cluster
+ * pages is calculated by i_size under cluster lock. File size is updated
+ * at every checkin action also under cluster lock (except cases of
+ * appending/truncating fake logical clusters).
+ *
+ * Proof of correctness:
+ *
+ * Since we update the file size under the cluster lock, in the case of a
+ * non-fake logical cluster with its lock held we do have the expected number
+ * of checked in pages. On the other hand, append/truncate of fake logical
+ * clusters doesn't change the number of checked in pages of any cluster.
+ *
+ * NOTE-EDWARD: As cluster lock we use guard (spinlock_t) of its jnode.
+ * Currently, I don't see any reason to create a special lock for those
+ * needs.
+ */
+
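+/*
+ * A minimal sketch of the order implied above, assuming a caller that has
+ * already reserved disk space for the cluster (function names are from this
+ * file; error handling is omitted):
+ *
+ *	grab_page_cluster(inode, clust, WRITE_OP);
+ *	...modify the pages in the primary cache...
+ *	checkin_logical_cluster(clust, inode);
+ *	...later, at flush time, checkout_logical_cluster() flushes the
+ *	   pages to the tfm stream and uncaptures the jnode...
+ */
+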
+static inline void lock_cluster(jnode * node)
+{
+ spin_lock_jnode(node);
+}
+
+static inline void unlock_cluster(jnode * node)
+{
+ spin_unlock_jnode(node);
+}
+
+static inline void unlock_cluster_uncapture(jnode * node)
+{
+ uncapture_cluster_jnode(node);
+}
+
+/* Set new file size by window. Cluster lock is required. */
+static void checkin_file_size(struct cluster_handle * clust,
+ struct inode * inode)
+{
+ loff_t new_size;
+ struct reiser4_slide * win;
+
+ assert("edward-1181", clust != NULL);
+ assert("edward-1182", inode != NULL);
+ assert("edward-1473", clust->pages != NULL);
+ assert("edward-1474", clust->pages[0] != NULL);
+ assert("edward-1475", jprivate(clust->pages[0]) != NULL);
+ assert_spin_locked(&(jprivate(clust->pages[0])->guard));
+
+
+ win = clust->win;
+ assert("edward-1183", win != NULL);
+
+ new_size = clust_to_off(clust->index, inode) + win->off;
+
+ switch (clust->op) {
+ case LC_APPOV:
+ case LC_EXPAND:
+ if (new_size + win->count <= i_size_read(inode))
+ /* overwrite only */
+ return;
+ new_size += win->count;
+ break;
+ case LC_SHRINK:
+ break;
+ default:
+ impossible("edward-1184", "bad page cluster option");
+ break;
+ }
+ inode_check_scale_nolock(inode, i_size_read(inode), new_size);
+ i_size_write(inode, new_size);
+ return;
+}
+
+static inline void checkin_cluster_size(struct cluster_handle * clust,
+ struct inode * inode)
+{
+ if (clust->win)
+ checkin_file_size(clust, inode);
+}
+
+static int checkin_page_cluster(struct cluster_handle * clust,
+ struct inode * inode)
+{
+ int result;
+ jnode * node;
+ int old_nrpages = clust->old_nrpages;
+ int new_nrpages = get_new_nrpages(clust);
+
+ node = clust->node;
+
+ assert("edward-221", node != NULL);
+ assert("edward-971", clust->reserved == 1);
+ assert("edward-1263",
+ clust->reserved_prepped == estimate_update_cluster(inode));
+ assert("edward-1264", clust->reserved_unprepped == 0);
+
+ if (JF_ISSET(node, JNODE_DIRTY)) {
+ /*
+ * page cluster was checked in, but not yet
+ * checked out, so release related resources
+ */
+ free_reserved4cluster(inode, clust,
+ estimate_update_cluster(inode));
+ __put_page_cluster(0, clust->old_nrpages,
+ clust->pages, inode);
+ } else {
+ result = capture_cluster_jnode(node);
+ if (unlikely(result)) {
+ unlock_cluster(node);
+ return result;
+ }
+ jnode_make_dirty_locked(node);
+ clust->reserved = 0;
+ }
+ unlock_cluster(node);
+
+ if (new_nrpages < old_nrpages) {
+ /* truncate >= 1 complete pages */
+ __put_page_cluster(new_nrpages,
+ old_nrpages - new_nrpages,
+ clust->pages, inode);
+ truncate_page_cluster_range(inode,
+ clust->pages, clust->index,
+ new_nrpages,
+ old_nrpages - new_nrpages,
+ 0);
+ }
+#if REISER4_DEBUG
+ clust->reserved_prepped -= estimate_update_cluster(inode);
+#endif
+ return 0;
+}
+
+/* Submit modifications of a logical cluster */
+static int checkin_logical_cluster(struct cluster_handle * clust,
+ struct inode *inode)
+{
+ int result = 0;
+ jnode * node;
+
+ node = clust->node;
+
+ assert("edward-1035", node != NULL);
+ assert("edward-1029", clust != NULL);
+ assert("edward-1030", clust->reserved == 1);
+ assert("edward-1031", clust->nr_pages != 0);
+ assert("edward-1032", clust->pages != NULL);
+ assert("edward-1033", clust->pages[0] != NULL);
+ assert("edward-1446", jnode_is_cluster_page(node));
+ assert("edward-1476", node == jprivate(clust->pages[0]));
+
+ lock_cluster(node);
+ checkin_cluster_size(clust, inode);
+ /*
+ * this will unlock the cluster
+ */
+ result = checkin_page_cluster(clust, inode);
+ jput(node);
+ clust->node = NULL;
+ return result;
+}
+
+/*
+ * Retrieve size of logical cluster that was checked in at
+ * the latest modifying session (cluster lock is required)
+ */
+static inline void checkout_cluster_size(struct cluster_handle * clust,
+ struct inode * inode)
+{
+ struct tfm_cluster *tc = &clust->tc;
+
+ tc->len = lbytes(clust->index, inode);
+ assert("edward-1478", tc->len != 0);
+}
+
+/*
+ * Retrieve a page cluster with the latest submitted modifications
+ * and flush its pages to a previously allocated contiguous stream.
+ */
+static void checkout_page_cluster(struct cluster_handle * clust,
+ jnode * node, struct inode * inode)
+{
+ int i;
+ int found;
+ int to_put;
+ pgoff_t page_index = clust_to_pg(clust->index, inode);
+ struct tfm_cluster *tc = &clust->tc;
+
+ /* find and put checked in pages: cluster is locked,
+ * so we must get expected number (to_put) of pages
+ */
+ to_put = size_in_pages(lbytes(clust->index, inode));
+ found = find_get_pages(inode->i_mapping, &page_index,
+ to_put, clust->pages);
+ BUG_ON(found != to_put);
+
+ __put_page_cluster(0, to_put, clust->pages, inode);
+ unlock_cluster_uncapture(node);
+
+ /* Flush found pages.
+ *
+	 * Note that we don't disable modifications while flushing;
+	 * moreover, some found pages can be truncated, as we have
+	 * released the cluster lock.
+ */
+ for (i = 0; i < found; i++) {
+ int in_page;
+ char * data;
+ assert("edward-1479",
+ clust->pages[i]->index == clust->pages[0]->index + i);
+
+ lock_page(clust->pages[i]);
+ if (!PageUptodate(clust->pages[i])) {
+ /* page was truncated */
+ assert("edward-1480",
+ i_size_read(inode) <= page_offset(clust->pages[i]));
+ assert("edward-1481",
+ clust->pages[i]->mapping != inode->i_mapping);
+ unlock_page(clust->pages[i]);
+ break;
+ }
+		/* Update the number of bytes in the logical cluster,
+		 * as it could have been partially truncated. Note that
+		 * only partial truncate is possible here (a complete
+		 * truncate cannot get here, as it is performed via
+		 * ->kill_hook() called by cut_file_items(), and the
+		 * latter must wait for the znode locked with the
+		 * parent coord).
+		 */
+ checkout_cluster_size(clust, inode);
+
+ /* this can be zero, as new file size is
+ checked in before truncating pages */
+ in_page = __mbp(tc->len, i);
+
+ data = kmap_atomic(clust->pages[i]);
+ memcpy(tfm_stream_data(tc, INPUT_STREAM) + pg_to_off(i),
+ data, in_page);
+ kunmap_atomic(data);
+ /*
+ * modifications have been checked out and will be
+ * committed later. Anyway, the dirty status of the
+ * page is no longer relevant. However, the uptodate
+ * status of the page is still relevant!
+ */
+ if (PageDirty(clust->pages[i]))
+ cancel_dirty_page(clust->pages[i]);
+
+ unlock_page(clust->pages[i]);
+
+ if (in_page < PAGE_SIZE)
+ /* end of the file */
+ break;
+ }
+ put_found_pages(clust->pages, found); /* find_get_pages */
+ tc->lsize = tc->len;
+ return;
+}
+
+/* Check out modifications of a logical cluster */
+int checkout_logical_cluster(struct cluster_handle * clust,
+ jnode * node, struct inode *inode)
+{
+ int result;
+ struct tfm_cluster *tc = &clust->tc;
+
+ assert("edward-980", node != NULL);
+ assert("edward-236", inode != NULL);
+ assert("edward-237", clust != NULL);
+ assert("edward-240", !clust->win);
+ assert("edward-241", reiser4_schedulable());
+ assert("edward-718", cryptcompress_inode_ok(inode));
+
+ result = grab_tfm_stream(inode, tc, INPUT_STREAM);
+ if (result) {
+ warning("edward-1430", "alloc stream failed with ret=%d",
+ result);
+ return RETERR(-E_REPEAT);
+ }
+ lock_cluster(node);
+
+ if (unlikely(!JF_ISSET(node, JNODE_DIRTY))) {
+ /* race with another flush */
+ warning("edward-982",
+ "checking out logical cluster %lu of inode %llu: "
+ "jnode is not dirty", clust->index,
+ (unsigned long long)get_inode_oid(inode));
+ unlock_cluster(node);
+ return RETERR(-E_REPEAT);
+ }
+ cluster_reserved2grabbed(estimate_update_cluster(inode),
+ get_meta_subvol());
+
+ /* this will unlock cluster */
+ checkout_page_cluster(clust, node, inode);
+ return 0;
+}
+
+/* set hint for the cluster of the index @index */
+static void set_hint_cluster(struct inode *inode, hint_t * hint,
+ cloff_t index, znode_lock_mode mode)
+{
+ reiser4_key key;
+ assert("edward-722", cryptcompress_inode_ok(inode));
+ assert("edward-723",
+ inode_file_plugin(inode) ==
+ file_plugin_by_id(CRYPTCOMPRESS_FILE_PLUGIN_ID));
+
+ build_body_key_cryptcompress(inode,
+ clust_to_off(index, inode),
+ &key);
+ reiser4_seal_init(&hint->seal, &hint->ext_coord.coord, &key);
+ hint->offset = get_key_offset(&key);
+ hint->mode = mode;
+}
+
+void invalidate_hint_cluster(struct cluster_handle * clust)
+{
+ assert("edward-1291", clust != NULL);
+ assert("edward-1292", clust->hint != NULL);
+
+ done_lh(&clust->hint->lh);
+ hint_clr_valid(clust->hint);
+}
+
+static void put_hint_cluster(struct cluster_handle * clust,
+ struct inode *inode, znode_lock_mode mode)
+{
+ assert("edward-1286", clust != NULL);
+ assert("edward-1287", clust->hint != NULL);
+
+ set_hint_cluster(inode, clust->hint, clust->index + 1, mode);
+ invalidate_hint_cluster(clust);
+}
+
+static int balance_dirty_page_cluster(struct cluster_handle * clust,
+ struct inode *inode, loff_t off,
+ loff_t to_file,
+ int nr_dirtied)
+{
+ int result;
+ struct cryptcompress_info * info;
+
+ assert("edward-724", inode != NULL);
+ assert("edward-725", cryptcompress_inode_ok(inode));
+ assert("edward-1547", nr_dirtied <= cluster_nrpages(inode));
+
+ /* set next window params */
+ move_update_window(inode, clust, off, to_file);
+
+ result = update_sd_cryptcompress(inode);
+ if (result)
+ return result;
+ assert("edward-726", clust->hint->lh.owner == NULL);
+ info = cryptcompress_inode_data(inode);
+
+ if (nr_dirtied == 0)
+ return 0;
+ mutex_unlock(&info->checkin_mutex);
+ reiser4_throttle_write(inode);
+ mutex_lock(&info->checkin_mutex);
+ return 0;
+}
+
+/*
+ * Check in part of a hole within a logical cluster
+ */
+static int write_hole(struct inode *inode, struct cluster_handle * clust,
+ loff_t file_off, loff_t to_file)
+{
+ int result = 0;
+ unsigned cl_off, cl_count = 0;
+ unsigned to_pg, pg_off;
+ struct reiser4_slide * win;
+
+ assert("edward-190", clust != NULL);
+ assert("edward-1069", clust->win != NULL);
+ assert("edward-191", inode != NULL);
+ assert("edward-727", cryptcompress_inode_ok(inode));
+ assert("edward-1171", clust->dstat != INVAL_DISK_CLUSTER);
+ assert("edward-1154",
+ ergo(clust->dstat != FAKE_DISK_CLUSTER, clust->reserved == 1));
+
+ win = clust->win;
+
+ assert("edward-1070", win != NULL);
+ assert("edward-201", win->stat == HOLE_WINDOW);
+ assert("edward-192", cluster_ok(clust, inode));
+
+ if (win->off == 0 && win->count == inode_cluster_size(inode)) {
+ /*
+ * This part of the hole occupies the whole logical
+ * cluster, so it won't be represented by any items.
+ * Nothing to submit.
+ */
+ move_update_window(inode, clust, file_off, to_file);
+ return 0;
+ }
+	/*
+	 * This part of the hole does not start at a logical cluster
+	 * boundary, so it has to be converted to zeros and written to disk
+	 */
+ cl_count = win->count; /* number of zeroes to write */
+ cl_off = win->off;
+ pg_off = off_to_pgoff(win->off);
+
+ while (cl_count) {
+ struct page *page;
+ page = clust->pages[off_to_pg(cl_off)];
+
+ assert("edward-284", page != NULL);
+
+ to_pg = min((typeof(pg_off))PAGE_SIZE - pg_off, cl_count);
+ lock_page(page);
+ zero_user(page, pg_off, to_pg);
+ SetPageUptodate(page);
+ set_page_dirty_notag(page);
+ mark_page_accessed(page);
+ unlock_page(page);
+
+ cl_off += to_pg;
+ cl_count -= to_pg;
+ pg_off = 0;
+ }
+ if (win->delta == 0) {
+ /* only zeroes in this window, try to capture
+ */
+ result = checkin_logical_cluster(clust, inode);
+ if (result)
+ return result;
+ put_hint_cluster(clust, inode, ZNODE_WRITE_LOCK);
+ result = balance_dirty_page_cluster(clust,
+ inode, file_off, to_file,
+ win_count_to_nrpages(win));
+ } else
+ move_update_window(inode, clust, file_off, to_file);
+ return result;
+}
+
+/*
+  The main disk search procedure for the cryptcompress plugin. It
+  . scans all items of the disk cluster with the lock mode @mode
+  . possibly reads each of them (if @read is set)
+  . possibly makes their znodes dirty (if write lock mode was specified)
+
+  NOTE-EDWARD: Callers should handle the case when the disk cluster
+  is incomplete (-EIO)
+*/
+int find_disk_cluster(struct cluster_handle * clust,
+ struct inode *inode, int read, znode_lock_mode mode)
+{
+ flow_t f;
+ hint_t *hint;
+ int result = 0;
+ int was_grabbed;
+ ra_info_t ra_info;
+ file_plugin *fplug;
+ item_plugin *iplug;
+ struct tfm_cluster *tc;
+ struct cryptcompress_info *info;
+ reiser4_subvol *subv = get_meta_subvol();
+
+ assert("edward-138", clust != NULL);
+ assert("edward-728", clust->hint != NULL);
+ assert("edward-226", reiser4_schedulable());
+ assert("edward-137", inode != NULL);
+ assert("edward-729", cryptcompress_inode_ok(inode));
+
+ hint = clust->hint;
+ fplug = inode_file_plugin(inode);
+ was_grabbed = ctx_subvol_grabbed(get_current_context(), subv->id);
+ info = cryptcompress_inode_data(inode);
+ tc = &clust->tc;
+
+ assert("edward-462", !tfm_cluster_is_uptodate(tc));
+ assert("edward-461", ergo(read, tfm_stream_is_set(tc, INPUT_STREAM)));
+
+ dclust_init_extension(hint);
+
+ /*
+ * set key of the first disk cluster item
+ */
+ flow_by_inode_cryptcompress(inode,
+ (read ? (char __user *)tfm_stream_data(tc, INPUT_STREAM) : NULL),
+ 0 /* kernel space */ ,
+ inode_scaled_cluster_size(inode),
+ clust_to_off(clust->index, inode), READ_OP, &f);
+
+ if (mode == ZNODE_WRITE_LOCK) {
+ /*
+ * reserve for flush to make dirty all the leaf nodes
+ * which contain disk cluster
+ */
+ result = reiser4_grab_space_force(estimate_dirty_cluster(inode),
+ BA_CAN_COMMIT, subv);
+ if (result)
+ goto out;
+ }
+
+ ra_info.key_to_stop = f.key;
+ set_key_offset(&ra_info.key_to_stop, get_key_offset(reiser4_max_key()));
+
+ while (f.length) {
+ result = find_cluster_item(hint, &f.key, mode,
+ NULL, FIND_EXACT,
+ (mode == ZNODE_WRITE_LOCK ?
+ CBK_FOR_INSERT : 0), subv);
+ switch (result) {
+ case CBK_COORD_NOTFOUND:
+ result = 0;
+ if (inode_scaled_offset
+ (inode, clust_to_off(clust->index, inode)) ==
+ get_key_offset(&f.key)) {
+				/* the first item was not found; this is
+				   treated as the disk cluster being absent */
+ clust->dstat = FAKE_DISK_CLUSTER;
+ goto out;
+ }
+ /* we are outside the cluster, stop search here */
+ assert("edward-146",
+ f.length != inode_scaled_cluster_size(inode));
+ goto ok;
+ case CBK_COORD_FOUND:
+ assert("edward-148",
+ hint->ext_coord.coord.between == AT_UNIT);
+ assert("edward-460",
+ hint->ext_coord.coord.unit_pos == 0);
+
+ coord_clear_iplug(&hint->ext_coord.coord);
+ result = zload_ra(hint->ext_coord.coord.node, &ra_info);
+ if (unlikely(result))
+ goto out;
+ iplug = item_plugin_by_coord(&hint->ext_coord.coord);
+ assert("edward-147",
+ item_id_by_coord(&hint->ext_coord.coord) ==
+ CTAIL_ID);
+
+ result = read_ctail(NULL, &f, hint);
+ if (result) {
+ zrelse(hint->ext_coord.coord.node);
+ goto out;
+ }
+ if (mode == ZNODE_WRITE_LOCK) {
+				/* Don't dirty more nodes than was
+				   estimated (see comments before
+				   estimate_dirty_cluster). Missed nodes
+				   will be read in at flush time if they
+				   have been evicted from memory */
+ if (dclust_get_extension_ncount(hint) <=
+ estimate_dirty_cluster(inode))
+ znode_make_dirty(hint->ext_coord.coord.node);
+
+ znode_set_convertible(hint->ext_coord.coord.
+ node);
+ }
+ zrelse(hint->ext_coord.coord.node);
+ break;
+ default:
+ goto out;
+ }
+ }
+ ok:
+ /* at least one item was found */
+ /* NOTE-EDWARD: Callers should handle the case
+ when disk cluster is incomplete (-EIO) */
+ tc->len = inode_scaled_cluster_size(inode) - f.length;
+ tc->lsize = lbytes(clust->index, inode);
+ assert("edward-1196", tc->len > 0);
+ assert("edward-1406", tc->lsize > 0);
+
+ if (hint_is_unprepped_dclust(clust->hint)) {
+ clust->dstat = UNPR_DISK_CLUSTER;
+ } else if (clust->index == info->trunc_index) {
+ clust->dstat = TRNC_DISK_CLUSTER;
+ } else {
+ clust->dstat = PREP_DISK_CLUSTER;
+ dclust_set_extension_dsize(clust->hint, tc->len);
+ }
+ out:
+ grabbed2free_mark(was_grabbed, subv);
+
+ return result;
+}
+
+int get_disk_cluster_locked(struct cluster_handle * clust, struct inode *inode,
+ znode_lock_mode lock_mode)
+{
+ reiser4_key key;
+ ra_info_t ra_info;
+
+ assert("edward-730", reiser4_schedulable());
+ assert("edward-731", clust != NULL);
+ assert("edward-732", inode != NULL);
+
+ if (hint_is_valid(clust->hint)) {
+ assert("edward-1293", clust->dstat != INVAL_DISK_CLUSTER);
+ assert("edward-1294",
+ znode_is_write_locked(clust->hint->lh.node));
+ /* already have a valid locked position */
+ return (clust->dstat ==
+ FAKE_DISK_CLUSTER ? CBK_COORD_NOTFOUND :
+ CBK_COORD_FOUND);
+ }
+ build_body_key_cryptcompress(inode, clust_to_off(clust->index, inode),
+ &key);
+ ra_info.key_to_stop = key;
+ set_key_offset(&ra_info.key_to_stop, get_key_offset(reiser4_max_key()));
+
+ return find_cluster_item(clust->hint, &key, lock_mode, NULL, FIND_EXACT,
+ CBK_FOR_INSERT, get_meta_subvol());
+}
+
+/* Read the needed cluster pages before modifying.
+   On success, @clust->hint contains a locked position in the tree.
+   Also:
+   . find and set the disk cluster state;
+   . make the disk cluster dirty if its state is not FAKE_DISK_CLUSTER.
+*/
+static int read_some_cluster_pages(struct inode * inode,
+ struct cluster_handle * clust)
+{
+ int i;
+ int result = 0;
+ item_plugin *iplug;
+ struct reiser4_slide * win = clust->win;
+ znode_lock_mode mode = ZNODE_WRITE_LOCK;
+
+ iplug = item_plugin_by_id(CTAIL_ID);
+
+ assert("edward-924", !tfm_cluster_is_uptodate(&clust->tc));
+
+#if REISER4_DEBUG
+ if (clust->nr_pages == 0) {
+ /* start write hole from fake disk cluster */
+ assert("edward-1117", win != NULL);
+ assert("edward-1118", win->stat == HOLE_WINDOW);
+ assert("edward-1119", new_logical_cluster(clust, inode));
+ }
+#endif
+ if (new_logical_cluster(clust, inode)) {
+		/*
+		 * a new page cluster is about to be written, nothing to read
+		 */
+ assert("edward-734", reiser4_schedulable());
+ assert("edward-735", clust->hint->lh.owner == NULL);
+
+ if (clust->nr_pages) {
+ int off;
+ struct page * pg;
+ assert("edward-1419", clust->pages != NULL);
+ pg = clust->pages[clust->nr_pages - 1];
+ assert("edward-1420", pg != NULL);
+ off = off_to_pgoff(win->off+win->count+win->delta);
+ if (off) {
+ lock_page(pg);
+ zero_user_segment(pg, off, PAGE_SIZE);
+ unlock_page(pg);
+ }
+ }
+ clust->dstat = FAKE_DISK_CLUSTER;
+ return 0;
+ }
+ /*
+ Here we should search for disk cluster to figure out its real state.
+ Also there is one more important reason to do disk search: we need
+ to make disk cluster _dirty_ if it exists
+ */
+
+	/* if a window is specified, read only the pages
+	   that will be modified partially */
+
+ for (i = 0; i < clust->nr_pages; i++) {
+ struct page *pg = clust->pages[i];
+
+ lock_page(pg);
+ if (PageUptodate(pg)) {
+ unlock_page(pg);
+ continue;
+ }
+ unlock_page(pg);
+
+ if (win &&
+ i >= size_in_pages(win->off) &&
+ i < off_to_pg(win->off + win->count + win->delta))
+ /* page will be completely overwritten */
+ continue;
+
+ if (win && (i == clust->nr_pages - 1) &&
+ /* the last page is
+ partially modified,
+ not uptodate .. */
+ (size_in_pages(i_size_read(inode)) <= pg->index)) {
+ /* .. and appended,
+ so set zeroes to the rest */
+ int offset;
+ lock_page(pg);
+ assert("edward-1260",
+ size_in_pages(win->off + win->count +
+ win->delta) - 1 == i);
+
+ offset =
+ off_to_pgoff(win->off + win->count + win->delta);
+ zero_user_segment(pg, offset, PAGE_SIZE);
+ unlock_page(pg);
+ /* still not uptodate */
+ break;
+ }
+ lock_page(pg);
+ result = do_readpage_ctail(inode, clust, pg, mode);
+
+ assert("edward-1526", ergo(!result, PageUptodate(pg)));
+ unlock_page(pg);
+ if (result) {
+ warning("edward-219", "do_readpage_ctail failed");
+ goto out;
+ }
+ }
+ if (!tfm_cluster_is_uptodate(&clust->tc)) {
+		/* the disk cluster is unclaimed, but we need to make its
+		 * znodes dirty so that flush will convert its content
+		 */
+ result = find_disk_cluster(clust, inode,
+ 0 /* do not read items */,
+ mode);
+ }
+ out:
+ tfm_cluster_clr_uptodate(&clust->tc);
+ return result;
+}
+
+static int should_create_unprepped_cluster(struct cluster_handle * clust,
+ struct inode * inode)
+{
+ assert("edward-737", clust != NULL);
+
+ switch (clust->dstat) {
+ case PREP_DISK_CLUSTER:
+ case UNPR_DISK_CLUSTER:
+ return 0;
+ case FAKE_DISK_CLUSTER:
+ if (clust->win &&
+ clust->win->stat == HOLE_WINDOW && clust->nr_pages == 0) {
+ assert("edward-1172",
+ new_logical_cluster(clust, inode));
+ return 0;
+ }
+ return 1;
+ default:
+ impossible("edward-1173", "bad disk cluster state");
+ return 0;
+ }
+}
+
+static int cryptcompress_make_unprepped_cluster(struct cluster_handle * clust,
+ struct inode *inode)
+{
+ int result;
+ reiser4_subvol *subv = get_meta_subvol();
+
+ assert("edward-1123", reiser4_schedulable());
+ assert("edward-737", clust != NULL);
+ assert("edward-738", inode != NULL);
+ assert("edward-739", cryptcompress_inode_ok(inode));
+ assert("edward-1053", clust->hint != NULL);
+
+ if (!should_create_unprepped_cluster(clust, inode)) {
+ if (clust->reserved) {
+ cluster_reserved2free(estimate_insert_cluster(inode),
+ subv);
+#if REISER4_DEBUG
+ assert("edward-1267",
+ clust->reserved_unprepped ==
+ estimate_insert_cluster(inode));
+ clust->reserved_unprepped -=
+ estimate_insert_cluster(inode);
+#endif
+ }
+ return 0;
+ }
+ assert("edward-1268", clust->reserved);
+ cluster_reserved2grabbed(estimate_insert_cluster(inode), subv);
+#if REISER4_DEBUG
+ assert("edward-1441",
+ clust->reserved_unprepped == estimate_insert_cluster(inode));
+ clust->reserved_unprepped -= estimate_insert_cluster(inode);
+#endif
+ result = ctail_insert_unprepped_cluster(clust, inode);
+ if (result)
+ return result;
+
+ inode_add_bytes(inode, inode_cluster_size(inode));
+
+ assert("edward-743", cryptcompress_inode_ok(inode));
+ assert("edward-744", znode_is_write_locked(clust->hint->lh.node));
+
+ clust->dstat = UNPR_DISK_CLUSTER;
+ return 0;
+}
+
+/* . Grab page cluster for read, write, setattr, etc. operations;
+ * . Truncate its complete pages, if needed;
+ */
+int prepare_page_cluster(struct inode * inode, struct cluster_handle * clust,
+ rw_op rw)
+{
+ assert("edward-177", inode != NULL);
+ assert("edward-741", cryptcompress_inode_ok(inode));
+ assert("edward-740", clust->pages != NULL);
+
+ set_cluster_nrpages(clust, inode);
+ reset_cluster_pgset(clust, cluster_nrpages(inode));
+ return grab_page_cluster(inode, clust, rw);
+}
+
+/* Truncate complete page cluster of index @index.
+ * This is called by ->kill_hook() method of item
+ * plugin when deleting a disk cluster of such index.
+ */
+void truncate_complete_page_cluster(struct inode *inode, cloff_t index,
+ int even_cows)
+{
+ int found;
+ int nr_pages;
+ jnode *node;
+ pgoff_t page_index = clust_to_pg(index, inode);
+ struct page *pages[MAX_CLUSTER_NRPAGES];
+ reiser4_subvol *subv = get_meta_subvol();
+
+ node = jlookup(get_inode_oid(inode), clust_to_pg(index, inode));
+ nr_pages = size_in_pages(lbytes(index, inode));
+ assert("edward-1483", nr_pages != 0);
+ if (!node)
+ goto truncate;
+ found = find_get_pages(inode->i_mapping, &page_index,
+ cluster_nrpages(inode), pages);
+ if (!found) {
+ assert("edward-1484", jnode_truncate_ok(inode, index));
+ return;
+ }
+ lock_cluster(node);
+
+ if (reiser4_inode_get_flag(inode, REISER4_FILE_IN_CONVERSION)
+ && index == 0)
+ /* converting to unix_file is in progress */
+ JF_CLR(node, JNODE_CLUSTER_PAGE);
+ if (JF_ISSET(node, JNODE_DIRTY)) {
+ /*
+ * @nr_pages were checked in, but not yet checked out -
+ * we need to release them. (also there can be pages
+ * attached to page cache by read(), etc. - don't take
+ * them into account).
+ */
+ assert("edward-1198", found >= nr_pages);
+
+		/* free disk space grabbed for disk cluster conversion */
+ cluster_reserved2grabbed(estimate_update_cluster(inode), subv);
+ grabbed2free(get_current_context(),
+ get_current_super_private(),
+ estimate_update_cluster(inode), subv);
+ __put_page_cluster(0, nr_pages, pages, inode);
+
+ /* This will clear dirty bit, uncapture and unlock jnode */
+ unlock_cluster_uncapture(node);
+ } else
+ unlock_cluster(node);
+ jput(node); /* jlookup */
+ put_found_pages(pages, found); /* find_get_pages */
+ truncate:
+ if (reiser4_inode_get_flag(inode, REISER4_FILE_IN_CONVERSION) &&
+ index == 0)
+ return;
+ truncate_page_cluster_range(inode, pages, index, 0,
+ cluster_nrpages(inode),
+ even_cows);
+ assert("edward-1201",
+ ergo(!reiser4_inode_get_flag(inode,
+ REISER4_FILE_IN_CONVERSION),
+ jnode_truncate_ok(inode, index)));
+ return;
+}
+
+/*
+ * Set cluster handle @clust of a logical cluster before
+ * modifications which are supposed to be committed.
+ *
+ * . grab cluster pages;
+ * . reserve disk space;
+ * . maybe read pages from disk and set the disk cluster dirty;
+ * . maybe write hole and check in (partially zeroed) logical cluster;
+ * . create 'unprepped' disk cluster for new or fake logical one.
+ */
+static int prepare_logical_cluster(struct inode *inode,
+ loff_t file_off, /* write position
+ in the file */
+				   loff_t to_file, /* bytes of user's data
+						      to write to the file */
+ struct cluster_handle * clust,
+ logical_cluster_op op)
+{
+ int result = 0;
+ struct reiser4_slide * win = clust->win;
+
+ reset_cluster_params(clust);
+ cluster_set_tfm_act(&clust->tc, TFMA_READ);
+#if REISER4_DEBUG
+ clust->ctx = get_current_context();
+#endif
+ assert("edward-1190", op != LC_INVAL);
+
+ clust->op = op;
+
+ result = prepare_page_cluster(inode, clust, WRITE_OP);
+ if (result)
+ return result;
+ assert("edward-1447",
+ ergo(clust->nr_pages != 0, jprivate(clust->pages[0])));
+ assert("edward-1448",
+ ergo(clust->nr_pages != 0,
+ jnode_is_cluster_page(jprivate(clust->pages[0]))));
+
+ result = reserve4cluster(inode, clust);
+ if (result)
+ goto out;
+
+ result = read_some_cluster_pages(inode, clust);
+
+ if (result ||
+ /*
+ * don't submit data modifications
+ * when expanding or shrinking holes
+ */
+ (op == LC_SHRINK && clust->dstat == FAKE_DISK_CLUSTER) ||
+ (op == LC_EXPAND && clust->dstat == FAKE_DISK_CLUSTER)){
+ free_reserved4cluster(inode,
+ clust,
+ estimate_update_cluster(inode) +
+ estimate_insert_cluster(inode));
+ goto out;
+ }
+ assert("edward-1124", clust->dstat != INVAL_DISK_CLUSTER);
+
+ result = cryptcompress_make_unprepped_cluster(clust, inode);
+ if (result)
+ goto error;
+ if (win && win->stat == HOLE_WINDOW) {
+ result = write_hole(inode, clust, file_off, to_file);
+ if (result)
+ goto error;
+ }
+ return 0;
+ error:
+ free_reserved4cluster(inode, clust,
+ estimate_update_cluster(inode));
+ out:
+ put_page_cluster(clust, inode, WRITE_OP);
+ return result;
+}
+
+/* set window by two offsets */
+static void set_window(struct cluster_handle * clust,
+ struct reiser4_slide * win, struct inode *inode,
+ loff_t o1, loff_t o2)
+{
+ assert("edward-295", clust != NULL);
+ assert("edward-296", inode != NULL);
+ assert("edward-1071", win != NULL);
+ assert("edward-297", o1 <= o2);
+
+ clust->index = off_to_clust(o1, inode);
+
+ win->off = off_to_cloff(o1, inode);
+ win->count = min((loff_t)(inode_cluster_size(inode) - win->off),
+ o2 - o1);
+ win->delta = 0;
+
+ clust->win = win;
+}
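+
+/*
+ * Worked example (assuming a 64K logical cluster): set_window(clust,
+ * win, inode, 70000, 70100) yields clust->index == 1, win->off == 4464
+ * and win->count == 100, i.e. the window covers bytes 4464..4563 of
+ * logical cluster 1; win->delta is reset to 0.
+ */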
+
+static int set_window_and_cluster(struct inode *inode,
+ struct cluster_handle * clust,
+ struct reiser4_slide * win, size_t length,
+ loff_t file_off)
+{
+ int result;
+
+ assert("edward-197", clust != NULL);
+ assert("edward-1072", win != NULL);
+ assert("edward-198", inode != NULL);
+
+ result = alloc_cluster_pgset(clust, cluster_nrpages(inode));
+ if (result)
+ return result;
+
+ if (file_off > i_size_read(inode)) {
+ /* Uhmm, hole in cryptcompress file... */
+ loff_t hole_size;
+ hole_size = file_off - inode->i_size;
+
+ set_window(clust, win, inode, inode->i_size, file_off);
+ win->stat = HOLE_WINDOW;
+ if (win->off + hole_size < inode_cluster_size(inode))
+ /* there is also user's data to append to the hole */
+ win->delta = min(inode_cluster_size(inode) -
+ (win->off + win->count), length);
+ return 0;
+ }
+ set_window(clust, win, inode, file_off, file_off + length);
+ win->stat = DATA_WINDOW;
+ return 0;
+}
+
+int set_cluster_by_page(struct cluster_handle * clust, struct page * page,
+ int count)
+{
+ int result = 0;
+ int (*setting_actor)(struct cluster_handle * clust, int count);
+
+ assert("edward-1358", clust != NULL);
+ assert("edward-1359", page != NULL);
+ assert("edward-1360", page->mapping != NULL);
+ assert("edward-1361", page->mapping->host != NULL);
+
+ setting_actor =
+ (clust->pages ? reset_cluster_pgset : alloc_cluster_pgset);
+ result = setting_actor(clust, count);
+ clust->index = pg_to_clust(page->index, page->mapping->host);
+ return result;
+}
+
+/* reset all the params that don't get updated */
+void reset_cluster_params(struct cluster_handle * clust)
+{
+ assert("edward-197", clust != NULL);
+
+ clust->dstat = INVAL_DISK_CLUSTER;
+ clust->tc.uptodate = 0;
+ clust->tc.len = 0;
+}
+
+/* the heart of write_cryptcompress */
+static loff_t do_write_cryptcompress(struct file *file, struct inode *inode,
+ const char __user *buf, size_t to_write,
+ loff_t pos, struct dispatch_context *cont)
+{
+ int i;
+ hint_t *hint;
+ int result = 0;
+ size_t count;
+ struct reiser4_slide win;
+ struct cluster_handle clust;
+ struct cryptcompress_info * info;
+
+ assert("edward-154", buf != NULL);
+ assert("edward-161", reiser4_schedulable());
+ assert("edward-748", cryptcompress_inode_ok(inode));
+ assert("edward-159", current_blocksize == PAGE_SIZE);
+
+ hint = kmalloc(sizeof(*hint), reiser4_ctx_gfp_mask_get());
+ if (hint == NULL)
+ return RETERR(-ENOMEM);
+
+ result = load_file_hint(file, hint);
+ if (result) {
+ kfree(hint);
+ return result;
+ }
+ count = to_write;
+
+ reiser4_slide_init(&win);
+ cluster_init_read(&clust, &win);
+ clust.hint = hint;
+ info = cryptcompress_inode_data(inode);
+
+ mutex_lock(&info->checkin_mutex);
+
+ result = set_window_and_cluster(inode, &clust, &win, to_write, pos);
+ if (result)
+ goto out;
+
+ if (next_window_stat(&win) == HOLE_WINDOW) {
+		/* write the hole in this iteration,
+		   separately from the loop below */
+ result = write_dispatch_hook(file, inode,
+ pos, &clust, cont);
+ if (result)
+ goto out;
+ result = prepare_logical_cluster(inode, pos, count, &clust,
+ LC_APPOV);
+ if (result)
+ goto out;
+ }
+ do {
+ const char __user * src;
+ unsigned page_off, to_page;
+
+ assert("edward-750", reiser4_schedulable());
+
+ result = write_dispatch_hook(file, inode,
+ pos + to_write - count,
+ &clust, cont);
+ if (result)
+ goto out;
+ if (cont->state == DISPATCH_ASSIGNED_NEW)
+ /* done_lh was called in write_dispatch_hook */
+ goto out_no_longterm_lock;
+
+ result = prepare_logical_cluster(inode, pos, count, &clust,
+ LC_APPOV);
+ if (result)
+ goto out;
+
+ assert("edward-751", cryptcompress_inode_ok(inode));
+ assert("edward-204", win.stat == DATA_WINDOW);
+ assert("edward-1288", hint_is_valid(clust.hint));
+ assert("edward-752",
+ znode_is_write_locked(hint->ext_coord.coord.node));
+ put_hint_cluster(&clust, inode, ZNODE_WRITE_LOCK);
+
+ /* set write position in page */
+ page_off = off_to_pgoff(win.off);
+
+ /* copy user's data to cluster pages */
+ for (i = off_to_pg(win.off), src = buf;
+ i < size_in_pages(win.off + win.count);
+ i++, src += to_page) {
+ to_page = __mbp(win.off + win.count, i) - page_off;
+ assert("edward-1039",
+ page_off + to_page <= PAGE_SIZE);
+ assert("edward-287", clust.pages[i] != NULL);
+
+ fault_in_pages_readable(src, to_page);
+
+ lock_page(clust.pages[i]);
+ result =
+ __copy_from_user((char *)kmap(clust.pages[i]) +
+ page_off, src, to_page);
+ kunmap(clust.pages[i]);
+ if (unlikely(result)) {
+ unlock_page(clust.pages[i]);
+ result = -EFAULT;
+ goto err2;
+ }
+ SetPageUptodate(clust.pages[i]);
+ set_page_dirty_notag(clust.pages[i]);
+ flush_dcache_page(clust.pages[i]);
+ mark_page_accessed(clust.pages[i]);
+ unlock_page(clust.pages[i]);
+ page_off = 0;
+ }
+ assert("edward-753", cryptcompress_inode_ok(inode));
+
+ result = checkin_logical_cluster(&clust, inode);
+ if (result)
+ goto err2;
+
+ buf += win.count;
+ count -= win.count;
+
+ result = balance_dirty_page_cluster(&clust, inode, 0, count,
+ win_count_to_nrpages(&win));
+ if (result)
+ goto err1;
+ assert("edward-755", hint->lh.owner == NULL);
+ reset_cluster_params(&clust);
+ continue;
+ err2:
+ put_page_cluster(&clust, inode, WRITE_OP);
+ err1:
+ if (clust.reserved)
+ free_reserved4cluster(inode,
+ &clust,
+ estimate_update_cluster(inode));
+ break;
+ } while (count);
+ out:
+ done_lh(&hint->lh);
+ save_file_hint(file, hint);
+ out_no_longterm_lock:
+ mutex_unlock(&info->checkin_mutex);
+ kfree(hint);
+ put_cluster_handle(&clust);
+ assert("edward-195",
+ ergo((to_write == count),
+ (result < 0 || cont->state == DISPATCH_ASSIGNED_NEW)));
+ return (to_write - count) ? (to_write - count) : result;
+}
+
+/**
+ * plugin->write()
+ * @file: file to write to
+ * @buf: address of user-space buffer
+ * @count: number of bytes to write
+ * @off: position in file to write to
+ */
+ssize_t write_cryptcompress(struct file *file, const char __user *buf,
+ size_t count, loff_t *off,
+ struct dispatch_context *cont)
+{
+ ssize_t result;
+ struct inode *inode;
+ reiser4_context *ctx;
+ loff_t pos = *off;
+ struct cryptcompress_info *info;
+
+ assert("edward-1449", cont->state == DISPATCH_INVAL_STATE);
+
+ inode = file_inode(file);
+ assert("edward-196", cryptcompress_inode_ok(inode));
+
+ info = cryptcompress_inode_data(inode);
+ ctx = get_current_context();
+
+ result = file_remove_privs(file);
+ if (unlikely(result != 0)) {
+ context_set_commit_async(ctx);
+ return result;
+ }
+ /* remove_suid might create a transaction */
+ reiser4_txn_restart(ctx);
+
+ result = do_write_cryptcompress(file, inode, buf, count, pos, cont);
+
+ if (unlikely(result < 0)) {
+ context_set_commit_async(ctx);
+ return result;
+ }
+ /* update position in a file */
+ *off = pos + result;
+ return result;
+}
+
+/* plugin->readpages */
+int readpages_cryptcompress(struct file *file, struct address_space *mapping,
+ struct list_head *pages, unsigned nr_pages)
+{
+ reiser4_context * ctx;
+ int ret;
+
+ ctx = reiser4_init_context(mapping->host->i_sb);
+ if (IS_ERR(ctx)) {
+ ret = PTR_ERR(ctx);
+ goto err;
+ }
+ /* cryptcompress file can be built of ctail items only */
+ ret = readpages_ctail(file, mapping, pages);
+ reiser4_txn_restart(ctx);
+ reiser4_exit_context(ctx);
+ if (ret) {
+err:
+ put_pages_list(pages);
+ }
+ return ret;
+}
+
+static reiser4_block_nr cryptcompress_estimate_read(struct inode *inode)
+{
+ /* reserve one block to update stat data item */
+ assert("edward-1193",
+ inode_file_plugin(inode)->estimate.update ==
+ estimate_update_common);
+ return estimate_update_common(inode);
+}
+
+/**
+ * plugin->read
+ * @file: file to read from
+ * @buf: address of user-space buffer
+ * @size: number of bytes to read
+ * @off: position in file to read from
+ */
+ssize_t read_cryptcompress(struct file * file, char __user *buf, size_t size,
+ loff_t * off)
+{
+ ssize_t result;
+ struct inode *inode;
+ reiser4_context *ctx;
+ struct cryptcompress_info *info;
+ reiser4_block_nr needed;
+
+ inode = file_inode(file);
+ assert("edward-1194", !reiser4_inode_get_flag(inode, REISER4_NO_SD));
+
+ ctx = reiser4_init_context(inode->i_sb);
+ if (IS_ERR(ctx))
+ return PTR_ERR(ctx);
+
+ info = cryptcompress_inode_data(inode);
+ needed = cryptcompress_estimate_read(inode);
+
+ result = reiser4_grab_space(needed, BA_CAN_COMMIT, get_meta_subvol());
+ if (result != 0) {
+ reiser4_exit_context(ctx);
+ return result;
+ }
+ result = new_sync_read(file, buf, size, off);
+
+ context_set_commit_async(ctx);
+ reiser4_exit_context(ctx);
+
+ return result;
+}
+
+/* Set the left coord when a unit was not found by node_lookup().
+   This takes into account that there can be holes in a sequence
+   of disk clusters */
+
+static void adjust_left_coord(coord_t * left_coord)
+{
+ switch (left_coord->between) {
+ case AFTER_UNIT:
+ left_coord->between = AFTER_ITEM;
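+		/* fall through */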
+ case AFTER_ITEM:
+ case BEFORE_UNIT:
+ break;
+ default:
+ impossible("edward-1204", "bad left coord to cut");
+ }
+ return;
+}
+
+#define CRC_CUT_TREE_MIN_ITERATIONS 64
+
+/* plugin->cut_tree_worker */
+int cut_tree_worker_cryptcompress(tap_t * tap, const reiser4_key * from_key,
+ const reiser4_key * to_key,
+ reiser4_key * smallest_removed,
+ struct inode *object, int truncate,
+ int *progress)
+{
+ lock_handle next_node_lock;
+ coord_t left_coord;
+ int result;
+
+ assert("edward-1158", tap->coord->node != NULL);
+ assert("edward-1159", znode_is_write_locked(tap->coord->node));
+ assert("edward-1160", znode_get_level(tap->coord->node) == LEAF_LEVEL);
+
+ *progress = 0;
+ init_lh(&next_node_lock);
+
+ while (1) {
+ znode *node; /* node from which items are cut */
+ node_plugin *nplug; /* node plugin for @node */
+
+ node = tap->coord->node;
+
+ /* Move next_node_lock to the next node on the left. */
+ result =
+ reiser4_get_left_neighbor(&next_node_lock, node,
+ ZNODE_WRITE_LOCK,
+ GN_CAN_USE_UPPER_LEVELS);
+ if (result != 0 && result != -E_NO_NEIGHBOR)
+ break;
+		/* FIXME-EDWARD: Check whether we can delete the node as a whole. */
+ result = reiser4_tap_load(tap);
+ if (result)
+ return result;
+
+ /* Prepare the second (right) point for cut_node() */
+ if (*progress)
+ coord_init_last_unit(tap->coord, node);
+
+ else if (item_plugin_by_coord(tap->coord)->b.lookup == NULL)
+ /* set rightmost unit for the items without lookup method */
+ tap->coord->unit_pos = coord_last_unit_pos(tap->coord);
+
+ nplug = node->nplug;
+
+ assert("edward-1161", nplug);
+ assert("edward-1162", nplug->lookup);
+
+ /* left_coord is leftmost unit cut from @node */
+ result = nplug->lookup(node, from_key, FIND_EXACT, &left_coord);
+
+ if (IS_CBKERR(result))
+ break;
+
+ if (result == CBK_COORD_NOTFOUND)
+ adjust_left_coord(&left_coord);
+
+ /* adjust coordinates so that they are set to existing units */
+ if (coord_set_to_right(&left_coord)
+ || coord_set_to_left(tap->coord)) {
+ result = 0;
+ break;
+ }
+
+ if (coord_compare(&left_coord, tap->coord) ==
+ COORD_CMP_ON_RIGHT) {
+ /* keys from @from_key to @to_key are not in the tree */
+ result = 0;
+ break;
+ }
+
+ /* cut data from one node */
+ *smallest_removed = *reiser4_min_key();
+ result = kill_node_content(&left_coord,
+ tap->coord,
+ from_key,
+ to_key,
+ smallest_removed,
+ next_node_lock.node,
+ object, truncate);
+ reiser4_tap_relse(tap);
+
+ if (result)
+ break;
+
+ ++(*progress);
+
+ /* Check whether all items with keys >= from_key were removed
+ * from the tree. */
+ if (keyle(smallest_removed, from_key))
+ /* result = 0; */
+ break;
+
+ if (next_node_lock.node == NULL)
+ break;
+
+ result = reiser4_tap_move(tap, &next_node_lock);
+ done_lh(&next_node_lock);
+ if (result)
+ break;
+
+ /* Break long cut_tree operation (deletion of a large file) if
+ * atom requires commit. */
+ if (*progress > CRC_CUT_TREE_MIN_ITERATIONS
+ && current_atom_should_commit()) {
+ result = -E_REPEAT;
+ break;
+ }
+ }
+ done_lh(&next_node_lock);
+ return result;
+}
+
+static int expand_cryptcompress(struct inode *inode /* old size */,
+ loff_t new_size)
+{
+ int result = 0;
+ hint_t *hint;
+ lock_handle *lh;
+ loff_t hole_size;
+ int nr_zeroes;
+ struct reiser4_slide win;
+ struct cluster_handle clust;
+
+ assert("edward-1133", inode->i_size < new_size);
+ assert("edward-1134", reiser4_schedulable());
+ assert("edward-1135", cryptcompress_inode_ok(inode));
+ assert("edward-1136", current_blocksize == PAGE_SIZE);
+
+ hint = kmalloc(sizeof(*hint), reiser4_ctx_gfp_mask_get());
+ if (hint == NULL)
+ return RETERR(-ENOMEM);
+ hint_init_zero(hint);
+ lh = &hint->lh;
+
+ reiser4_slide_init(&win);
+ cluster_init_read(&clust, &win);
+ clust.hint = hint;
+
+ if (off_to_cloff(inode->i_size, inode) == 0)
+ goto append_hole;
+ /*
+ * It can happen that
+ * a part of the hole will be converted
+ * to zeros. If so, it should be submitted
+ */
+ result = alloc_cluster_pgset(&clust, cluster_nrpages(inode));
+ if (result)
+ goto out;
+ hole_size = new_size - inode->i_size;
+ nr_zeroes = inode_cluster_size(inode) -
+ off_to_cloff(inode->i_size, inode);
+ if (nr_zeroes > hole_size)
+ nr_zeroes = hole_size;
+
+ set_window(&clust, &win, inode, inode->i_size,
+ inode->i_size + nr_zeroes);
+ win.stat = HOLE_WINDOW;
+
+ assert("edward-1137",
+ clust.index == off_to_clust(inode->i_size, inode));
+
+ result = prepare_logical_cluster(inode, 0, 0, &clust, LC_EXPAND);
+ if (result)
+ goto out;
+ assert("edward-1139",
+ clust.dstat == PREP_DISK_CLUSTER ||
+ clust.dstat == UNPR_DISK_CLUSTER ||
+ clust.dstat == FAKE_DISK_CLUSTER);
+
+ assert("edward-1431", hole_size >= nr_zeroes);
+
+ append_hole:
+ INODE_SET_SIZE(inode, new_size);
+ out:
+ done_lh(lh);
+ kfree(hint);
+ put_cluster_handle(&clust);
+ return result;
+}
+
+static int update_size_actor(struct inode *inode,
+ loff_t new_size, int update_sd)
+{
+ if (new_size & ((loff_t) (inode_cluster_size(inode)) - 1))
+ /*
+ * cut not at logical cluster boundary,
+ * size will be updated by write_hole()
+ */
+ return 0;
+ else
+ return reiser4_update_file_size(inode, new_size, update_sd);
+}
+
+static int prune_cryptcompress(struct inode *inode,
+ loff_t new_size, int update_sd)
+{
+ int result = 0;
+ unsigned nr_zeros;
+ loff_t to_prune;
+ loff_t old_size;
+ cloff_t from_idx;
+ cloff_t to_idx;
+
+ hint_t *hint;
+ lock_handle *lh;
+ struct reiser4_slide win;
+ struct cluster_handle clust;
+
+ assert("edward-1140", inode->i_size >= new_size);
+ assert("edward-1141", reiser4_schedulable());
+ assert("edward-1142", cryptcompress_inode_ok(inode));
+ assert("edward-1143", current_blocksize == PAGE_SIZE);
+
+ old_size = inode->i_size;
+
+ hint = kmalloc(sizeof(*hint), reiser4_ctx_gfp_mask_get());
+ if (hint == NULL)
+ return RETERR(-ENOMEM);
+ hint_init_zero(hint);
+ lh = &hint->lh;
+
+ reiser4_slide_init(&win);
+ cluster_init_read(&clust, &win);
+ clust.hint = hint;
+
+ /*
+ * index of the leftmost logical cluster
+ * that will be completely truncated
+ */
+ from_idx = size_in_lc(new_size, inode);
+ to_idx = size_in_lc(inode->i_size, inode);
+ /*
+ * truncate all complete disk clusters starting from @from_idx
+ */
+ assert("edward-1174", from_idx <= to_idx);
+
+ old_size = inode->i_size;
+ if (from_idx != to_idx) {
+ struct cryptcompress_info *info;
+ info = cryptcompress_inode_data(inode);
+
+ result = cut_file_items(inode,
+ clust_to_off(from_idx, inode),
+ update_sd,
+ clust_to_off(to_idx, inode),
+ update_size_actor);
+ info->trunc_index = ULONG_MAX;
+ if (unlikely(result == CBK_COORD_NOTFOUND))
+ result = 0;
+ if (unlikely(result))
+ goto out;
+ }
+ if (off_to_cloff(new_size, inode) == 0)
+ goto truncate_hole;
+
+ assert("edward-1146", new_size < inode->i_size);
+
+ to_prune = inode->i_size - new_size;
+	/*
+	 * Partial truncate of the last logical cluster.
+	 * The partial hole will be converted to zeros. The resulting
+	 * logical cluster will be captured and submitted to disk.
+	 */
+ result = alloc_cluster_pgset(&clust, cluster_nrpages(inode));
+ if (result)
+ goto out;
+
+ nr_zeros = off_to_pgoff(new_size);
+ if (nr_zeros)
+ nr_zeros = PAGE_SIZE - nr_zeros;
+
+ set_window(&clust, &win, inode, new_size, new_size + nr_zeros);
+ win.stat = HOLE_WINDOW;
+
+ assert("edward-1149", clust.index == from_idx - 1);
+
+ result = prepare_logical_cluster(inode, 0, 0, &clust, LC_SHRINK);
+ if (result)
+ goto out;
+ assert("edward-1151",
+ clust.dstat == PREP_DISK_CLUSTER ||
+ clust.dstat == UNPR_DISK_CLUSTER ||
+ clust.dstat == FAKE_DISK_CLUSTER);
+ truncate_hole:
+	/*
+	 * drop all the pages that don't have jnodes (i.e. pages
+	 * which cannot be truncated by cut_file_items() because
+	 * of holes represented by fake disk clusters), including
+	 * the pages of the partially truncated cluster which was
+	 * released by prepare_logical_cluster()
+	 */
+ INODE_SET_SIZE(inode, new_size);
+ truncate_inode_pages(inode->i_mapping, new_size);
+ out:
+ assert("edward-1497",
+ pages_truncate_ok(inode, size_in_pages(new_size)));
+
+ done_lh(lh);
+ kfree(hint);
+ put_cluster_handle(&clust);
+ return result;
+}
+
+/**
+ * Capture a page cluster.
+ * @clust must be set up by the caller.
+ */
+static int capture_page_cluster(struct cluster_handle * clust,
+ struct inode * inode)
+{
+ int result;
+
+ assert("edward-1073", clust != NULL);
+ assert("edward-1074", inode != NULL);
+ assert("edward-1075", clust->dstat == INVAL_DISK_CLUSTER);
+
+ result = prepare_logical_cluster(inode, 0, 0, clust, LC_APPOV);
+ if (result)
+ return result;
+
+ set_cluster_pages_dirty(clust, inode);
+ result = checkin_logical_cluster(clust, inode);
+ put_hint_cluster(clust, inode, ZNODE_WRITE_LOCK);
+ if (unlikely(result))
+ put_page_cluster(clust, inode, WRITE_OP);
+ return result;
+}
+
+/* Starting from @index find tagged pages of the same page cluster.
+ * Clear the tag for each of them. Return number of found pages.
+ */
+static int find_anon_page_cluster(struct address_space * mapping,
+ pgoff_t * index, struct page ** pages)
+{
+ int i = 0;
+ int found;
+ xa_lock_irq(&mapping->i_pages);
+ do {
+ /* looking for one page */
+ found = radix_tree_gang_lookup_tag(&mapping->i_pages,
+ (void **)&pages[i],
+ *index, 1,
+ PAGECACHE_TAG_REISER4_MOVED);
+ if (!found)
+ break;
+ if (!same_page_cluster(pages[0], pages[i]))
+ break;
+
+ /* found */
+ get_page(pages[i]);
+ *index = pages[i]->index + 1;
+
+ radix_tree_tag_clear(&mapping->i_pages,
+ pages[i]->index,
+ PAGECACHE_TAG_REISER4_MOVED);
+ if (last_page_in_cluster(pages[i++]))
+ break;
+ } while (1);
+ xa_unlock_irq(&mapping->i_pages);
+ return i;
+}
+
+#define MAX_PAGES_TO_CAPTURE (1024)
+
+/* Capture anonymous page clusters */
+static int capture_anon_pages(struct address_space * mapping, pgoff_t * index,
+ int to_capture)
+{
+ int count = 0;
+ int found = 0;
+ int result = 0;
+ hint_t *hint;
+ lock_handle *lh;
+ struct inode * inode;
+ struct cluster_handle clust;
+ struct page * pages[MAX_CLUSTER_NRPAGES];
+
+ assert("edward-1127", mapping != NULL);
+ assert("edward-1128", mapping->host != NULL);
+ assert("edward-1440", mapping->host->i_mapping == mapping);
+
+ inode = mapping->host;
+ hint = kmalloc(sizeof(*hint), reiser4_ctx_gfp_mask_get());
+ if (hint == NULL)
+ return RETERR(-ENOMEM);
+ hint_init_zero(hint);
+ lh = &hint->lh;
+
+ cluster_init_read(&clust, NULL /* no sliding window */);
+ clust.hint = hint;
+
+ result = alloc_cluster_pgset(&clust, cluster_nrpages(inode));
+ if (result)
+ goto out;
+
+ while (to_capture > 0) {
+ found = find_anon_page_cluster(mapping, index, pages);
+ if (!found) {
+ *index = (pgoff_t) - 1;
+ break;
+ }
+ move_cluster_forward(&clust, inode, pages[0]->index);
+ result = capture_page_cluster(&clust, inode);
+
+ put_found_pages(pages, found); /* find_anon_page_cluster */
+ if (result)
+ break;
+ to_capture -= clust.nr_pages;
+ count += clust.nr_pages;
+ }
+ if (result) {
+ warning("edward-1077",
+ "Capture failed (inode %llu, result=%i, captured=%d)\n",
+ (unsigned long long)get_inode_oid(inode), result, count);
+ } else {
+ assert("edward-1078", ergo(found > 0, count > 0));
+ if (to_capture <= 0)
+			/* there may be more pages left */
+ __mark_inode_dirty(inode, I_DIRTY_PAGES);
+ result = count;
+ }
+ out:
+ done_lh(lh);
+ kfree(hint);
+ put_cluster_handle(&clust);
+ return result;
+}
+
+/* Returns true if inode's mapping has dirty pages
+ which do not belong to any atom */
+static int cryptcompress_inode_has_anon_pages(struct inode *inode)
+{
+ int result;
+ xa_lock_irq(&inode->i_mapping->i_pages);
+ result = radix_tree_tagged(&inode->i_mapping->i_pages,
+ PAGECACHE_TAG_REISER4_MOVED);
+ xa_unlock_irq(&inode->i_mapping->i_pages);
+ return result;
+}
+
+/* plugin->writepages */
+int writepages_cryptcompress(struct address_space *mapping,
+ struct writeback_control *wbc)
+{
+ int result = 0;
+ long to_capture;
+ pgoff_t nrpages;
+ pgoff_t index = 0;
+ struct inode *inode;
+ struct cryptcompress_info *info;
+
+ inode = mapping->host;
+ if (!cryptcompress_inode_has_anon_pages(inode))
+ goto end;
+ info = cryptcompress_inode_data(inode);
+ nrpages = size_in_pages(i_size_read(inode));
+
+ if (wbc->sync_mode != WB_SYNC_ALL)
+ to_capture = min(wbc->nr_to_write, (long)MAX_PAGES_TO_CAPTURE);
+ else
+ to_capture = MAX_PAGES_TO_CAPTURE;
+ do {
+ reiser4_context *ctx;
+
+ ctx = reiser4_init_context(inode->i_sb);
+ if (IS_ERR(ctx)) {
+ result = PTR_ERR(ctx);
+ break;
+ }
+ /* avoid recursive calls to ->sync_inodes */
+ ctx->nobalance = 1;
+
+ assert("edward-1079",
+ lock_stack_isclean(get_current_lock_stack()));
+
+ reiser4_txn_restart_current();
+
+ if (get_current_context()->entd) {
+ if (mutex_trylock(&info->checkin_mutex) == 0) {
+ /* the mutex might be occupied by
+ entd caller */
+ result = RETERR(-EBUSY);
+ reiser4_exit_context(ctx);
+ break;
+ }
+ } else
+ mutex_lock(&info->checkin_mutex);
+
+ result = capture_anon_pages(inode->i_mapping, &index,
+ to_capture);
+ mutex_unlock(&info->checkin_mutex);
+
+ if (result < 0) {
+ reiser4_exit_context(ctx);
+ break;
+ }
+ wbc->nr_to_write -= result;
+ if (wbc->sync_mode != WB_SYNC_ALL) {
+ reiser4_exit_context(ctx);
+ break;
+ }
+ result = txnmgr_force_commit_all(inode->i_sb, 0);
+ reiser4_exit_context(ctx);
+ } while (result >= 0 && index < nrpages);
+
+ end:
+ if (is_in_reiser4_context()) {
+ if (get_current_context()->nr_captured >= CAPTURE_APAGE_BURST) {
+ /* there are already pages to flush, flush them out,
+ do not delay until end of reiser4_sync_inodes */
+ reiser4_writeout(inode->i_sb, wbc);
+ get_current_context()->nr_captured = 0;
+ }
+ }
+ return result;
+}
+
+/* plugin->ioctl */
+int ioctl_cryptcompress(struct file *filp, unsigned int cmd,
+ unsigned long arg)
+{
+ return RETERR(-ENOTTY);
+}
+
+/* plugin->mmap */
+int mmap_cryptcompress(struct file *file, struct vm_area_struct *vma)
+{
+ int result;
+ struct inode *inode;
+ reiser4_context *ctx;
+
+ inode = file_inode(file);
+ ctx = reiser4_init_context(inode->i_sb);
+ if (IS_ERR(ctx))
+ return PTR_ERR(ctx);
+ /*
+ * generic_file_mmap will do update_atime. Grab space for stat data
+ * update.
+ */
+ result = reiser4_grab_space_force
+ (inode_file_plugin(inode)->estimate.update(inode),
+ BA_CAN_COMMIT, get_meta_subvol());
+ if (result) {
+ reiser4_exit_context(ctx);
+ return result;
+ }
+ result = generic_file_mmap(file, vma);
+ reiser4_exit_context(ctx);
+ return result;
+}
+
+/* plugin->delete_object */
+int delete_object_cryptcompress(struct inode *inode)
+{
+ int result;
+ struct cryptcompress_info * info;
+
+ assert("edward-429", inode->i_nlink == 0);
+
+ reiser4_txn_restart_current();
+ info = cryptcompress_inode_data(inode);
+
+ mutex_lock(&info->checkin_mutex);
+ result = prune_cryptcompress(inode, 0, 0);
+ mutex_unlock(&info->checkin_mutex);
+
+ if (result) {
+ warning("edward-430",
+ "cannot truncate cryptcompress file %lli: %i",
+ (unsigned long long)get_inode_oid(inode),
+ result);
+ }
+ /* and remove stat data */
+ return reiser4_delete_object_common(inode);
+}
+
+/*
+ * plugin->setattr
+ * This implements actual truncate (see comments in reiser4/page_cache.c)
+ */
+int setattr_cryptcompress(struct dentry *dentry, struct iattr *attr)
+{
+ int result;
+ struct inode *inode;
+ struct cryptcompress_info * info;
+
+ inode = dentry->d_inode;
+ info = cryptcompress_inode_data(inode);
+
+ if (attr->ia_valid & ATTR_SIZE) {
+ if (i_size_read(inode) != attr->ia_size) {
+ reiser4_context *ctx;
+ loff_t old_size;
+
+ ctx = reiser4_init_context(dentry->d_inode->i_sb);
+ if (IS_ERR(ctx))
+ return PTR_ERR(ctx);
+ result = setattr_dispatch_hook(inode);
+ if (result) {
+ context_set_commit_async(ctx);
+ reiser4_exit_context(ctx);
+ return result;
+ }
+ old_size = i_size_read(inode);
+ inode_check_scale(inode, old_size, attr->ia_size);
+
+ mutex_lock(&info->checkin_mutex);
+ if (attr->ia_size > inode->i_size)
+ result = expand_cryptcompress(inode,
+ attr->ia_size);
+ else
+ result = prune_cryptcompress(inode,
+ attr->ia_size,
+ 1/* update sd */);
+ mutex_unlock(&info->checkin_mutex);
+ if (result) {
+ warning("edward-1192",
+ "truncate_cryptcompress failed: oid %lli, "
+ "old size %lld, new size %lld, retval %d",
+ (unsigned long long)
+ get_inode_oid(inode), old_size,
+ attr->ia_size, result);
+ }
+ context_set_commit_async(ctx);
+ reiser4_exit_context(ctx);
+ } else
+ result = 0;
+ } else
+ result = reiser4_setattr_common(dentry, attr);
+ return result;
+}
+
+/* plugin->release */
+int release_cryptcompress(struct inode *inode, struct file *file)
+{
+ reiser4_context *ctx = reiser4_init_context(inode->i_sb);
+
+ if (IS_ERR(ctx))
+ return PTR_ERR(ctx);
+ reiser4_free_file_fsdata(file);
+ reiser4_exit_context(ctx);
+ return 0;
+}
+
+/* plugin->write_begin() */
+int write_begin_cryptcompress(struct file *file, struct page *page,
+ loff_t pos, unsigned len, void **fsdata)
+{
+ int ret = -ENOMEM;
+ char *buf;
+ hint_t *hint;
+ struct inode *inode;
+ struct reiser4_slide *win;
+ struct cluster_handle *clust;
+ struct cryptcompress_info *info;
+ reiser4_context *ctx;
+
+ ctx = get_current_context();
+ inode = page->mapping->host;
+ info = cryptcompress_inode_data(inode);
+
+ assert("edward-1564", PageLocked(page));
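+	/*
+	 * A single allocation packs the cluster handle, the sliding
+	 * window and the hint back to back; on success it is handed to
+	 * write_end_cryptcompress() via *fsdata and freed there as a
+	 * whole.
+	 */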
+ buf = kmalloc(sizeof(*clust) +
+ sizeof(*win) +
+ sizeof(*hint),
+ reiser4_ctx_gfp_mask_get());
+ if (!buf)
+ goto err2;
+ clust = (struct cluster_handle *)buf;
+ win = (struct reiser4_slide *)(buf + sizeof(*clust));
+ hint = (hint_t *)(buf + sizeof(*clust) + sizeof(*win));
+
+ hint_init_zero(hint);
+ cluster_init_read(clust, NULL);
+ clust->hint = hint;
+
+ mutex_lock(&info->checkin_mutex);
+
+ ret = set_window_and_cluster(inode, clust, win, len, pos);
+ if (ret)
+ goto err1;
+ unlock_page(page);
+ ret = prepare_logical_cluster(inode, pos, len, clust, LC_APPOV);
+ done_lh(&hint->lh);
+ assert("edward-1565", lock_stack_isclean(get_current_lock_stack()));
+ lock_page(page);
+ if (ret) {
+ SetPageError(page);
+ ClearPageUptodate(page);
+ unlock_page(page);
+ goto err0;
+ }
+ /*
+ * Success. All resources (including checkin_mutex)
+ * will be released in ->write_end()
+ */
+ ctx->locked_page = page;
+ *fsdata = (void *)buf;
+
+ return 0;
+ err0:
+ put_cluster_handle(clust);
+ err1:
+ mutex_unlock(&info->checkin_mutex);
+ kfree(buf);
+ err2:
+	assert("edward-1568", ret != 0);
+ return ret;
+}
+
+/* plugin->write_end() */
+int write_end_cryptcompress(struct file *file, struct page *page,
+ loff_t pos, unsigned copied, void *fsdata)
+{
+ int ret;
+ hint_t *hint;
+ struct inode *inode;
+ struct cluster_handle *clust;
+ struct cryptcompress_info *info;
+ reiser4_context *ctx;
+
+ assert("edward-1566",
+ lock_stack_isclean(get_current_lock_stack()));
+ ctx = get_current_context();
+ inode = page->mapping->host;
+ info = cryptcompress_inode_data(inode);
+ clust = (struct cluster_handle *)fsdata;
+ hint = clust->hint;
+
+ unlock_page(page);
+ ctx->locked_page = NULL;
+ set_cluster_pages_dirty(clust, inode);
+ ret = checkin_logical_cluster(clust, inode);
+ if (ret) {
+ SetPageError(page);
+ goto exit;
+ }
+ exit:
+ mutex_unlock(&info->checkin_mutex);
+
+ put_cluster_handle(clust);
+
+ if (pos + copied > inode->i_size) {
+ /*
+ * i_size has been updated in
+ * checkin_logical_cluster
+ */
+ ret = reiser4_update_sd(inode);
+ if (unlikely(ret != 0))
+ warning("edward-1603",
+ "Can not update stat-data: %i. FSCK?",
+ ret);
+ }
+ kfree(fsdata);
+ return ret;
+}
+
+/* plugin->bmap */
+sector_t bmap_cryptcompress(struct address_space *mapping, sector_t lblock)
+{
+ return -EINVAL;
+}
+
+/*
+ Local variables:
+ c-indentation-style: "K&R"
+ mode-name: "LC"
+ c-basic-offset: 8
+ tab-width: 8
+ fill-column: 80
+ scroll-step: 1
+ End:
+*/
diff -urN --no-dereference linux-5.10.2.orig/fs/reiser4/plugin/file/cryptcompress.h linux-5.10.2/fs/reiser4/plugin/file/cryptcompress.h
--- linux-5.10.2.orig/fs/reiser4/plugin/file/cryptcompress.h 1970-01-01 01:00:00.000000000 +0100
+++ linux-5.10.2/fs/reiser4/plugin/file/cryptcompress.h 2020-12-23 16:07:46.123813202 +0100
@@ -0,0 +1,621 @@
+/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
+/* See http://www.namesys.com/cryptcompress_design.html */
+
+#if !defined( __FS_REISER4_CRYPTCOMPRESS_H__ )
+#define __FS_REISER4_CRYPTCOMPRESS_H__
+
+#include "../../page_cache.h"
+#include "../compress/compress.h"
+#include "../crypto/cipher.h"
+
+#include <linux/pagemap.h>
+
+#define MIN_CLUSTER_SHIFT PAGE_SHIFT
+#define MAX_CLUSTER_SHIFT 16
+#define MAX_CLUSTER_NRPAGES (1U << MAX_CLUSTER_SHIFT >> PAGE_SHIFT)
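+/*
+ * For example, with MAX_CLUSTER_SHIFT == 16 and 4K pages
+ * (PAGE_SHIFT == 12) a logical cluster spans at most 64K, i.e.
+ * MAX_CLUSTER_NRPAGES == 16.
+ */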
+#define DC_CHECKSUM_SIZE 4
+
+#define MIN_LATTICE_FACTOR 1
+#define MAX_LATTICE_FACTOR 32
+
+#define REISER4_CRYPTO 0
+
+/* this mask contains all non-standard plugins that might
+   be present in the reiser4-specific part of an inode managed
+   by the cryptcompress file plugin */
+#define cryptcompress_mask \
+ ((1 << PSET_FILE) | \
+ (1 << PSET_CLUSTER) | \
+ (1 << PSET_CIPHER) | \
+ (1 << PSET_DIGEST) | \
+ (1 << PSET_COMPRESSION) | \
+ (1 << PSET_COMPRESSION_MODE))
+
+#if REISER4_DEBUG
+static inline int cluster_shift_ok(int shift)
+{
+ return (shift >= MIN_CLUSTER_SHIFT) && (shift <= MAX_CLUSTER_SHIFT);
+}
+#endif
+
+#if REISER4_DEBUG
+#define INODE_PGCOUNT(inode) \
+({ \
+ assert("edward-1530", inode_file_plugin(inode) == \
+ file_plugin_by_id(CRYPTCOMPRESS_FILE_PLUGIN_ID)); \
+ atomic_read(&cryptcompress_inode_data(inode)->pgcount); \
+ })
+#define INODE_PGCOUNT_INC(inode) \
+do { \
+ assert("edward-1531", inode_file_plugin(inode) == \
+ file_plugin_by_id(CRYPTCOMPRESS_FILE_PLUGIN_ID)); \
+ atomic_inc(&cryptcompress_inode_data(inode)->pgcount); \
+} while (0)
+#define INODE_PGCOUNT_DEC(inode) \
+do { \
+ if (inode_file_plugin(inode) == \
+ file_plugin_by_id(CRYPTCOMPRESS_FILE_PLUGIN_ID)) \
+ atomic_dec(&cryptcompress_inode_data(inode)->pgcount); \
+} while (0)
+#else
+#define INODE_PGCOUNT(inode) (0)
+#define INODE_PGCOUNT_INC(inode)
+#define INODE_PGCOUNT_DEC(inode)
+#endif /* REISER4_DEBUG */
+
+struct tfm_stream {
+ __u8 *data;
+ size_t size;
+};
+
+typedef enum {
+ INPUT_STREAM,
+ OUTPUT_STREAM,
+ LAST_STREAM
+} tfm_stream_id;
+
+typedef struct tfm_stream * tfm_unit[LAST_STREAM];
+
+static inline __u8 *ts_data(struct tfm_stream * stm)
+{
+ assert("edward-928", stm != NULL);
+ return stm->data;
+}
+
+static inline size_t ts_size(struct tfm_stream * stm)
+{
+ assert("edward-929", stm != NULL);
+ return stm->size;
+}
+
+static inline void set_ts_size(struct tfm_stream * stm, size_t size)
+{
+ assert("edward-930", stm != NULL);
+
+ stm->size = size;
+}
+
+static inline int alloc_ts(struct tfm_stream ** stm)
+{
+ assert("edward-931", stm);
+ assert("edward-932", *stm == NULL);
+
+ *stm = kzalloc(sizeof(**stm), reiser4_ctx_gfp_mask_get());
+ if (!*stm)
+ return -ENOMEM;
+ return 0;
+}
+
+static inline void free_ts(struct tfm_stream * stm)
+{
+ assert("edward-933", !ts_data(stm));
+ assert("edward-934", !ts_size(stm));
+
+ kfree(stm);
+}
+
+static inline int alloc_ts_data(struct tfm_stream * stm, size_t size)
+{
+ assert("edward-935", !ts_data(stm));
+ assert("edward-936", !ts_size(stm));
+ assert("edward-937", size != 0);
+
+ stm->data = reiser4_vmalloc(size);
+ if (!stm->data)
+ return -ENOMEM;
+ set_ts_size(stm, size);
+ return 0;
+}
+
+static inline void free_ts_data(struct tfm_stream * stm)
+{
+ assert("edward-938", equi(ts_data(stm), ts_size(stm)));
+
+ if (ts_data(stm))
+ vfree(ts_data(stm));
+ memset(stm, 0, sizeof *stm);
+}
+
+/* Write modes for item conversion in flush convert phase */
+typedef enum {
+ CTAIL_INVAL_CONVERT_MODE = 0,
+ CTAIL_APPEND_ITEM = 1,
+ CTAIL_OVERWRITE_ITEM = 2,
+ CTAIL_CUT_ITEM = 3
+} ctail_convert_mode_t;
+
+typedef enum {
+ LC_INVAL = 0, /* invalid value */
+ LC_APPOV = 1, /* append and/or overwrite */
+ LC_EXPAND = 2, /* expanding truncate */
+ LC_SHRINK = 3 /* shrinking truncate */
+} logical_cluster_op;
+
+/* Transform cluster.
+ * Intermediate state between page cluster and disk cluster
+ * Is used for data transform (compression/encryption)
+ */
+struct tfm_cluster {
+ coa_set coa; /* compression algorithms info */
+ tfm_unit tun; /* plain and transformed streams */
+ tfm_action act;
+ int uptodate;
+ int lsize; /* number of bytes in logical cluster */
+ int len; /* length of the transform stream */
+ unsigned int hole:1; /* should punch hole */
+};
+
+static inline coa_t get_coa(struct tfm_cluster * tc, reiser4_compression_id id,
+ tfm_action act)
+{
+ return tc->coa[id][act];
+}
+
+static inline void set_coa(struct tfm_cluster * tc, reiser4_compression_id id,
+ tfm_action act, coa_t coa)
+{
+ tc->coa[id][act] = coa;
+}
+
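+/* Allocate a workspace for the compression transform via the plugin's
+ * ->alloc() method and install it in the transform cluster */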
+static inline int alloc_coa(struct tfm_cluster * tc, compression_plugin * cplug)
+{
+ coa_t coa;
+
+ coa = cplug->alloc(tc->act);
+ if (IS_ERR(coa))
+ return PTR_ERR(coa);
+ set_coa(tc, cplug->h.id, tc->act, coa);
+ return 0;
+}
+
+static inline int
+grab_coa(struct tfm_cluster * tc, compression_plugin * cplug)
+{
+ return (cplug->alloc && !get_coa(tc, cplug->h.id, tc->act) ?
+ alloc_coa(tc, cplug) : 0);
+}
+
+static inline void free_coa_set(struct tfm_cluster * tc)
+{
+ tfm_action j;
+ reiser4_compression_id i;
+ compression_plugin *cplug;
+
+ assert("edward-810", tc != NULL);
+
+ for (j = 0; j < TFMA_LAST; j++)
+ for (i = 0; i < LAST_COMPRESSION_ID; i++) {
+ if (!get_coa(tc, i, j))
+ continue;
+ cplug = compression_plugin_by_id(i);
+ assert("edward-812", cplug->free != NULL);
+ cplug->free(get_coa(tc, i, j), j);
+ set_coa(tc, i, j, 0);
+ }
+ return;
+}
+
+static inline struct tfm_stream * get_tfm_stream(struct tfm_cluster * tc,
+ tfm_stream_id id)
+{
+ return tc->tun[id];
+}
+
+static inline void set_tfm_stream(struct tfm_cluster * tc,
+ tfm_stream_id id, struct tfm_stream * ts)
+{
+ tc->tun[id] = ts;
+}
+
+static inline __u8 *tfm_stream_data(struct tfm_cluster * tc, tfm_stream_id id)
+{
+ return ts_data(get_tfm_stream(tc, id));
+}
+
+static inline void set_tfm_stream_data(struct tfm_cluster * tc,
+ tfm_stream_id id, __u8 * data)
+{
+ get_tfm_stream(tc, id)->data = data;
+}
+
+static inline size_t tfm_stream_size(struct tfm_cluster * tc, tfm_stream_id id)
+{
+ return ts_size(get_tfm_stream(tc, id));
+}
+
+static inline void
+set_tfm_stream_size(struct tfm_cluster * tc, tfm_stream_id id, size_t size)
+{
+ get_tfm_stream(tc, id)->size = size;
+}
+
+static inline int
+alloc_tfm_stream(struct tfm_cluster * tc, size_t size, tfm_stream_id id)
+{
+ assert("edward-939", tc != NULL);
+ assert("edward-940", !get_tfm_stream(tc, id));
+
+ tc->tun[id] = kzalloc(sizeof(struct tfm_stream),
+ reiser4_ctx_gfp_mask_get());
+ if (!tc->tun[id])
+ return -ENOMEM;
+ return alloc_ts_data(get_tfm_stream(tc, id), size);
+}
+
+static inline int
+realloc_tfm_stream(struct tfm_cluster * tc, size_t size, tfm_stream_id id)
+{
+ assert("edward-941", tfm_stream_size(tc, id) < size);
+ free_ts_data(get_tfm_stream(tc, id));
+ return alloc_ts_data(get_tfm_stream(tc, id), size);
+}
+
+static inline void free_tfm_stream(struct tfm_cluster * tc, tfm_stream_id id)
+{
+ free_ts_data(get_tfm_stream(tc, id));
+ free_ts(get_tfm_stream(tc, id));
+ set_tfm_stream(tc, id, 0);
+}
+
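+/* Worst-case expansion (in bytes) that the compression transform may add to
+ * an input of @ilen bytes; 0 if the plugin defines no overrun method */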
+static inline unsigned coa_overrun(compression_plugin * cplug, int ilen)
+{
+ return (cplug->overrun != NULL ? cplug->overrun(ilen) : 0);
+}
+
+static inline void free_tfm_unit(struct tfm_cluster * tc)
+{
+ tfm_stream_id id;
+ for (id = 0; id < LAST_STREAM; id++) {
+ if (!get_tfm_stream(tc, id))
+ continue;
+ free_tfm_stream(tc, id);
+ }
+}
+
+static inline void put_tfm_cluster(struct tfm_cluster * tc)
+{
+ assert("edward-942", tc != NULL);
+ free_coa_set(tc);
+ free_tfm_unit(tc);
+}
+
+static inline int tfm_cluster_is_uptodate(struct tfm_cluster * tc)
+{
+ assert("edward-943", tc != NULL);
+ assert("edward-944", tc->uptodate == 0 || tc->uptodate == 1);
+ return (tc->uptodate == 1);
+}
+
+static inline void tfm_cluster_set_uptodate(struct tfm_cluster * tc)
+{
+ assert("edward-945", tc != NULL);
+ assert("edward-946", tc->uptodate == 0 || tc->uptodate == 1);
+ tc->uptodate = 1;
+ return;
+}
+
+static inline void tfm_cluster_clr_uptodate(struct tfm_cluster * tc)
+{
+ assert("edward-947", tc != NULL);
+ assert("edward-948", tc->uptodate == 0 || tc->uptodate == 1);
+ tc->uptodate = 0;
+ return;
+}
+
+static inline int tfm_stream_is_set(struct tfm_cluster * tc, tfm_stream_id id)
+{
+ return (get_tfm_stream(tc, id) &&
+ tfm_stream_data(tc, id) && tfm_stream_size(tc, id));
+}
+
+static inline int tfm_cluster_is_set(struct tfm_cluster * tc)
+{
+ int i;
+ for (i = 0; i < LAST_STREAM; i++)
+ if (!tfm_stream_is_set(tc, i))
+ return 0;
+ return 1;
+}
+
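+/* Swap the input and output streams of a transform cluster */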
+static inline void alternate_streams(struct tfm_cluster * tc)
+{
+ struct tfm_stream *tmp = get_tfm_stream(tc, INPUT_STREAM);
+
+ set_tfm_stream(tc, INPUT_STREAM, get_tfm_stream(tc, OUTPUT_STREAM));
+ set_tfm_stream(tc, OUTPUT_STREAM, tmp);
+}
+
+/* Set of states to indicate a kind of data
+ * that will be written to the window */
+typedef enum {
+ DATA_WINDOW, /* user's data */
+ HOLE_WINDOW /* zeroes (such kind of data can be written
+ * if we start to write from offset > i_size) */
+} window_stat;
+
+/* Window (of logical cluster size) discretely sliding along a file.
+ * Is used to locate hole region in a logical cluster to be properly
+ * represented on disk.
+ * We split a write to cryptcompress file into writes to its logical
+ * clusters. Before writing to a logical cluster we set a window, i.e.
+ * calculate values of the following fields:
+ */
+struct reiser4_slide {
+ unsigned off; /* offset to write from */
+ unsigned count; /* number of bytes to write */
+ unsigned delta; /* number of bytes to append to the hole */
+ window_stat stat; /* what kind of data will be written starting
+ from @off */
+};
+
+/* Possible states of a disk cluster */
+typedef enum {
+ INVAL_DISK_CLUSTER, /* unknown state */
+ PREP_DISK_CLUSTER, /* disk cluster got converted by flush
+ * at least 1 time */
+ UNPR_DISK_CLUSTER, /* disk cluster just created and should be
+ * converted by flush */
+	FAKE_DISK_CLUSTER,	/* disk cluster exists neither in memory
+ * nor on disk */
+ TRNC_DISK_CLUSTER /* disk cluster is partially truncated */
+} disk_cluster_stat;
+
+/* The following structure represents various stages of the same logical
+ * cluster of index @index:
+ * . fixed slide
+ * . page cluster (stage in primary cache)
+ * . transform cluster (transition stage)
+ * . disk cluster (stage in secondary cache)
+ * This structure is used in transition and synchronizing operations, e.g.
+ * transform cluster is a transition state when synchronizing page cluster
+ * and disk cluster.
+ * FIXME: Encapsulate page cluster, disk cluster.
+ */
+struct cluster_handle {
+ cloff_t index; /* offset in a file (unit is a cluster size) */
+ int index_valid; /* for validating the index above, if needed */
+ struct file *file; /* host file */
+
+ /* logical cluster */
+ struct reiser4_slide *win; /* sliding window to locate holes */
+ logical_cluster_op op; /* logical cluster operation (truncate or
+ append/overwrite) */
+ /* transform cluster */
+ struct tfm_cluster tc; /* contains all needed info to synchronize
+				   page cluster and disk cluster */
+ /* page cluster */
+ int nr_pages; /* number of pages of current checkin action */
+ int old_nrpages; /* number of pages of last checkin action */
+ struct page **pages; /* attached pages */
+ jnode * node; /* jnode for capture */
+
+ /* disk cluster */
+ hint_t *hint; /* current position in the tree */
+ disk_cluster_stat dstat; /* state of the current disk cluster */
+ int reserved; /* is space for disk cluster reserved */
+#if REISER4_DEBUG
+ reiser4_context *ctx;
+ int reserved_prepped;
+ int reserved_unprepped;
+#endif
+
+};
+
+static inline __u8 * tfm_input_data (struct cluster_handle * clust)
+{
+ return tfm_stream_data(&clust->tc, INPUT_STREAM);
+}
+
+static inline __u8 * tfm_output_data (struct cluster_handle * clust)
+{
+ return tfm_stream_data(&clust->tc, OUTPUT_STREAM);
+}
+
+static inline int reset_cluster_pgset(struct cluster_handle * clust,
+ int nrpages)
+{
+ assert("edward-1057", clust->pages != NULL);
+ memset(clust->pages, 0, sizeof(*clust->pages) * nrpages);
+ return 0;
+}
+
+static inline int alloc_cluster_pgset(struct cluster_handle * clust,
+ int nrpages)
+{
+ assert("edward-949", clust != NULL);
+ assert("edward-1362", clust->pages == NULL);
+ assert("edward-950", nrpages != 0 && nrpages <= MAX_CLUSTER_NRPAGES);
+
+ clust->pages = kzalloc(sizeof(*clust->pages) * nrpages,
+ reiser4_ctx_gfp_mask_get());
+ if (!clust->pages)
+ return RETERR(-ENOMEM);
+ return 0;
+}
+
+static inline void move_cluster_pgset(struct cluster_handle *clust,
+ struct page ***pages, int * nr_pages)
+{
+ assert("edward-1545", clust != NULL && clust->pages != NULL);
+ assert("edward-1546", pages != NULL && *pages == NULL);
+ *pages = clust->pages;
+ *nr_pages = clust->nr_pages;
+ clust->pages = NULL;
+}
+
+static inline void free_cluster_pgset(struct cluster_handle * clust)
+{
+ assert("edward-951", clust->pages != NULL);
+ kfree(clust->pages);
+ clust->pages = NULL;
+}
+
+static inline void put_cluster_handle(struct cluster_handle * clust)
+{
+ assert("edward-435", clust != NULL);
+
+ put_tfm_cluster(&clust->tc);
+ if (clust->pages)
+ free_cluster_pgset(clust);
+ memset(clust, 0, sizeof *clust);
+}
+
+static inline void inc_keyload_count(struct reiser4_crypto_info * data)
+{
+ assert("edward-1410", data != NULL);
+ data->keyload_count++;
+}
+
+static inline void dec_keyload_count(struct reiser4_crypto_info * data)
+{
+ assert("edward-1411", data != NULL);
+ assert("edward-1412", data->keyload_count > 0);
+ data->keyload_count--;
+}
+
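+/* Capture the jnode of a disk cluster into the current atom,
+ * requesting a write lock */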
+static inline int capture_cluster_jnode(jnode * node)
+{
+ return reiser4_try_capture(node, ZNODE_WRITE_LOCK, 0);
+}
+
+/* cryptcompress specific part of reiser4_inode */
+struct cryptcompress_info {
+ struct mutex checkin_mutex; /* This is to serialize
+ * checkin_logical_cluster operations */
+ cloff_t trunc_index; /* Index of the leftmost truncated disk
+ * cluster (to resolve races with read) */
+ struct reiser4_crypto_info *crypt;
+ /*
+ * the following 2 fields are controlled by compression mode plugin
+ */
+ int compress_toggle; /* Current status of compressibility */
+ int lattice_factor; /* Factor of dynamic lattice. FIXME: Have
+ * a compression_toggle to keep the factor
+ */
+#if REISER4_DEBUG
+ atomic_t pgcount; /* number of grabbed pages */
+#endif
+};
+
+static inline void set_compression_toggle (struct cryptcompress_info * info, int val)
+{
+ info->compress_toggle = val;
+}
+
+static inline int get_compression_toggle (struct cryptcompress_info * info)
+{
+ return info->compress_toggle;
+}
+
+static inline int compression_is_on(struct cryptcompress_info * info)
+{
+ return get_compression_toggle(info) == 1;
+}
+
+static inline void turn_on_compression(struct cryptcompress_info * info)
+{
+ set_compression_toggle(info, 1);
+}
+
+static inline void turn_off_compression(struct cryptcompress_info * info)
+{
+ set_compression_toggle(info, 0);
+}
+
+static inline void set_lattice_factor(struct cryptcompress_info * info, int val)
+{
+ info->lattice_factor = val;
+}
+
+static inline int get_lattice_factor(struct cryptcompress_info * info)
+{
+ return info->lattice_factor;
+}
+
+struct cryptcompress_info *cryptcompress_inode_data(const struct inode *);
+int equal_to_rdk(znode *, const reiser4_key *);
+int goto_right_neighbor(coord_t *, lock_handle *);
+int cryptcompress_inode_ok(struct inode *inode);
+int coord_is_unprepped_ctail(const coord_t * coord);
+extern int do_readpage_ctail(struct inode *, struct cluster_handle *,
+ struct page * page, znode_lock_mode mode);
+extern int ctail_insert_unprepped_cluster(struct cluster_handle * clust,
+ struct inode * inode);
+extern int readpages_cryptcompress(struct file*, struct address_space*,
+ struct list_head*, unsigned);
+int bind_cryptcompress(struct inode *child, struct inode *parent);
+void destroy_inode_cryptcompress(struct inode * inode);
+int grab_page_cluster(struct inode *inode, struct cluster_handle * clust,
+ rw_op rw);
+int write_dispatch_hook(struct file *file, struct inode * inode,
+ loff_t pos, struct cluster_handle * clust,
+ struct dispatch_context * cont);
+int setattr_dispatch_hook(struct inode * inode);
+struct reiser4_crypto_info * inode_crypto_info(struct inode * inode);
+void inherit_crypto_info_common(struct inode * parent, struct inode * object,
+ int (*can_inherit)(struct inode * child,
+ struct inode * parent));
+void reiser4_attach_crypto_info(struct inode * inode,
+ struct reiser4_crypto_info * info);
+void change_crypto_info(struct inode * inode, struct reiser4_crypto_info * new);
+struct reiser4_crypto_info * reiser4_alloc_crypto_info (struct inode * inode);
+
+static inline struct crypto_blkcipher * info_get_cipher(struct reiser4_crypto_info * info)
+{
+ return info->cipher;
+}
+
+static inline void info_set_cipher(struct reiser4_crypto_info * info,
+ struct crypto_blkcipher * tfm)
+{
+ info->cipher = tfm;
+}
+
+static inline struct crypto_hash * info_get_digest(struct reiser4_crypto_info * info)
+{
+ return info->digest;
+}
+
+static inline void info_set_digest(struct reiser4_crypto_info * info,
+ struct crypto_hash * tfm)
+{
+ info->digest = tfm;
+}
+
+static inline void put_cluster_page(struct page * page)
+{
+ put_page(page);
+}
+
+#endif /* __FS_REISER4_CRYPTCOMPRESS_H__ */
+
+/* Make Linus happy.
+ Local variables:
+ c-indentation-style: "K&R"
+ mode-name: "LC"
+ c-basic-offset: 8
+ tab-width: 8
+ fill-column: 120
+ scroll-step: 1
+ End:
+*/
diff -urN --no-dereference linux-5.10.2.orig/fs/reiser4/plugin/file/file.c linux-5.10.2/fs/reiser4/plugin/file/file.c
--- linux-5.10.2.orig/fs/reiser4/plugin/file/file.c 1970-01-01 01:00:00.000000000 +0100
+++ linux-5.10.2/fs/reiser4/plugin/file/file.c 2020-12-23 16:07:46.124813217 +0100
@@ -0,0 +1,2916 @@
+/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by
+ * reiser4/README */
+
+/*
+ * this file contains implementations of inode/file/address_space/file plugin
+ * operations specific for "unix file plugin" (plugin id is
+ * UNIX_FILE_PLUGIN_ID). "Unix file" is either built of tail items only
+ * (FORMATTING_ID) or of extent items only (EXTENT40_POINTER_ID) or empty (have
+ * no items but stat data)
+ */
+
+#include "../../inode.h"
+#include "../../super.h"
+#include "../../tree_walk.h"
+#include "../../carry.h"
+#include "../../page_cache.h"
+#include "../object.h"
+#include "../cluster.h"
+#include "../../safe_link.h"
+
+#include <linux/writeback.h>
+#include <linux/pagevec.h>
+#include <linux/syscalls.h>
+
+
+static int unpack(struct file *file, struct inode *inode, int forever);
+static void drop_access(struct unix_file_info *);
+static int hint_validate(hint_t *hint, reiser4_tree *tree,
+ const reiser4_key * key, int check_key,
+ znode_lock_mode lock_mode);
+
+/* Get exclusive access and make sure that file is not partially
+ * converted (It may happen that another process is doing tail
+ * conversion. If so, wait until it completes)
+ */
+static inline void get_exclusive_access_careful(struct unix_file_info * uf_info,
+ struct inode *inode)
+{
+ do {
+ get_exclusive_access(uf_info);
+ if (!reiser4_inode_get_flag(inode, REISER4_PART_IN_CONV))
+ break;
+ drop_exclusive_access(uf_info);
+ schedule();
+ } while (1);
+}
+
+/* get unix file plugin specific portion of inode */
+struct unix_file_info *unix_file_inode_data(const struct inode *inode)
+{
+ return &reiser4_inode_data(inode)->file_plugin_data.unix_file_info;
+}
+
+/**
+ * equal_to_rdk - compare key and znode's right delimiting key
+ * @node: node whose right delimiting key to compare with @key
+ * @key: key to compare with @node's right delimiting key
+ *
+ * Returns true if @key is equal to right delimiting key of @node.
+ */
+int equal_to_rdk(znode *node, const reiser4_key *key)
+{
+ int result;
+
+ read_lock_dk(znode_get_tree(node));
+ result = keyeq(key, znode_get_rd_key(node));
+ read_unlock_dk(znode_get_tree(node));
+ return result;
+}
+
+#if REISER4_DEBUG
+
+/**
+ * equal_to_ldk - compare key and znode's left delimiting key
+ * @node: node whose left delimiting key to compare with @key
+ * @key: key to compare with @node's left delimiting key
+ *
+ * Returns true if @key is equal to left delimiting key of @node.
+ */
+int equal_to_ldk(znode *node, const reiser4_key *key)
+{
+ int result;
+
+ read_lock_dk(znode_get_tree(node));
+ result = keyeq(key, znode_get_ld_key(node));
+ read_unlock_dk(znode_get_tree(node));
+ return result;
+}
+
+/**
+ * check_coord - check whether coord corresponds to key
+ * @coord: coord to check
+ * @key: key @coord has to correspond to
+ *
+ * Returns true if @coord is set as if it was set as result of lookup with @key
+ * in coord->node.
+ */
+static int check_coord(const coord_t *coord, const reiser4_key *key)
+{
+ coord_t twin;
+
+ node_plugin_by_node(coord->node)->lookup(coord->node, key,
+ FIND_MAX_NOT_MORE_THAN, &twin);
+ return coords_equal(coord, &twin);
+}
+
+#endif /* REISER4_DEBUG */
+
+/**
+ * init_uf_coord - initialize extended coord
+ * @uf_coord: extended coord to initialize
+ * @lh: lock handle to attach to @uf_coord
+ *
+ * Zeroes the coord, clears its extension and attaches the lock handle.
+ */
+void init_uf_coord(uf_coord_t *uf_coord, lock_handle *lh)
+{
+ coord_init_zero(&uf_coord->coord);
+ coord_clear_iplug(&uf_coord->coord);
+ uf_coord->lh = lh;
+ init_lh(lh);
+ memset(&uf_coord->extension, 0, sizeof(uf_coord->extension));
+ uf_coord->valid = 0;
+}
+
+void validate_extended_coord(uf_coord_t *uf_coord, loff_t offset)
+{
+ assert("vs-1333", uf_coord->valid == 0);
+
+ if (coord_is_between_items(&uf_coord->coord))
+ return;
+
+ assert("vs-1348",
+ item_plugin_by_coord(&uf_coord->coord)->s.file.
+ init_coord_extension);
+
+ item_body_by_coord(&uf_coord->coord);
+ item_plugin_by_coord(&uf_coord->coord)->s.file.
+ init_coord_extension(uf_coord, offset);
+}
+
+/**
+ * goto_right_neighbor - lock right neighbor, drop current node lock
+ * @coord: coord to move to the first unit of the right neighbor
+ * @lh: lock handle to switch to the lock of the right neighbor
+ *
+ * Obtain lock on right neighbor and drop lock on current node.
+ */
+int goto_right_neighbor(coord_t *coord, lock_handle *lh)
+{
+ int result;
+ lock_handle lh_right;
+
+ assert("vs-1100", znode_is_locked(coord->node));
+
+ init_lh(&lh_right);
+ result = reiser4_get_right_neighbor(&lh_right, coord->node,
+ znode_is_wlocked(coord->node) ?
+ ZNODE_WRITE_LOCK : ZNODE_READ_LOCK,
+ GN_CAN_USE_UPPER_LEVELS);
+ if (result) {
+ done_lh(&lh_right);
+ return result;
+ }
+
+ /*
+	 * we hold two longterm locks on neighboring nodes. Unlock the left
+	 * one
+ */
+ done_lh(lh);
+
+ coord_init_first_unit_nocheck(coord, lh_right.node);
+ move_lh(lh, &lh_right);
+
+ return 0;
+
+}
+
+/**
+ * set_file_state
+ * @uf_info: unix-file specific portion of the inode
+ * @cbk_result: result of the tree lookup
+ * @level: tree level at which the lookup stopped
+ *
+ * This is used by find_file_item and find_file_state to determine
+ * the real state of the file
+ */
+static void set_file_state(struct unix_file_info *uf_info, int cbk_result,
+ tree_level level)
+{
+ if (cbk_errored(cbk_result))
+ /* error happened in find_file_item */
+ return;
+
+ assert("vs-1164", level == LEAF_LEVEL || level == TWIG_LEVEL);
+
+ if (uf_info->container == UF_CONTAINER_UNKNOWN) {
+ if (cbk_result == CBK_COORD_NOTFOUND)
+ uf_info->container = UF_CONTAINER_EMPTY;
+ else if (level == LEAF_LEVEL)
+ uf_info->container = UF_CONTAINER_TAILS;
+ else
+ uf_info->container = UF_CONTAINER_EXTENTS;
+ } else {
+ /*
+ * file state is known, check whether it is set correctly if
+ * file is not being tail converted
+ */
+ if (!reiser4_inode_get_flag(unix_file_info_to_inode(uf_info),
+ REISER4_PART_IN_CONV)) {
+ assert("vs-1162",
+ ergo(level == LEAF_LEVEL &&
+ cbk_result == CBK_COORD_FOUND,
+ uf_info->container == UF_CONTAINER_TAILS));
+ assert("vs-1165",
+ ergo(level == TWIG_LEVEL &&
+ cbk_result == CBK_COORD_FOUND,
+ uf_info->container == UF_CONTAINER_EXTENTS));
+ }
+ }
+}
+
+int find_file_item_nohint(coord_t *coord, lock_handle *lh,
+ const reiser4_key *key, znode_lock_mode lock_mode,
+ struct inode *inode)
+{
+ return reiser4_object_lookup(meta_subvol_tree(),
+ inode, key, coord, lh, lock_mode,
+ FIND_MAX_NOT_MORE_THAN,
+ TWIG_LEVEL, LEAF_LEVEL,
+ (lock_mode == ZNODE_READ_LOCK) ? CBK_UNIQUE :
+ (CBK_UNIQUE | CBK_FOR_INSERT),
+ NULL /* ra_info */);
+}
+
+/**
+ * find_file_item - look for file item in the tree
+ * @hint: provides coordinate, lock handle, seal
+ * @key: key for search
+ * @lock_mode: mode of lock to put on returned node
+ * @inode: inode of the file the item belongs to
+ *
+ * This finds position in the tree corresponding to @key. It first tries to use
+ * @hint's seal if it is set.
+ */
+int find_file_item(hint_t *hint, const reiser4_key *key,
+ znode_lock_mode lock_mode,
+ struct inode *inode)
+{
+ int result;
+ coord_t *coord;
+ lock_handle *lh;
+
+ assert("nikita-3030", reiser4_schedulable());
+ assert("vs-1707", hint != NULL);
+ assert("vs-47", inode != NULL);
+ assert("edward-2375", inode_file_plugin(inode) !=
+ file_plugin_by_id(STRIPED_FILE_PLUGIN_ID));
+
+ coord = &hint->ext_coord.coord;
+ lh = hint->ext_coord.lh;
+ init_lh(lh);
+
+ result = hint_validate(hint,
+ meta_subvol_tree(),
+ key, 1 /* check key */, lock_mode);
+ if (!result) {
+ if (coord->between == AFTER_UNIT &&
+ equal_to_rdk(coord->node, key)) {
+ result = goto_right_neighbor(coord, lh);
+ if (result == -E_NO_NEIGHBOR)
+ return RETERR(-EIO);
+ if (result)
+ return result;
+ assert("vs-1152", equal_to_ldk(coord->node, key));
+ /*
+ * we moved to different node. Invalidate coord
+ * extension, zload is necessary to init it again
+ */
+ hint->ext_coord.valid = 0;
+ }
+
+ set_file_state(unix_file_inode_data(inode), CBK_COORD_FOUND,
+ znode_get_level(coord->node));
+
+ return CBK_COORD_FOUND;
+ }
+ coord_init_zero(coord);
+ result = find_file_item_nohint(coord, lh, key, lock_mode, inode);
+ set_file_state(unix_file_inode_data(inode), result,
+ znode_get_level(coord->node));
+
+ /* FIXME: we might already have coord extension initialized */
+ hint->ext_coord.valid = 0;
+ return result;
+}
+
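+/* Initialize a hint: zero it and attach its embedded lock handle
+ * to the extended coord */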
+void hint_init_zero(hint_t * hint)
+{
+ memset(hint, 0, sizeof(*hint));
+ init_lh(&hint->lh);
+ hint->ext_coord.lh = &hint->lh;
+}
+
+static int find_file_state(struct inode *inode, struct unix_file_info *uf_info)
+{
+ int result;
+ reiser4_key key;
+ coord_t coord;
+ lock_handle lh;
+
+ assert("edward-2086",
+ inode_file_plugin(inode) ==
+ file_plugin_by_id(UNIX_FILE_PLUGIN_ID));
+ assert("vs-1628", ea_obtained(uf_info));
+
+ if (uf_info->container == UF_CONTAINER_UNKNOWN) {
+ build_body_key_unix_file(inode, 0, &key);
+ init_lh(&lh);
+ result = find_file_item_nohint(&coord, &lh, &key,
+ ZNODE_READ_LOCK, inode);
+ set_file_state(uf_info, result, znode_get_level(coord.node));
+ done_lh(&lh);
+ if (!cbk_errored(result))
+ result = 0;
+ } else
+ result = 0;
+ assert("vs-1074",
+ ergo(result == 0, uf_info->container != UF_CONTAINER_UNKNOWN));
+ reiser4_txn_restart_current();
+ return result;
+}
+
+/**
+ * Estimate and reserve space needed to truncate a page
+ * which gets partially truncated:
+ * 1) one block for page itself;
+ * 2) stat-data update (estimate_one_insert_into_item);
+ * 3) one item insertion (estimate_one_insert_into_item)
+ * which may happen if page corresponds to hole extent
+ * and an unallocated one will have to be created.
+ *
+ * @inode: object that the partial page belongs to;
+ * @index: index of the partial page.
+ */
+static int reserve_partial_page(struct inode *inode, pgoff_t index)
+{
+ grab_space_enable();
+ return reiser4_grab_reserved(reiser4_get_current_sb(),
+ 1 +
+ 2 * estimate_one_insert_into_item(meta_subvol_tree()),
+ BA_CAN_COMMIT, get_meta_subvol());
+}
+
+/**
+ * estimate and reserve space needed to cut one item and update one stat data
+ * @inode: object to cut;
+ */
+int reserve_cut_iteration(struct inode *inode)
+{
+ reiser4_subvol *subv = get_meta_subvol();
+
+ assert("nikita-3172", lock_stack_isclean(get_current_lock_stack()));
+ /*
+ * We need to double our estimation now
+ * that we can delete more than one node
+ * FIXME-EDWARD: Not clear why to double.
+ */
+ grab_space_enable();
+ return reiser4_grab_reserved(reiser4_get_current_sb(),
+ 2 *(estimate_one_item_removal(&subv->tree) +
+ estimate_one_insert_into_item(&subv->tree)),
+ BA_CAN_COMMIT, subv);
+}
+
+int reiser4_update_file_size(struct inode *inode, loff_t new_size,
+ int update_sd)
+{
+ int result = 0;
+
+ INODE_SET_SIZE(inode, new_size);
+ if (update_sd) {
+ inode->i_ctime = inode->i_mtime = current_time(inode);
+ result = reiser4_update_sd(inode);
+ }
+ return result;
+}
+
+/**
+ * Cut file body starting from the last item until @new_size of
+ * the file is reached. Reserve space and update file stat data
+ * on every single cut from the tree.
+ */
+int cut_file_items(struct inode *inode, loff_t new_size,
+ int update_sd, loff_t cur_size,
+ int (*update_actor) (struct inode *, loff_t, int))
+{
+ reiser4_tree *tree;
+ reiser4_key from_key, to_key;
+ reiser4_key smallest_removed;
+ file_plugin *fplug = inode_file_plugin(inode);
+ int result;
+ int progress = 0;
+
+ assert("vs-1248",
+ fplug == file_plugin_by_id(UNIX_FILE_PLUGIN_ID) ||
+ fplug == file_plugin_by_id(CRYPTCOMPRESS_FILE_PLUGIN_ID));
+
+ tree = meta_subvol_tree();
+ fplug->build_body_key(inode, new_size, &from_key);
+ to_key = from_key;
+ set_key_offset(&to_key, cur_size - 1 /*get_key_offset(reiser4_max_key()) */ );
+ /*
+ * this loop normally runs just once
+ */
+ while (1) {
+ result = reserve_cut_iteration(inode);
+ if (result)
+ break;
+
+ result = reiser4_cut_tree_object(tree,
+ &from_key, &to_key,
+ &smallest_removed, inode, 1,
+ &progress);
+ if (result == -E_NO_NEIGHBOR)
+ result = 0;
+ else if (result == -E_REPEAT) {
+ /**
+ * -E_REPEAT is a signal to interrupt a long
+ * file truncation process
+ */
+ if (progress) {
+ result = update_actor(inode,
+ get_key_offset(&smallest_removed),
+ update_sd);
+ if (result)
+ break;
+ }
+ /* the below does up(sbinfo->delete_mutex).
+ * Do not get confused */
+ reiser4_release_reserved(inode->i_sb);
+ /**
+ * reiser4_cut_tree_object() was interrupted probably
+ * because current atom requires commit, we have to
+ * release transaction handle to allow atom commit.
+ */
+ reiser4_txn_restart_current();
+ continue;
+ } else if (result &&
+ !(result == CBK_COORD_NOTFOUND && new_size == 0
+ && inode->i_size == 0))
+ break;
+
+ set_key_offset(&smallest_removed, new_size);
+ /*
+ * Final sd update after the file gets its correct size
+ */
+ result = update_actor(inode, get_key_offset(&smallest_removed),
+ update_sd);
+ break;
+ }
+ /*
+ * the below does up(sbinfo->delete_mutex). Do not get confused
+ */
+ reiser4_release_reserved(inode->i_sb);
+
+ return result;
+}
+
+/**
+ * make file shorter
+ */
+static int shorten_file(struct inode *inode, loff_t new_size)
+{
+ int result;
+ struct page *page;
+ int padd_from;
+ unsigned long index;
+ struct unix_file_info *uf_info;
+
+ /*
+ * cut file body using volume-specific method
+ */
+ result = cut_file_items(inode, new_size,
+ 1, /* update_sd */
+ get_key_offset(reiser4_max_key()),
+ reiser4_update_file_size);
+ if (result)
+ return result;
+
+ uf_info = unix_file_inode_data(inode);
+ assert("vs-1105", new_size == inode->i_size);
+ if (new_size == 0) {
+ uf_info->container = UF_CONTAINER_EMPTY;
+ return 0;
+ }
+
+ result = find_file_state(inode, uf_info);
+ if (result)
+ return result;
+ if (uf_info->container == UF_CONTAINER_TAILS)
+ /*
+ * No need to worry about zeroing last page after new file
+ * end
+ */
+ return 0;
+
+ padd_from = inode->i_size & (PAGE_SIZE - 1);
+ if (!padd_from)
+ /* file is truncated to page boundary */
+ return 0;
+ /*
+ * last page is partially truncated - zero its content
+ */
+ index = (inode->i_size >> PAGE_SHIFT);
+ result = reserve_partial_page(inode, index);
+ if (result) {
+ assert("edward-2294",
+ get_current_super_private()->delete_mutex_owner == NULL);
+ return result;
+ }
+ page = read_mapping_page(inode->i_mapping, index, NULL);
+ if (IS_ERR(page)) {
+ /*
+ * the below does up(sbinfo->delete_mutex). Do not get
+ * confused
+ */
+ reiser4_release_reserved(inode->i_sb);
+ if (likely(PTR_ERR(page) == -EINVAL)) {
+ /* looks like file is built of tail items */
+ return 0;
+ }
+ return PTR_ERR(page);
+ }
+ wait_on_page_locked(page);
+ if (!PageUptodate(page)) {
+ put_page(page);
+ /*
+ * the below does up(sbinfo->delete_mutex). Do not get
+ * confused
+ */
+ reiser4_release_reserved(inode->i_sb);
+ return RETERR(-EIO);
+ }
+
+ /*
+	 * if page corresponds to a hole extent unit - an unallocated one will be
+ * created here. This is not necessary
+ */
+ result = find_or_create_extent_unix_file(page);
+
+ /*
+ * FIXME: cut_file_items has already updated inode. Probably it would
+ * be better to update it here when file is really truncated
+ */
+ if (result) {
+ put_page(page);
+ /*
+ * the below does up(sbinfo->delete_mutex). Do not get
+ * confused
+ */
+ reiser4_release_reserved(inode->i_sb);
+ return result;
+ }
+
+ lock_page(page);
+ assert("vs-1066", PageLocked(page));
+ zero_user_segment(page, padd_from, PAGE_SIZE);
+ unlock_page(page);
+ put_page(page);
+ /* the below does up(sbinfo->delete_mutex). Do not get confused */
+ reiser4_release_reserved(inode->i_sb);
+ return 0;
+}
+
+/**
+ * should_have_notail
+ * @uf_info: unix-file specific part of the inode
+ * @new_size: new file size
+ *
+ * Calls formatting plugin to see whether file of size @new_size has to be
+ * stored in unformatted nodes or in tail items. 0 is returned for the latter case.
+ */
+static int should_have_notail(const struct unix_file_info *uf_info, loff_t new_size)
+{
+ if (!uf_info->tplug)
+ return 1;
+ return !uf_info->tplug->have_tail(unix_file_info_to_inode(uf_info),
+ new_size);
+
+}
+
+/**
+ * change length of file
+ * @inode: inode of file
+ * @attr: attributes carrying the new file length in ->ia_size
+ *
+ * Adjusts items file @inode is built of to match the new size. It may either
+ * cut items or add them to represent a hole at the end of file. The caller has to
+ * obtain exclusive access to the file.
+ */
+static int truncate_body_unix_file(struct inode *inode, struct iattr *attr)
+{
+ int result;
+ loff_t new_size = attr->ia_size;
+
+ if (inode->i_size < new_size) {
+ /* expanding truncate */
+ struct unix_file_info *uf_info = unix_file_inode_data(inode);
+
+ result = find_file_state(inode, uf_info);
+ if (result)
+ return result;
+
+ if (should_have_notail(uf_info, new_size)) {
+ /*
+ * file of size @new_size has to be built of
+ * extents. If it is built of tails - convert to
+ * extents
+ */
+ if (uf_info->container == UF_CONTAINER_TAILS) {
+ /*
+				 * if file is being converted by another process
+ * - wait until it completes
+ */
+ while (1) {
+ if (reiser4_inode_get_flag(inode,
+ REISER4_PART_IN_CONV)) {
+ drop_exclusive_access(uf_info);
+ schedule();
+ get_exclusive_access(uf_info);
+ continue;
+ }
+ break;
+ }
+
+ if (uf_info->container == UF_CONTAINER_TAILS) {
+ result = tail2extent(uf_info);
+ if (result)
+ return result;
+ }
+ }
+ result = write_extent_unix_file(NULL, inode, NULL,
+ 0, &new_size);
+ if (result)
+ return result;
+ uf_info->container = UF_CONTAINER_EXTENTS;
+ } else {
+ if (uf_info->container == UF_CONTAINER_EXTENTS) {
+ result = write_extent_unix_file(NULL, inode,
+ NULL, 0,
+ &new_size);
+ if (result)
+ return result;
+ } else {
+ result = write_tail_unix_file(NULL, inode, NULL,
+ 0, &new_size);
+ if (result)
+ return result;
+ uf_info->container = UF_CONTAINER_TAILS;
+ }
+ }
+ BUG_ON(result > 0);
+ result = reiser4_update_file_size(inode, new_size, 1);
+ BUG_ON(result != 0);
+ } else
+ result = shorten_file(inode, new_size);
+ return result;
+}
+
+/**
+ * load_file_hint - copy hint from struct file to local variable
+ * @file: file to get hint from
+ * @hint: structure to fill
+ *
+ * Reiser4 specific portion of struct file may contain information (hint)
+ * stored on exiting from previous read or write. That information includes
+ * seal of znode and coord within that znode where previous read or write
+ * stopped. This function copies that information to @hint if it was stored or
+ * initializes @hint by 0s otherwise.
+ */
+int load_file_hint(struct file *file, hint_t *hint)
+{
+ reiser4_file_fsdata *fsdata;
+
+ if (file) {
+ fsdata = reiser4_get_file_fsdata(file);
+ if (IS_ERR(fsdata))
+ return PTR_ERR(fsdata);
+
+ spin_lock_inode(file_inode(file));
+ if (reiser4_seal_is_set(&fsdata->reg.hint.seal)) {
+ memcpy(hint, &fsdata->reg.hint, sizeof(*hint));
+ init_lh(&hint->lh);
+ hint->ext_coord.lh = &hint->lh;
+ spin_unlock_inode(file_inode(file));
+ /*
+ * force re-validation of the coord on the first
+ * iteration of the read/write loop.
+ */
+ hint->ext_coord.valid = 0;
+ assert("nikita-19892",
+ coords_equal(&hint->seal.coord1,
+ &hint->ext_coord.coord));
+ return 0;
+ }
+ memset(&fsdata->reg.hint, 0, sizeof(hint_t));
+ spin_unlock_inode(file_inode(file));
+ }
+ hint_init_zero(hint);
+ return 0;
+}
+
+/**
+ * Copy hint to the reiser4-private part of struct file
+ * @file: file to save hint in
+ * @hint: hint to save
+ *
+ * This copies @hint to the reiser4-private part of struct file. It can help
+ * speed up future accesses to the file.
+ */
+void save_file_hint(struct file *file, const hint_t *hint)
+{
+ reiser4_file_fsdata *fsdata;
+
+ assert("edward-1337", hint != NULL);
+
+ if (!file || !reiser4_seal_is_set(&hint->seal))
+ return;
+ fsdata = reiser4_get_file_fsdata(file);
+ assert("vs-965", !IS_ERR(fsdata));
+ assert("nikita-19891",
+ coords_equal(&hint->seal.coord1, &hint->ext_coord.coord));
+ assert("vs-30", hint->lh.owner == NULL);
+ spin_lock_inode(file_inode(file));
+ fsdata->reg.hint = *hint;
+ spin_unlock_inode(file_inode(file));
+ return;
+}
+
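+/* Invalidate a hint: mark the extended coord invalid, drop the seal
+ * and release the long term lock */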
+void reiser4_unset_hint(hint_t * hint)
+{
+ assert("vs-1315", hint);
+ hint->ext_coord.valid = 0;
+ reiser4_seal_done(&hint->seal);
+ done_lh(&hint->lh);
+}
+
+/* The coord must be set properly, so that reiser4_set_hint
+ has nothing to do */
+void reiser4_set_hint(hint_t * hint, const reiser4_key * key,
+ znode_lock_mode mode)
+{
+ ON_DEBUG(coord_t * coord = &hint->ext_coord.coord);
+ assert("vs-1207", WITH_DATA(coord->node, check_coord(coord, key)));
+
+ reiser4_seal_init(&hint->seal, &hint->ext_coord.coord, key);
+ hint->offset = get_key_offset(key);
+ hint->mode = mode;
+ done_lh(&hint->lh);
+}
+
+int hint_is_set(const hint_t * hint)
+{
+ return reiser4_seal_is_set(&hint->seal);
+}
+
+#if REISER4_DEBUG
+static int all_but_offset_key_eq(const reiser4_key * k1,
+ const reiser4_key * k2)
+{
+ return (get_key_locality(k1) == get_key_locality(k2) &&
+ get_key_type(k1) == get_key_type(k2) &&
+ get_key_band(k1) == get_key_band(k2) &&
+ get_key_ordering(k1) == get_key_ordering(k2) &&
+ get_key_objectid(k1) == get_key_objectid(k2));
+}
+#endif
+
+static int hint_validate(hint_t *hint, reiser4_tree *tree,
+ const reiser4_key *key, int check_key,
+ znode_lock_mode lock_mode)
+{
+ if (!hint || !hint_is_set(hint) || hint->mode != lock_mode)
+ /* hint either not set or set by different operation */
+ return RETERR(-E_REPEAT);
+
+ assert("vs-1277", all_but_offset_key_eq(key, &hint->seal.key));
+
+ if (check_key && get_key_offset(key) != hint->offset)
+ /* hint is set for different key */
+ return RETERR(-E_REPEAT);
+
+ assert("vs-31", hint->ext_coord.lh == &hint->lh);
+ return reiser4_seal_validate(&hint->seal, tree,
+ &hint->ext_coord.coord, key,
+ hint->ext_coord.lh, lock_mode,
+ ZNODE_LOCK_LOPRI);
+}
+
+/**
+ * Look for place at twig level for extent corresponding to page,
+ * call extent's writepage method to create unallocated extent if
+ * it does not exist yet, initialize jnode, capture page
+ */
+int find_or_create_extent_unix_file(struct page *page)
+{
+ int result;
+ struct inode *inode;
+ int plugged_hole;
+
+ jnode *node;
+
+ assert("vs-1065", page->mapping && page->mapping->host);
+
+ inode = page->mapping->host;
+
+ lock_page(page);
+ node = jnode_of_page(page);
+ if (IS_ERR(node)) {
+ unlock_page(page);
+ return PTR_ERR(node);
+ }
+ JF_SET(node, JNODE_WRITE_PREPARED);
+ unlock_page(page);
+ if (node->blocknr == 0) {
+ plugged_hole = 0;
+ result = update_extent_unix_file(inode, node,
+ page_offset(page),
+ &plugged_hole);
+ if (result) {
+ JF_CLR(node, JNODE_WRITE_PREPARED);
+ jput(node);
+ warning("edward-1549",
+ "failed to update extent (%d)", result);
+ return result;
+ }
+ if (plugged_hole)
+ reiser4_update_sd(inode);
+ } else {
+ struct atom_brick_info *abi;
+
+ assert("edward-1982", node->subvol != NULL);
+ spin_lock_jnode(node);
+ result = reiser4_try_capture(node, ZNODE_WRITE_LOCK, 0);
+ spin_unlock_jnode(node);
+ BUG_ON(result != 0);
+
+ result = check_insert_atom_brick_info(node->subvol->id,
+ &abi);
+ if (result) {
+ jput(node);
+ return result;
+ }
+ spin_lock_jnode(node);
+ jnode_make_dirty_locked(node);
+ spin_unlock_jnode(node);
+ }
+ BUG_ON(node->atom == NULL);
+ JF_CLR(node, JNODE_WRITE_PREPARED);
+
+ if (get_current_context()->entd) {
+ entd_context *ent = get_entd_context(inode->i_sb);
+
+ if (ent->cur_request->page == page)
+ /* the following reference will be
+ dropped in reiser4_writeout */
+ ent->cur_request->node = jref(node);
+ }
+ jput(node);
+ return 0;
+}
+
+/**
+ * has_anonymous_pages - check whether inode has pages dirtied via mmap
+ * @inode: inode to check
+ *
+ * Returns true if inode's mapping has dirty pages which do not belong to any
+ * atom. Those are either tagged PAGECACHE_TAG_REISER4_MOVED in mapping's page
+ * tree or were eflushed and can be found via jnodes tagged
+ * EFLUSH_TAG_ANONYMOUS in radix tree of jnodes.
+ */
+static int has_anonymous_pages(struct inode *inode)
+{
+ int result;
+
+ xa_lock_irq(&inode->i_mapping->i_pages);
+ result = radix_tree_tagged(&inode->i_mapping->i_pages,
+ PAGECACHE_TAG_REISER4_MOVED);
+ xa_unlock_irq(&inode->i_mapping->i_pages);
+ return result;
+}
+
+/**
+ * Reserve space needed to capture one anonymous page
+ */
+static int reserve_capture_anon_page(void)
+{
+ /*
+ * page capture may require extent creation (if it does not exist yet)
+ * and stat data's update (number of blocks changes on extent creation)
+ */
+ grab_space_enable();
+ return reiser4_grab_space(1 +
+ 2 * estimate_one_insert_into_item(meta_subvol_tree()),
+ BA_CAN_COMMIT, get_meta_subvol());
+}
+
+/*
+ * Support for "anonymous" pages and jnodes.
+ *
+ * When a file is write-accessed through mmap, pages can be dirtied from the
+ * user level. In this case the kernel is not notified until one of the
+ * following happens:
+ *
+ * (1) msync()
+ *
+ * (2) truncate() (either explicit or through unlink)
+ *
+ * (3) VM scanner starts reclaiming mapped pages, dirtying them before
+ * starting write-back.
+ *
+ * As a result of (3) ->writepage may be called on a dirty page without a
+ * jnode. Such a page is called "anonymous" in reiser4. Certain work-loads
+ * (iozone) generate a huge number of anonymous pages.
+ *
+ * The reiser4_sync_sb() method tries to insert anonymous pages into the
+ * tree. This is done by the capture_anon_*() functions below.
+ */
+
+/**
+ * involve page into transaction
+ * @page: page to deal with
+ *
+ * Takes care that @page has corresponding metadata in the tree;
+ * creates jnode for @page and captures it. On success 1 is returned.
+ * An exclusive or non-exclusive lock must be held.
+ */
+static int capture_anon_page(struct page *page)
+{
+ int ret;
+ struct inode *inode;
+
+ if (PageWriteback(page))
+ /*
+ * FIXME: do nothing?
+ */
+ return 0;
+ assert("vs-1084", page->mapping && page->mapping->host);
+
+ inode = page->mapping->host;
+
+ assert("vs-1139",
+ unix_file_inode_data(inode)->container == UF_CONTAINER_EXTENTS);
+ assert("vs-1393", inode->i_size > page_offset(page));
+
+ ret = reserve_capture_anon_page();
+ if (ret)
+ return ret;
+ ret = find_or_create_extent_unix_file(page);
+ if (ret) {
+ SetPageError(page);
+ warning("nikita-3329",
+ "Cannot capture anon page: %i", ret);
+ } else
+ ret = 1;
+ return ret;
+}
+
+/**
+ * capture_anon_pages - find and capture pages dirtied via mmap
+ * @mapping: address space where to look for pages
+ * @index: start index
+ * @to_capture: maximum number of pages to capture
+ * @capture_anon_page_fn: method to capture one anonymous page
+ *
+ * Looks for pages tagged REISER4_MOVED starting from the *@index-th page,
+ * captures (involves into atom) them, returns number of captured pages,
+ * updates @index to next page after the last captured one.
+ */
+static int capture_anon_pages(struct address_space *mapping,
+ pgoff_t *index, unsigned int to_capture,
+ int(*capture_anon_page_fn)(struct page *))
+{
+ int result;
+ struct pagevec pvec;
+ unsigned int i, count;
+ int nr;
+
+ pagevec_init(&pvec);
+ count = min(pagevec_space(&pvec), to_capture);
+ nr = 0;
+
+ /* find pages tagged MOVED */
+ xa_lock_irq(&mapping->i_pages);
+ pvec.nr = radix_tree_gang_lookup_tag(&mapping->i_pages,
+ (void **)pvec.pages, *index, count,
+ PAGECACHE_TAG_REISER4_MOVED);
+ if (pagevec_count(&pvec) == 0) {
+ /*
+		 * there are no pages tagged MOVED in mapping->i_pages
+ * starting from *index
+ */
+ xa_unlock_irq(&mapping->i_pages);
+ *index = (pgoff_t)-1;
+ return 0;
+ }
+
+ /* clear MOVED tag for all found pages */
+ for (i = 0; i < pagevec_count(&pvec); i++) {
+ get_page(pvec.pages[i]);
+ radix_tree_tag_clear(&mapping->i_pages, pvec.pages[i]->index,
+ PAGECACHE_TAG_REISER4_MOVED);
+ }
+ xa_unlock_irq(&mapping->i_pages);
+
+ *index = pvec.pages[i - 1]->index + 1;
+
+ for (i = 0; i < pagevec_count(&pvec); i++) {
+ result = capture_anon_page_fn(pvec.pages[i]);
+ if (result == 1)
+ nr++;
+ else {
+ if (result < 0) {
+ warning("vs-1454",
+ "failed to capture page: "
+ "result=%d, captured=%d)\n",
+ result, i);
+
+ /*
+ * set MOVED tag to all pages which left not
+ * captured
+ */
+ xa_lock_irq(&mapping->i_pages);
+ for (; i < pagevec_count(&pvec); i ++) {
+ radix_tree_tag_set(&mapping->i_pages,
+ pvec.pages[i]->index,
+ PAGECACHE_TAG_REISER4_MOVED);
+ }
+ xa_unlock_irq(&mapping->i_pages);
+
+ pagevec_release(&pvec);
+ return result;
+ } else {
+ /*
+				 * result == 0. capture_anon_page_fn returns
+				 * 0 for a page under writeback. Set MOVED tag on
+ * that page
+ */
+ xa_lock_irq(&mapping->i_pages);
+ radix_tree_tag_set(&mapping->i_pages,
+ pvec.pages[i]->index,
+ PAGECACHE_TAG_REISER4_MOVED);
+ xa_unlock_irq(&mapping->i_pages);
+ if (i == 0)
+ *index = pvec.pages[0]->index;
+ else
+ *index = pvec.pages[i - 1]->index + 1;
+ }
+ }
+ }
+ pagevec_release(&pvec);
+ return nr;
+}
+
+/**
+ * capture_anon_jnodes - find and capture anonymous jnodes
+ * @mapping: address space where to look for jnodes
+ * @from: start index
+ * @to: end index
+ * @to_capture: maximum number of jnodes to capture
+ *
+ * Looks for jnodes tagged EFLUSH_TAG_ANONYMOUS in inode's tree of jnodes in
+ * the range of indexes @from-@to and captures them, returns number of captured
+ * jnodes, updates @from to next jnode after the last captured one.
+ */
+static int capture_anon_jnodes(struct address_space *mapping,
+ pgoff_t *from, pgoff_t to, int to_capture)
+{
+ *from = to;
+ return 0;
+}
+
+/*
+ * Commit atom of the jnode of a page.
+ */
+int reiser4_sync_page(struct page *page)
+{
+ int result;
+ do {
+ jnode *node;
+ txn_atom *atom;
+
+ lock_page(page);
+ node = jprivate(page);
+ if (node != NULL) {
+ spin_lock_jnode(node);
+ atom = jnode_get_atom(node);
+ spin_unlock_jnode(node);
+ } else
+ atom = NULL;
+ unlock_page(page);
+ result = reiser4_sync_atom(atom);
+ } while (result == -E_REPEAT);
+ /*
+ * ZAM-FIXME-HANS: document the logic of this loop, is it just to
+ * handle the case where more pages get added to the atom while we are
+ * syncing it?
+ */
+ assert("nikita-3485", ergo(result == 0,
+ get_current_context()->trans->atom == NULL));
+ return result;
+}
+
+/*
+ * Commit atoms of all pages of an inode's mapping:
+ * call reiser4_sync_page for each page found in the mapping's page tree
+ */
+int reiser4_sync_page_list(struct inode *inode)
+{
+ int result;
+ struct address_space *mapping;
+ unsigned long from; /* start index for radix_tree_gang_lookup */
+ unsigned int found; /* return value for radix_tree_gang_lookup */
+
+ mapping = inode->i_mapping;
+ from = 0;
+ result = 0;
+
+ xa_lock_irq(&mapping->i_pages);
+ while (result == 0) {
+ struct page *page;
+
+ found = radix_tree_gang_lookup(&mapping->i_pages,
+ (void **)&page, from, 1);
+ assert("edward-1550", found < 2);
+ if (found == 0)
+ break;
+ /**
+		 * page may not leave radix tree because it is protected from
+		 * truncation by inode->i_mutex locked by sys_fsync
+ */
+ get_page(page);
+ xa_unlock_irq(&mapping->i_pages);
+
+ from = page->index + 1;
+
+ result = reiser4_sync_page(page);
+
+ put_page(page);
+ xa_lock_irq(&mapping->i_pages);
+ }
+ xa_unlock_irq(&mapping->i_pages);
+ return result;
+}
+
+static int commit_file_atoms(struct inode *inode)
+{
+ int result;
+ struct unix_file_info *uf_info;
+
+ uf_info = unix_file_inode_data(inode);
+
+ get_exclusive_access(uf_info);
+ /*
+ * find what items file is made from
+ */
+ result = find_file_state(inode, uf_info);
+ drop_exclusive_access(uf_info);
+ if (result != 0)
+ return result;
+
+ /*
+ * file state cannot change because we are under ->i_mutex
+ */
+ switch (uf_info->container) {
+ case UF_CONTAINER_EXTENTS:
+		/* find_file_state might open or join an atom */
+ reiser4_txn_restart_current();
+ result =
+ /*
+ * when we are called by
+ * filemap_fdatawrite->
+ * do_writepages()->
+ * reiser4_writepages_dispatch()
+ *
+			 * inode->i_mapping->dirty_pages are spliced into
+ * ->io_pages, leaving ->dirty_pages dirty.
+ *
+ * When we are called from
+ * reiser4_fsync()->sync_unix_file(), we have to
+ * commit atoms of all pages on the ->dirty_list.
+ *
+ * So for simplicity we just commit ->io_pages and
+ * ->dirty_pages.
+ */
+ reiser4_sync_page_list(inode);
+ break;
+ case UF_CONTAINER_TAILS:
+ /*
+ * NOTE-NIKITA probably we can be smarter for tails. For now
+ * just commit all existing atoms.
+ */
+ result = txnmgr_force_commit_all(inode->i_sb, 0);
+ break;
+ case UF_CONTAINER_EMPTY:
+ result = 0;
+ break;
+ case UF_CONTAINER_UNKNOWN:
+ default:
+ result = -EIO;
+ break;
+ }
+
+ /*
+ * commit current transaction: there can be captured nodes from
+ * find_file_state() and finish_conversion().
+ */
+ reiser4_txn_restart_current();
+ return result;
+}
+
+/**
+ * This captures anonymous pages and anonymous jnodes. Anonymous pages are
+ * pages which are dirtied via mmapping. Anonymous jnodes are ones which were
+ * created by reiser4_writepage.
+ */
+int reiser4_writepages_generic(struct address_space *mapping,
+ struct writeback_control *wbc,
+ int(*capture_anon_page_fn)(struct page *),
+ int(*commit_file_atoms_fn)(struct inode *))
+{
+ int result;
+ struct unix_file_info *uf_info;
+ pgoff_t pindex, jindex, nr_pages;
+ long to_capture;
+ struct inode *inode;
+
+ inode = mapping->host;
+ if (!has_anonymous_pages(inode)) {
+ result = 0;
+ goto end;
+ }
+ jindex = pindex = wbc->range_start >> PAGE_SHIFT;
+ result = 0;
+ nr_pages = size_in_pages(i_size_read(inode));
+
+ uf_info = unix_file_inode_data(inode);
+
+ do {
+ reiser4_context *ctx;
+
+ if (wbc->sync_mode != WB_SYNC_ALL)
+ to_capture = min(wbc->nr_to_write, CAPTURE_APAGE_BURST);
+ else
+ to_capture = CAPTURE_APAGE_BURST;
+
+ ctx = reiser4_init_context(inode->i_sb);
+ if (IS_ERR(ctx)) {
+ result = PTR_ERR(ctx);
+ break;
+ }
+ /* avoid recursive calls to ->sync_inodes */
+ ctx->nobalance = 1;
+ assert("zam-760", lock_stack_isclean(get_current_lock_stack()));
+ assert("edward-1551", LOCK_CNT_NIL(inode_sem_w));
+ assert("edward-1552", LOCK_CNT_NIL(inode_sem_r));
+
+ reiser4_txn_restart_current();
+
+ /* we have to get nonexclusive access to the file */
+ if (get_current_context()->entd) {
+ /*
+ * use nonblocking version of nonexclusive_access to
+ * avoid deadlock which might look like the following:
+ * process P1 holds NEA on file F1 and called entd to
+ * reclaim some memory. Entd works for P1 and is going
+ * to capture pages of file F2. To do that entd has to
+ * get NEA to F2. F2 is held by process P2 which also
+ * called entd. But entd is serving P1 at the moment
+			 * and P2 has to wait. Process P3 is trying to get EA to
+			 * file F2. Existence of a pending EA request to file F2
+			 * makes it impossible for entd to get NEA to file
+			 * F2. Neither of these processes can continue. Using
+			 * the nonblocking version of getting NEA is supposed to
+ * avoid this deadlock.
+ */
+ if (try_to_get_nonexclusive_access(uf_info) == 0) {
+ result = RETERR(-EBUSY);
+ reiser4_exit_context(ctx);
+ break;
+ }
+ } else
+ get_nonexclusive_access(uf_info);
+
+ while (to_capture > 0) {
+ pgoff_t start;
+
+ assert("vs-1727", jindex <= pindex);
+ if (pindex == jindex) {
+ start = pindex;
+ result = capture_anon_pages(inode->i_mapping,
+ &pindex,
+ to_capture,
+ capture_anon_page_fn);
+ if (result <= 0)
+ break;
+ to_capture -= result;
+ wbc->nr_to_write -= result;
+ if (start + result == pindex) {
+ jindex = pindex;
+ continue;
+ }
+ if (to_capture <= 0)
+ break;
+ }
+ /*
+ * deal with anonymous jnodes between jindex and pindex
+ */
+ result = capture_anon_jnodes(inode->i_mapping,
+ &jindex,
+ pindex, to_capture);
+ if (result < 0)
+ break;
+ to_capture -= result;
+ get_current_context()->nr_captured += result;
+
+ if (jindex == (pgoff_t) - 1) {
+ assert("vs-1728", pindex == (pgoff_t) - 1);
+ break;
+ }
+ }
+ if (to_capture <= 0)
+			/* there may be more pages left */
+ __mark_inode_dirty(inode, I_DIRTY_PAGES);
+
+ drop_nonexclusive_access(uf_info);
+ if (result < 0) {
+ /* error happened */
+ reiser4_exit_context(ctx);
+ return result;
+ }
+ if (wbc->sync_mode != WB_SYNC_ALL) {
+ reiser4_exit_context(ctx);
+ return 0;
+ }
+ result = commit_file_atoms_fn(inode);
+ reiser4_exit_context(ctx);
+ if (pindex >= nr_pages && jindex == pindex)
+ break;
+ } while (1);
+
+ end:
+ if (is_in_reiser4_context()) {
+ if (get_current_context()->nr_captured >= CAPTURE_APAGE_BURST) {
+ /*
+ * there are already pages to flush, flush them out, do
+ * not delay until end of reiser4_sync_inodes
+ */
+ reiser4_writeout(inode->i_sb, wbc);
+ get_current_context()->nr_captured = 0;
+ }
+ }
+ return result;
+}
+
+int writepages_unix_file(struct address_space *mapping,
+ struct writeback_control *wbc)
+{
+ return reiser4_writepages_generic(mapping, wbc,
+ capture_anon_page,
+ commit_file_atoms);
+}
+
+/**
+ * ->readpage() method of address space operations for unix-file plugin
+ */
+int readpage_unix_file(struct file *file, struct page *page)
+{
+ reiser4_context *ctx;
+ int result;
+ struct inode *inode;
+ reiser4_key key;
+ hint_t *hint;
+ lock_handle *lh;
+ coord_t *coord;
+
+ assert("vs-1062", PageLocked(page));
+ assert("vs-976", !PageUptodate(page));
+ assert("vs-1061", page->mapping && page->mapping->host);
+
+ inode = page->mapping->host;
+
+ assert("edward-2087", inode_file_plugin(inode) ==
+ file_plugin_by_id(UNIX_FILE_PLUGIN_ID));
+
+ if (inode->i_size <= page_offset(page)) {
+ /* page is out of file */
+ zero_user(page, 0, PAGE_SIZE);
+ SetPageUptodate(page);
+ unlock_page(page);
+ return 0;
+ }
+ ctx = reiser4_init_context(inode->i_sb);
+ if (IS_ERR(ctx)) {
+ unlock_page(page);
+ return PTR_ERR(ctx);
+ }
+
+ hint = kmalloc(sizeof(*hint), reiser4_ctx_gfp_mask_get());
+ if (hint == NULL) {
+ unlock_page(page);
+ reiser4_exit_context(ctx);
+ return RETERR(-ENOMEM);
+ }
+
+ result = load_file_hint(file, hint);
+ if (result) {
+ kfree(hint);
+ unlock_page(page);
+ reiser4_exit_context(ctx);
+ return result;
+ }
+ lh = &hint->lh;
+ /*
+ * construct key of the page's first byte
+ */
+ build_body_key_unix_file(inode, page_offset(page), &key);
+ /*
+ * look for file metadata corresponding to the page's first byte
+ */
+ get_page(page);
+ unlock_page(page);
+ result = find_file_item(hint, &key, ZNODE_READ_LOCK, inode);
+ lock_page(page);
+ put_page(page);
+
+ if (page->mapping == NULL) {
+ /*
+ * readpage allows truncate to run concurrently.
+ * Page was truncated while it was not locked
+ */
+ done_lh(lh);
+ kfree(hint);
+ unlock_page(page);
+ reiser4_txn_restart(ctx);
+ reiser4_exit_context(ctx);
+ return -EINVAL;
+ }
+ if (result != CBK_COORD_FOUND ||
+ hint->ext_coord.coord.between != AT_UNIT) {
+
+ if (result == CBK_COORD_FOUND &&
+ hint->ext_coord.coord.between != AT_UNIT)
+ /* file is truncated */
+ result = -EINVAL;
+ done_lh(lh);
+ kfree(hint);
+ unlock_page(page);
+ reiser4_txn_restart(ctx);
+ reiser4_exit_context(ctx);
+ return result;
+ }
+ /*
+ * item corresponding to page is found.
+ * It can not be removed because znode lock is held
+ */
+ if (PageUptodate(page)) {
+ done_lh(lh);
+ kfree(hint);
+ unlock_page(page);
+ reiser4_txn_restart(ctx);
+ reiser4_exit_context(ctx);
+ return 0;
+ }
+ coord = &hint->ext_coord.coord;
+ result = zload(coord->node);
+ if (result) {
+ done_lh(lh);
+ kfree(hint);
+ unlock_page(page);
+ reiser4_txn_restart(ctx);
+ reiser4_exit_context(ctx);
+ return result;
+ }
+ validate_extended_coord(&hint->ext_coord, page_offset(page));
+
+ if (!coord_is_existing_unit(coord)) {
+ /* this indicates corruption */
+ warning("vs-280",
+ "Looking for page %lu of file %llu (size %lli). "
+ "No file items found (%d). File is corrupted?\n",
+ page->index, (unsigned long long)get_inode_oid(inode),
+ inode->i_size, result);
+ zrelse(coord->node);
+ done_lh(lh);
+ kfree(hint);
+ unlock_page(page);
+ reiser4_txn_restart(ctx);
+ reiser4_exit_context(ctx);
+ return RETERR(-EIO);
+ }
+ switch(item_plugin_by_coord(coord)->h.id) {
+ case EXTENT40_POINTER_ID:
+ result = reiser4_readpage_extent(coord, page);
+ break;
+ case FORMATTING_ID:
+ result = readpage_tail_unix_file(coord, page);
+ break;
+ default:
+ result = RETERR(-EINVAL);
+ }
+ if (!result) {
+ set_key_offset(&key,
+ (loff_t) (page->index + 1) << PAGE_SHIFT);
+ /* FIXME should call reiser4_set_hint() */
+ reiser4_unset_hint(hint);
+ } else {
+ unlock_page(page);
+ reiser4_unset_hint(hint);
+ }
+ assert("vs-979",
+ ergo(result == 0, (PageLocked(page) || PageUptodate(page))));
+ assert("vs-9791", ergo(result != 0, !PageLocked(page)));
+
+ zrelse(coord->node);
+ done_lh(lh);
+
+ save_file_hint(file, hint);
+ kfree(hint);
+
+ /*
+ * FIXME: explain why it is needed. HINT: page allocation in write can
+ * not be done when atom is not NULL because reiser4_writepage can not
+	 * kick entd and has to eflush
+ */
+ reiser4_txn_restart(ctx);
+ reiser4_exit_context(ctx);
+ return result;
+}
+
+struct uf_readpages_context {
+ lock_handle lh;
+ coord_t coord;
+};
+
+/**
+ * A callback function for readpages_unix_file/read_cache_pages.
+ * We don't take non-exclusive access. If an item different from an
+ * extent pointer is found in some iteration, then an error (-EINVAL)
+ * is returned.
+ *
+ * FIXME-EDWARD: This function is suboptimal. We can collect information
+ * about the next unit/item in the node to save twig lock and hence to
+ * reduce the number of tree searches
+ *
+ * @data -- a pointer to a struct uf_readpages_context object, to save the
+ * twig lock and the coord between read_cache_page iterations.
+ * @page -- page to start read against;
+ * @striped -- if true, then filler is called by striped file plugin.
+ */
+int reiser4_readpages_filler_generic(void *data,
+ struct page *page, int striped)
+{
+ int ret = 0;
+ reiser4_extent *ext;
+ __u64 ext_index;
+ int cbk_done = 0;
+ struct uf_readpages_context *rc = data;
+ struct address_space *mapping = page->mapping;
+ file_plugin *fplug = inode_file_plugin(mapping->host);
+
+ if (PageUptodate(page)) {
+ unlock_page(page);
+ return 0;
+ }
+ get_page(page);
+
+ if (rc->lh.node == 0) {
+ /* no twig lock - have to do tree search. */
+ reiser4_key key;
+ repeat:
+ unlock_page(page);
+
+ fplug->build_body_key(mapping->host,
+ page_offset(page), &key);
+
+ ret = coord_by_key(meta_subvol_tree(),
+ &key, &rc->coord, &rc->lh,
+ ZNODE_READ_LOCK, FIND_EXACT,
+ TWIG_LEVEL, TWIG_LEVEL, CBK_UNIQUE, NULL);
+ if (unlikely(ret))
+ goto exit;
+ lock_page(page);
+ if (PageUptodate(page))
+ goto unlock;
+ cbk_done = 1;
+ }
+ ret = zload(rc->coord.node);
+ if (unlikely(ret))
+ goto unlock;
+ if (!coord_is_existing_unit(&rc->coord)) {
+ /*
+ * extent pointer representing that block
+ * of data not found
+ */
+ if (striped) {
+ /* hole in a file */
+ ret = __reiser4_readpage_extent(NULL, NULL, 0, page);
+ zrelse(rc->coord.node);
+ done_lh(&rc->lh);
+ goto exit;
+ } else {
+ zrelse(rc->coord.node);
+ ret = RETERR(-ENOENT);
+ goto unlock;
+ }
+ } else if (!item_is_extent(&rc->coord)) {
+ /*
+ * ->readpages() is not defined for tail items
+ */
+ zrelse(rc->coord.node);
+ ret = RETERR(-EINVAL);
+ goto unlock;
+ }
+ ext = extent_by_coord(&rc->coord);
+ ext_index = extent_unit_index(&rc->coord);
+
+ if (page->index < ext_index ||
+ page->index >= ext_index + extent_get_width(ext)) {
+ /*
+ * the page index doesn't belong to the extent unit
+ * which the coord points to - release the lock and
+ * repeat with tree search
+ */
+ zrelse(rc->coord.node);
+ done_lh(&rc->lh);
+ /*
+ * we can be here after a CBK call only in case of
+ * corruption of the tree or the tree lookup
+ * algorithm bug
+ */
+ if (unlikely(cbk_done)) {
+ ret = RETERR(-EIO);
+ goto unlock;
+ }
+ goto repeat;
+ }
+ ret = __reiser4_readpage_extent(&rc->coord,
+ ext, page->index - ext_index,
+ page);
+ zrelse(rc->coord.node);
+ if (likely(!ret))
+ goto exit;
+ unlock:
+ unlock_page(page);
+ exit:
+ put_page(page);
+ return ret;
+}
+
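+/*
+ * Unix-file flavor of the generic filler above: @striped is 0, so a missing
+ * extent pointer is reported as -ENOENT instead of being treated as a hole.
+ */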
+static inline int readpages_filler_uf(void *data, struct page *page)
+{
+ return reiser4_readpages_filler_generic(data, page, 0);
+}
+
+/**
+ * reiser4_readpages_generic - called by the readahead code, starts reading
+ * for each page of the given list of pages
+ */
+int reiser4_readpages_generic(struct file *file, struct address_space *mapping,
+ struct list_head *pages, unsigned nr_pages,
+ int (*filler)(void *data, struct page *page))
+{
+ reiser4_context *ctx;
+ struct uf_readpages_context rc;
+ int ret;
+
+ ctx = reiser4_init_context(mapping->host->i_sb);
+ if (IS_ERR(ctx)) {
+ put_pages_list(pages);
+ return PTR_ERR(ctx);
+ }
+ init_lh(&rc.lh);
+ ret = read_cache_pages(mapping, pages, filler, &rc);
+ done_lh(&rc.lh);
+
+ context_set_commit_async(ctx);
+ /* close the transaction to protect further page allocation from deadlocks */
+ reiser4_txn_restart(ctx);
+ reiser4_exit_context(ctx);
+ return ret;
+}
+
+int readpages_unix_file(struct file *file, struct address_space *mapping,
+ struct list_head *pages, unsigned nr_pages)
+{
+ return reiser4_readpages_generic(file, mapping, pages, nr_pages,
+ readpages_filler_uf);
+}
+
+/* this is called with nonexclusive access obtained,
+ file's container can not change */
+static ssize_t do_read_compound_file(hint_t *hint, struct file *file,
+ char __user *buf, size_t count,
+ loff_t *off)
+{
+ int result;
+ struct inode *inode;
+ flow_t flow;
+ coord_t *coord;
+ znode *loaded;
+
+ inode = file_inode(file);
+
+ /* build flow */
+ result = flow_by_inode_unix_file(inode, buf, 1 /* user space */,
+ count, *off, READ_OP, &flow);
+ if (unlikely(result))
+ return result;
+
+ /* get seal and coord sealed with it from reiser4 private data
+ of struct file. The coord will tell us where our last read
+ of this file finished, and the seal will help to determine
+ if that location is still valid.
+ */
+ coord = &hint->ext_coord.coord;
+ while (flow.length && result == 0) {
+ result = find_file_item(hint, &flow.key,
+ ZNODE_READ_LOCK, inode);
+ if (cbk_errored(result))
+ /* error happened */
+ break;
+
+ if (coord->between != AT_UNIT) {
+ /* there were no items corresponding to given offset */
+ done_lh(hint->ext_coord.lh);
+ break;
+ }
+
+ loaded = coord->node;
+ result = zload(loaded);
+ if (unlikely(result)) {
+ done_lh(hint->ext_coord.lh);
+ break;
+ }
+
+ if (hint->ext_coord.valid == 0)
+ validate_extended_coord(&hint->ext_coord,
+ get_key_offset(&flow.key));
+
+ assert("vs-4", hint->ext_coord.valid == 1);
+ assert("vs-33", hint->ext_coord.lh == &hint->lh);
+
+ switch(item_plugin_by_coord(coord)->h.id) {
+ case EXTENT40_POINTER_ID:
+ result = read_extent_unix_file(file, &flow, hint);
+ break;
+ case FORMATTING_ID:
+ result = read_tail_unix_file(file, &flow, hint);
+ break;
+ default:
+ result = RETERR(-EINVAL);
+ }
+ zrelse(loaded);
+ done_lh(hint->ext_coord.lh);
+ }
+ return (count - flow.length) ? (count - flow.length) : result;
+}
+
+static ssize_t read_compound_file(struct file*, char __user*, size_t, loff_t*);
+
+/**
+ * unix-file specific ->read() method
+ * of struct file_operations.
+ */
+ssize_t read_unix_file(struct file *file, char __user *buf,
+ size_t read_amount, loff_t *off)
+{
+ reiser4_context *ctx;
+ ssize_t result;
+ struct inode *inode;
+ struct unix_file_info *uf_info;
+
+ if (unlikely(read_amount == 0))
+ return 0;
+
+ inode = file_inode(file);
+ assert("vs-972", !reiser4_inode_get_flag(inode, REISER4_NO_SD));
+
+ ctx = reiser4_init_context(inode->i_sb);
+ if (IS_ERR(ctx))
+ return PTR_ERR(ctx);
+
+ result = reserve_update_sd_common(inode);
+ if (unlikely(result != 0))
+ goto out2;
+
+ uf_info = unix_file_inode_data(inode);
+
+ if (uf_info->container == UF_CONTAINER_UNKNOWN) {
+ get_exclusive_access(uf_info);
+ result = find_file_state(inode, uf_info);
+ if (unlikely(result != 0))
+ goto out;
+ }
+ else
+ get_nonexclusive_access(uf_info);
+
+ switch (uf_info->container) {
+ case UF_CONTAINER_EXTENTS:
+ if (!reiser4_inode_get_flag(inode, REISER4_PART_MIXED)) {
+ result = new_sync_read(file, buf, read_amount, off);
+ break;
+ }
+ /* fall through */
+ case UF_CONTAINER_TAILS:
+ case UF_CONTAINER_UNKNOWN:
+ result = read_compound_file(file, buf, read_amount, off);
+ break;
+ case UF_CONTAINER_EMPTY:
+ result = 0;
+ }
+ out:
+ drop_access(uf_info);
+ out2:
+ context_set_commit_async(ctx);
+ reiser4_exit_context(ctx);
+ return result;
+}
+
+/*
+ * Read a file, which contains tails and, maybe,
+ * extents.
+ *
+ * Sometimes a file can consist of items of both types
+ * (extents and tails). This can happen, e.g. because
+ * of a failed tail conversion. Also the conversion code
+ * may release the exclusive lock before calling
+ * balance_dirty_pages().
+ *
+ * In this case applying a generic VFS library function
+ * would be suboptimal. We use our own "light-weight"
+ * version below.
+ */
+static ssize_t read_compound_file(struct file *file, char __user *buf,
+ size_t count, loff_t *off)
+{
+ ssize_t result = 0;
+ struct inode *inode;
+ hint_t *hint;
+ struct unix_file_info *uf_info;
+ size_t to_read;
+ size_t was_read = 0;
+ loff_t i_size;
+
+ inode = file_inode(file);
+ assert("vs-972", !reiser4_inode_get_flag(inode, REISER4_NO_SD));
+
+ i_size = i_size_read(inode);
+ if (*off >= i_size)
+ /* position to read from is past the end of file */
+ goto exit;
+ if (*off + count > i_size)
+ count = i_size - *off;
+
+ hint = kmalloc(sizeof(*hint), reiser4_ctx_gfp_mask_get());
+ if (hint == NULL)
+ return RETERR(-ENOMEM);
+
+ result = load_file_hint(file, hint);
+ if (result) {
+ kfree(hint);
+ return result;
+ }
+ uf_info = unix_file_inode_data(inode);
+
+ /* read by page-aligned chunks */
+ to_read = PAGE_SIZE - (*off & (loff_t)(PAGE_SIZE - 1));
+ if (to_read > count)
+ to_read = count;
+ while (count > 0) {
+ reiser4_txn_restart_current();
+ /*
+ * faultin user page
+ */
+ result = fault_in_pages_writeable(buf, to_read);
+ if (result)
+ return RETERR(-EFAULT);
+
+ result = do_read_compound_file(hint, file, buf, to_read, off);
+ if (result < 0)
+ break;
+ count -= result;
+ buf += result;
+
+ /* update position in a file */
+ *off += result;
+ /* total number of read bytes */
+ was_read += result;
+ to_read = count;
+ if (to_read > PAGE_SIZE)
+ to_read = PAGE_SIZE;
+ }
+ done_lh(&hint->lh);
+ save_file_hint(file, hint);
+ kfree(hint);
+ if (was_read)
+ file_accessed(file);
+ exit:
+ return was_read ? was_read : result;
+}
+
+/* This function takes care of @file's pages. First of all it checks whether
+   the filesystem is readonly and if so bails out. Otherwise, it throws out
+   all pages of the file if it was mapped for read, is going to be mapped for
+   write, and consists of tails. This is done in order not to keep several
+   copies of the data (one in the page cache and another in the tails
+   themselves) when mapping files that consist of tails.
+
+   Tail-to-extent conversion is also performed here if it is allowed and the
+   file is going to be written or mapped for write. This function may be
+   called from write_unix_file() or mmap_unix_file(). */
+static int check_pages_unix_file(struct file *file, struct inode *inode)
+{
+ reiser4_invalidate_pages(inode->i_mapping, 0,
+ (inode->i_size + PAGE_SIZE -
+ 1) >> PAGE_SHIFT, 0);
+ return unpack(file, inode, 0 /* not forever */ );
+}
+
+/**
+ * mmap_unix_file - mmap of struct file_operations
+ * @file: file to mmap
+ * @vma:
+ *
+ * This is the implementation of the VFS mmap method of struct file_operations
+ * for the unix file plugin. It converts the file to extents if necessary and
+ * sets the reiser4_inode flag REISER4_HAS_MMAP.
+ */
+int mmap_unix_file(struct file *file, struct vm_area_struct *vma)
+{
+ reiser4_context *ctx;
+ int result;
+ struct inode *inode;
+ struct unix_file_info *uf_info;
+
+ inode = file_inode(file);
+ ctx = reiser4_init_context(inode->i_sb);
+ if (IS_ERR(ctx))
+ return PTR_ERR(ctx);
+
+ uf_info = unix_file_inode_data(inode);
+
+ get_exclusive_access_careful(uf_info, inode);
+
+ if (!IS_RDONLY(inode) && (vma->vm_flags & (VM_MAYWRITE | VM_SHARED))) {
+ /*
+		 * we need the file to be built of extent items. If it is still
+		 * built of tail items we have to convert it. Find out what
+		 * items the file is built of
+ */
+ result = find_file_state(inode, uf_info);
+ if (result != 0) {
+ drop_exclusive_access(uf_info);
+ reiser4_exit_context(ctx);
+ return result;
+ }
+
+ assert("vs-1648", (uf_info->container == UF_CONTAINER_TAILS ||
+ uf_info->container == UF_CONTAINER_EXTENTS ||
+ uf_info->container == UF_CONTAINER_EMPTY));
+ if (uf_info->container == UF_CONTAINER_TAILS) {
+ /*
+ * invalidate all pages and convert file from tails to
+ * extents
+ */
+ result = check_pages_unix_file(file, inode);
+ if (result) {
+ drop_exclusive_access(uf_info);
+ reiser4_exit_context(ctx);
+ return result;
+ }
+ }
+ }
+ /*
+ * generic_file_mmap will do update_atime.
+ * Grab space for stat data update.
+ */
+ result = reserve_update_sd_common(inode);
+ if (result) {
+ drop_exclusive_access(uf_info);
+ reiser4_exit_context(ctx);
+ return result;
+ }
+ result = generic_file_mmap(file, vma);
+ if (result == 0) {
+ /* mark file as having mapping. */
+ reiser4_inode_set_flag(inode, REISER4_HAS_MMAP);
+ }
+
+ drop_exclusive_access(uf_info);
+ reiser4_exit_context(ctx);
+ return result;
+}
+
+/**
+ * find_first_item
+ * @inode:
+ *
+ * Finds the file item which is responsible for the first byte of the file.
+ */
+static int find_first_item(struct inode *inode)
+{
+ coord_t coord;
+ lock_handle lh;
+ reiser4_key key;
+ int result;
+
+ coord_init_zero(&coord);
+ init_lh(&lh);
+ inode_file_plugin(inode)->build_body_key(inode, 0, &key);
+ result = find_file_item_nohint(&coord, &lh, &key, ZNODE_READ_LOCK,
+ inode);
+ if (result == CBK_COORD_FOUND) {
+ if (coord.between == AT_UNIT) {
+ result = zload(coord.node);
+ if (result == 0) {
+ result = item_id_by_coord(&coord);
+ zrelse(coord.node);
+ if (result != EXTENT40_POINTER_ID &&
+ result != FORMATTING_ID)
+ result = RETERR(-EIO);
+ }
+ } else
+ result = RETERR(-EIO);
+ }
+ done_lh(&lh);
+ return result;
+}
+
+/**
+ * open_unix_file
+ * @inode:
+ * @file:
+ *
+ * If the filesystem is not readonly, complete an unfinished tail conversion
+ * if there was one.
+ */
+int open_unix_file(struct inode *inode, struct file *file)
+{
+ int result;
+ reiser4_context *ctx;
+ struct unix_file_info *uf_info;
+
+ if (IS_RDONLY(inode))
+ return 0;
+
+ if (!reiser4_inode_get_flag(inode, REISER4_PART_MIXED))
+ return 0;
+
+ ctx = reiser4_init_context(inode->i_sb);
+ if (IS_ERR(ctx))
+ return PTR_ERR(ctx);
+
+ uf_info = unix_file_inode_data(inode);
+
+ get_exclusive_access_careful(uf_info, inode);
+
+ if (!reiser4_inode_get_flag(inode, REISER4_PART_MIXED)) {
+ /*
+ * other process completed the conversion
+ */
+ drop_exclusive_access(uf_info);
+ reiser4_exit_context(ctx);
+ return 0;
+ }
+
+ /*
+	 * The file was left in a semi-converted state after an unclean
+	 * shutdown, or another thread is doing the conversion and dropped
+	 * exclusive access while calling balance_dirty_pages(). Complete
+	 * the conversion.
+ */
+ result = find_first_item(inode);
+ if (result == EXTENT40_POINTER_ID)
+ /*
+ * first item is extent, therefore there was incomplete
+ * tail2extent conversion. Complete it
+ */
+ result = tail2extent(unix_file_inode_data(inode));
+ else if (result == FORMATTING_ID)
+ /*
+ * first item is formatting item, therefore there was
+ * incomplete extent2tail conversion. Complete it
+ */
+ result = extent2tail(file, unix_file_inode_data(inode));
+ else
+ result = -EIO;
+
+ assert("vs-1712",
+ ergo(result == 0,
+ (!reiser4_inode_get_flag(inode, REISER4_PART_MIXED) &&
+ !reiser4_inode_get_flag(inode, REISER4_PART_IN_CONV))));
+ drop_exclusive_access(uf_info);
+ reiser4_exit_context(ctx);
+ return result;
+}
+
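+/*
+ * Access states used by write_unix_file() below: NEITHER_OBTAINED means that
+ * no access to the file body is held, EA_OBTAINED that exclusive access is
+ * held, NEA_OBTAINED that non-exclusive access is held.
+ */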
+#define NEITHER_OBTAINED 0
+#define EA_OBTAINED 1
+#define NEA_OBTAINED 2
+
+static void drop_access(struct unix_file_info *uf_info)
+{
+ if (uf_info->exclusive_use)
+ drop_exclusive_access(uf_info);
+ else
+ drop_nonexclusive_access(uf_info);
+}
+
+#define debug_wuf(format, ...) printk("%s: %d: %s: " format "\n", \
+ __FILE__, __LINE__, __FUNCTION__, ## __VA_ARGS__)
+
+/**
+ * write_unix_file - private ->write() method of unix_file plugin.
+ *
+ * @file: file to write to
+ * @buf: address of user-space buffer
+ * @count: number of bytes to write
+ * @pos: position in file to write to
+ * @cont: unused argument, as we don't perform plugin conversion when being
+ * managed by unix_file plugin.
+ */
+ssize_t write_unix_file(struct file *file,
+ const char __user *buf,
+ size_t count, loff_t *pos,
+ struct dispatch_context *cont)
+{
+ int result;
+ reiser4_context *ctx;
+ struct inode *inode;
+ struct unix_file_info *uf_info;
+ ssize_t written;
+ int to_write = PAGE_SIZE * DEFAULT_WRITE_GRANULARITY;
+ size_t left;
+ ssize_t (*write_op)(struct file *, struct inode *,
+ const char __user *, size_t,
+ loff_t *pos);
+ int ea;
+ int enospc = 0; /* item plugin ->write() returned ENOSPC */
+ loff_t new_size;
+
+ ctx = get_current_context();
+ inode = file_inode(file);
+
+ assert("vs-947", !reiser4_inode_get_flag(inode, REISER4_NO_SD));
+ assert("vs-9471", (!reiser4_inode_get_flag(inode, REISER4_PART_MIXED)));
+
+ result = file_remove_privs(file);
+ if (result) {
+ context_set_commit_async(ctx);
+ return result;
+ }
+ /* remove_suid might create a transaction */
+ reiser4_txn_restart(ctx);
+
+ uf_info = unix_file_inode_data(inode);
+
+ written = 0;
+ left = count;
+ ea = NEITHER_OBTAINED;
+ enospc = 0;
+
+ new_size = i_size_read(inode);
+ if (*pos + count > new_size)
+ new_size = *pos + count;
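+	/*
+	 * new_size is the size this write will produce; it is used below by
+	 * should_have_notail() to choose between the extent and tail item
+	 * write methods.
+	 */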
+
+ while (left) {
+ int update_sd = 0;
+ if (left < to_write)
+ to_write = left;
+
+ if (uf_info->container == UF_CONTAINER_EMPTY) {
+ get_exclusive_access(uf_info);
+ ea = EA_OBTAINED;
+ if (uf_info->container != UF_CONTAINER_EMPTY) {
+ /* file is made not empty by another process */
+ drop_exclusive_access(uf_info);
+ ea = NEITHER_OBTAINED;
+ continue;
+ }
+ } else if (uf_info->container == UF_CONTAINER_UNKNOWN) {
+ /*
+			 * get exclusive access directly just so we don't have
+			 * to re-obtain it if the file turns out to be empty
+ */
+ get_exclusive_access(uf_info);
+ ea = EA_OBTAINED;
+ result = find_file_state(inode, uf_info);
+ if (result) {
+ drop_exclusive_access(uf_info);
+ ea = NEITHER_OBTAINED;
+ break;
+ }
+ } else {
+ get_nonexclusive_access(uf_info);
+ ea = NEA_OBTAINED;
+ }
+
+ /* either EA or NEA is obtained. Choose item write method */
+ if (uf_info->container == UF_CONTAINER_EXTENTS) {
+ /* file is built of extent items */
+ write_op = write_extent_unix_file;
+ } else if (uf_info->container == UF_CONTAINER_EMPTY) {
+ /* file is empty */
+ if (should_have_notail(uf_info, new_size))
+ write_op = write_extent_unix_file;
+ else
+ write_op = write_tail_unix_file;
+ } else {
+ /* file is built of tail items */
+ if (should_have_notail(uf_info, new_size)) {
+ if (ea == NEA_OBTAINED) {
+ drop_nonexclusive_access(uf_info);
+ get_exclusive_access(uf_info);
+ ea = EA_OBTAINED;
+ }
+ if (uf_info->container == UF_CONTAINER_TAILS) {
+ /*
+					 * if file is being converted by another
+ * process - wait until it completes
+ */
+ while (1) {
+ if (reiser4_inode_get_flag(inode,
+ REISER4_PART_IN_CONV)) {
+ drop_exclusive_access(uf_info);
+ schedule();
+ get_exclusive_access(uf_info);
+ continue;
+ }
+ break;
+ }
+ if (uf_info->container == UF_CONTAINER_TAILS) {
+ result = tail2extent(uf_info);
+ if (result) {
+ drop_exclusive_access(uf_info);
+ context_set_commit_async(ctx);
+ break;
+ }
+ }
+ }
+ drop_exclusive_access(uf_info);
+ ea = NEITHER_OBTAINED;
+ continue;
+ }
+ write_op = write_tail_unix_file;
+ }
+
+ written = write_op(file, inode, buf, to_write, pos);
+ if (written == -ENOSPC && !enospc) {
+ drop_access(uf_info);
+ txnmgr_force_commit_all(inode->i_sb, 0);
+ enospc = 1;
+ continue;
+ }
+ if (written < 0) {
+ /*
+ * If this is -ENOSPC, then it happened
+ * second time, so don't try to free space
+ * once again.
+ */
+ drop_access(uf_info);
+ result = written;
+ break;
+ }
+ /* something is written. */
+ if (enospc)
+ enospc = 0;
+ if (uf_info->container == UF_CONTAINER_EMPTY) {
+ assert("edward-1553", ea == EA_OBTAINED);
+ uf_info->container =
+ (write_op == write_extent_unix_file) ?
+ UF_CONTAINER_EXTENTS : UF_CONTAINER_TAILS;
+ }
+ assert("edward-1554",
+ ergo(uf_info->container == UF_CONTAINER_EXTENTS,
+ write_op == write_extent_unix_file));
+ assert("edward-1555",
+ ergo(uf_info->container == UF_CONTAINER_TAILS,
+ write_op == write_tail_unix_file));
+ if (*pos + written > inode->i_size) {
+ INODE_SET_FIELD(inode, i_size, *pos + written);
+ update_sd = 1;
+ }
+ if (!IS_NOCMTIME(inode)) {
+ inode->i_ctime = inode->i_mtime = current_time(inode);
+ update_sd = 1;
+ }
+ if (update_sd) {
+ /*
+ * space for update_sd was reserved in write_op
+ */
+ result = reiser4_update_sd(inode);
+ if (result) {
+ warning("edward-1574",
+ "Can not update stat-data: %i. FSCK?",
+ result);
+ drop_access(uf_info);
+ context_set_commit_async(ctx);
+ break;
+ }
+ }
+ drop_access(uf_info);
+ ea = NEITHER_OBTAINED;
+
+ /*
+ * tell VM how many pages were dirtied. Maybe number of pages
+ * which were dirty already should not be counted
+ */
+ reiser4_throttle_write(inode);
+ left -= written;
+ buf += written;
+ *pos += written;
+ }
+ if (result == 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
+ reiser4_txn_restart_current();
+ grab_space_enable();
+ result = reiser4_sync_file_common(file, 0, LONG_MAX,
+ 0 /* data and stat data */);
+ if (result)
+ warning("reiser4-7", "failed to sync file %llu",
+ (unsigned long long)get_inode_oid(inode));
+ }
+ /*
+	 * return the number of written bytes or an error code if nothing
+	 * was written. Note that this does not work correctly when the
+	 * sync above returns an error
+ */
+ return (count - left) ? (count - left) : result;
+}
+
+/**
+ * release_unix_file - release of struct file_operations
+ * @inode: inode of released file
+ * @file: file to release
+ *
+ * Implementation of release method of struct file_operations for unix file
+ * plugin. If the last reference to the inode is released, convert all extent
+ * items into tail items if necessary. Frees reiser4-specific file data.
+ */
+int release_unix_file(struct inode *inode, struct file *file)
+{
+ reiser4_context *ctx;
+ struct unix_file_info *uf_info;
+ int result;
+ int in_reiser4;
+
+ in_reiser4 = is_in_reiser4_context();
+
+ ctx = reiser4_init_context(inode->i_sb);
+ if (IS_ERR(ctx))
+ return PTR_ERR(ctx);
+
+ result = 0;
+ if (in_reiser4 == 0) {
+ uf_info = unix_file_inode_data(inode);
+
+ get_exclusive_access_careful(uf_info, inode);
+ if (file->f_path.dentry->d_lockref.count == 1 &&
+ uf_info->container == UF_CONTAINER_EXTENTS &&
+ !should_have_notail(uf_info, inode->i_size) &&
+ !IS_RDONLY(inode)) {
+ result = extent2tail(file, uf_info);
+ if (result != 0) {
+ context_set_commit_async(ctx);
+ warning("nikita-3233",
+ "Failed (%d) to convert in %s (%llu)",
+ result, __FUNCTION__,
+ (unsigned long long)
+ get_inode_oid(inode));
+ }
+ }
+ drop_exclusive_access(uf_info);
+ } else {
+ /*
+		   we are within a reiser4 context already. How is the latter
+		   possible? Simple:
+
+ (gdb) bt
+ #0 get_exclusive_access ()
+ #2 0xc01e56d3 in release_unix_file ()
+ #3 0xc01c3643 in reiser4_release ()
+ #4 0xc014cae0 in __fput ()
+ #5 0xc013ffc3 in remove_vm_struct ()
+ #6 0xc0141786 in exit_mmap ()
+ #7 0xc0118480 in mmput ()
+ #8 0xc0133205 in oom_kill ()
+ #9 0xc01332d1 in out_of_memory ()
+ #10 0xc013bc1d in try_to_free_pages ()
+ #11 0xc013427b in __alloc_pages ()
+ #12 0xc013f058 in do_anonymous_page ()
+ #13 0xc013f19d in do_no_page ()
+ #14 0xc013f60e in handle_mm_fault ()
+ #15 0xc01131e5 in do_page_fault ()
+ #16 0xc0104935 in error_code ()
+ #17 0xc025c0c6 in __copy_to_user_ll ()
+ #18 0xc01d496f in reiser4_read_tail ()
+ #19 0xc01e4def in read_unix_file ()
+ #20 0xc01c3504 in reiser4_read ()
+ #21 0xc014bd4f in vfs_read ()
+ #22 0xc014bf66 in sys_read ()
+ */
+ warning("vs-44", "out of memory?");
+ }
+
+ reiser4_free_file_fsdata(file);
+
+ reiser4_exit_context(ctx);
+ return result;
+}
+
+static void set_file_notail(struct inode *inode)
+{
+ reiser4_inode *state;
+ formatting_plugin *tplug;
+
+ state = reiser4_inode_data(inode);
+ tplug = formatting_plugin_by_id(NEVER_TAILS_FORMATTING_ID);
+ force_plugin_pset(inode, PSET_FORMATTING, (reiser4_plugin *)tplug);
+}
+
+/* if file is built of tails - convert it to extents */
+static int unpack(struct file *filp, struct inode *inode, int forever)
+{
+ int result = 0;
+ struct unix_file_info *uf_info;
+
+ uf_info = unix_file_inode_data(inode);
+ assert("vs-1628", ea_obtained(uf_info));
+
+ result = find_file_state(inode, uf_info);
+ if (result)
+ return result;
+ assert("vs-1074", uf_info->container != UF_CONTAINER_UNKNOWN);
+
+ if (uf_info->container == UF_CONTAINER_TAILS) {
+ /*
+		 * if file is being converted by another process - wait until it
+ * completes
+ */
+ while (1) {
+ if (reiser4_inode_get_flag(inode,
+ REISER4_PART_IN_CONV)) {
+ drop_exclusive_access(uf_info);
+ schedule();
+ get_exclusive_access(uf_info);
+ continue;
+ }
+ break;
+ }
+ if (uf_info->container == UF_CONTAINER_TAILS) {
+ result = tail2extent(uf_info);
+ if (result)
+ return result;
+ }
+ }
+ if (forever) {
+		/* save new formatting plugin in stat data */
+ __u64 tograb;
+
+ set_file_notail(inode);
+
+ grab_space_enable();
+ tograb = inode_file_plugin(inode)->estimate.update(inode);
+ result = reiser4_grab_space(tograb,
+ BA_CAN_COMMIT, get_meta_subvol());
+ if (result) {
+ warning("edward-1781",
+ "Can not update sd (%d)", result);
+ return result;
+ }
+ result = reiser4_update_sd(inode);
+ }
+
+ return result;
+}
+
+/* implementation of the VFS ioctl method of struct file_operations for the
+   unix file plugin
+*/
+int ioctl_unix_file(struct file *filp, unsigned int cmd,
+ unsigned long arg UNUSED_ARG)
+{
+ reiser4_context *ctx;
+ int result;
+ struct inode *inode = filp->f_path.dentry->d_inode;
+
+ ctx = reiser4_init_context(inode->i_sb);
+ if (IS_ERR(ctx))
+ return PTR_ERR(ctx);
+
+ switch (cmd) {
+ case REISER4_IOC_UNPACK:
+ get_exclusive_access(unix_file_inode_data(inode));
+ result = unpack(filp, inode, 1 /* forever */ );
+ drop_exclusive_access(unix_file_inode_data(inode));
+ break;
+
+ default:
+ result = RETERR(-ENOTTY);
+ break;
+ }
+ reiser4_exit_context(ctx);
+ return result;
+}
+
+/* implementation of the VFS bmap method of struct address_space_operations
+   for the unix file plugin
+*/
+sector_t bmap_unix_file(struct address_space * mapping, sector_t lblock)
+{
+ reiser4_context *ctx;
+ sector_t result;
+ reiser4_key key;
+ coord_t coord;
+ lock_handle lh;
+ struct inode *inode;
+ item_plugin *iplug;
+ sector_t block;
+
+ inode = mapping->host;
+
+ ctx = reiser4_init_context(inode->i_sb);
+ if (IS_ERR(ctx))
+ return PTR_ERR(ctx);
+ build_body_key_unix_file(inode,
+ (loff_t) lblock * current_blocksize,
+ &key);
+ init_lh(&lh);
+ result =
+ find_file_item_nohint(&coord, &lh, &key, ZNODE_READ_LOCK, inode);
+ if (cbk_errored(result)) {
+ done_lh(&lh);
+ reiser4_exit_context(ctx);
+ return result;
+ }
+
+ result = zload(coord.node);
+ if (result) {
+ done_lh(&lh);
+ reiser4_exit_context(ctx);
+ return result;
+ }
+
+ iplug = item_plugin_by_coord(&coord);
+ if (iplug->s.file.get_block) {
+ result = iplug->s.file.get_block(&coord, lblock, &block);
+ if (result == 0)
+ result = block;
+ } else
+ result = RETERR(-EINVAL);
+
+ zrelse(coord.node);
+ done_lh(&lh);
+ reiser4_exit_context(ctx);
+ return result;
+}
+
+int build_body_key_unix_file(struct inode *inode, loff_t off, reiser4_key *key)
+{
+ build_body_key_common(inode, key);
+ set_key_ordering(key, get_inode_ordering(inode));
+ set_key_offset(key, (__u64) off);
+ return 0;
+}
+
+/**
+ * Construct flow into @flow according to user-supplied data.
+ * This is used by read/write methods to construct a flow to read/write.
+ *
+ * NIKITA-FIXME-HANS: please create statistics on what functions are
+ * dereferenced how often for the mongo benchmark. You can supervise
+ * Elena doing this for you if that helps. Email me the list of the
+ * top 10, with their counts, and an estimate of the total number of
+ * CPU cycles spent dereferencing as a percentage of CPU cycles spent
+ * processing (non-idle processing). If the total percent is, say,
+ * less than 1%, it will make our coding discussions much easier, and
+ * keep me from questioning whether functions like the below are too
+ * frequently called to be dereferenced. If the total percent is more
+ * than 1%, perhaps private methods should be listed in a "required"
+ * comment at the top of each plugin (with stern language about how if
+ * the comment is missing it will not be accepted by the maintainer),
+ * and implemented using macros not dereferenced functions. How about
+ * replacing this whole private methods part of the struct with a
+ * thorough documentation of what the standard helper functions are for
+ * use in constructing plugins? I think users have been asking for
+ * that, though not in so many words.
+ *
+ * flow_by_inode_unix_file - initialize structure flow
+ * @inode: inode of file for which read or write is about to be performed
+ * @buf: buffer to perform read to or write from
+ * @user: flag showing whether @buf is user space or kernel space
+ * @size: size of buffer @buf
+ * @off: start offset for read or write
+ * @op: READ or WRITE
+ * @flow:
+ *
+ * Initializes fields of @flow: key, size of data, i/o mode (read or write).
+ */
+int flow_by_inode_unix_file(struct inode *inode,
+ const char __user *buf, int user,
+ loff_t size, loff_t off,
+ rw_op op, flow_t *flow)
+{
+ assert("nikita-1100", inode != NULL);
+
+ flow->length = size;
+ memcpy(&flow->data, &buf, sizeof(buf));
+ flow->user = user;
+ flow->op = op;
+ assert("nikita-1931", inode_file_plugin(inode) != NULL);
+ /*
+ * calculate key of write position and insert it into flow->key
+ */
+ return build_body_key_unix_file(inode, off, &flow->key);
+}
+
+/* plugin->u.file.set_plug_in_sd = NULL
+ plugin->u.file.set_plug_in_inode = NULL
+ plugin->u.file.create_blank_sd = NULL */
+/* plugin->u.file.delete */
+/*
+ plugin->u.file.add_link = reiser4_add_link_common
+ plugin->u.file.rem_link = NULL */
+
+/* plugin->u.file.owns_item
+ this is common_file_owns_item with assertion */
+/* Audited by: green(2002.06.15) */
+int
+owns_item_unix_file(const struct inode *inode /* object to check against */ ,
+ const coord_t * coord /* coord to check */ )
+{
+ int result;
+
+ result = owns_item_common(inode, coord);
+ if (!result)
+ return 0;
+ if (!plugin_of_group(item_plugin_by_coord(coord),
+ FILE_BODY_ITEM_TYPE))
+ return 0;
+ assert("vs-547",
+ item_is_extent(coord) ||
+ item_id_by_coord(coord) == FORMATTING_ID);
+ return 1;
+}
+
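+/*
+ * Truncate the file body to @attr->ia_size under the protection of a
+ * safe-link: a SAFE_TRUNCATE link is inserted before the truncate and
+ * removed after it, presumably so that an interrupted truncate can be
+ * completed later (see the safe-link machinery).
+ */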
+static int setattr_truncate(struct inode *inode, struct iattr *attr,
+ int (*truncate_file_body_fn)(struct inode *,
+ struct iattr *))
+{
+ int result;
+ int s_result;
+ loff_t old_size;
+ struct super_block *super = reiser4_get_current_sb();
+ reiser4_subvol *subv = get_meta_subvol();
+
+ inode_check_scale(inode, inode->i_size, attr->ia_size);
+
+ old_size = inode->i_size;
+
+ result = safe_link_grab(super, BA_CAN_COMMIT, subv);
+ if (result == 0)
+ result = safe_link_add(inode, SAFE_TRUNCATE);
+ if (result == 0)
+ result = truncate_file_body_fn(inode, attr);
+ if (result)
+ warning("vs-1588", "truncate_file failed: oid %lli, "
+ "old size %lld, new size %lld, retval %d",
+ (unsigned long long)get_inode_oid(inode),
+ old_size, attr->ia_size, result);
+
+ s_result = safe_link_grab(super, BA_CAN_COMMIT, subv);
+ if (s_result == 0)
+ s_result = safe_link_del(subv,
+ get_inode_oid(inode),
+ SAFE_TRUNCATE);
+ if (s_result != 0) {
+ warning("nikita-3417", "Cannot kill safelink %lli: %i",
+ (unsigned long long)get_inode_oid(inode), s_result);
+ }
+ safe_link_release(super);
+ return result;
+}
+
+/**
+ * reiser4_setattr_generic - change attributes of an object
+ * @dentry: object to change attributes of;
+ * @attr: change description;
+ * @truncate_file_body_fn: method of truncating the file body
+ */
+int reiser4_setattr_generic(struct dentry *dentry, struct iattr *attr,
+ int (*truncate_file_body_fn)(struct inode *,
+ struct iattr *))
+{
+ int result;
+
+ if (attr->ia_valid & ATTR_SIZE) {
+ reiser4_context *ctx;
+ struct unix_file_info *uf_info;
+ /*
+ * truncate does reservation itself and
+ * requires exclusive access obtained
+ */
+ ctx = reiser4_init_context(dentry->d_inode->i_sb);
+ if (IS_ERR(ctx))
+ return PTR_ERR(ctx);
+ uf_info = unix_file_inode_data(dentry->d_inode);
+ get_exclusive_access_careful(uf_info, dentry->d_inode);
+ result = setattr_truncate(dentry->d_inode,
+ attr, truncate_file_body_fn);
+ drop_exclusive_access(uf_info);
+ context_set_commit_async(ctx);
+ reiser4_exit_context(ctx);
+ } else
+ result = reiser4_setattr_common(dentry, attr);
+
+ return result;
+}
+
+int setattr_unix_file(struct dentry *dentry, struct iattr *attr)
+{
+ return reiser4_setattr_generic(dentry, attr, truncate_body_unix_file);
+}
+
+void init_inode_data_unix_file(struct inode *inode,
+ reiser4_object_create_data *crd,
+ const reiser4_key *sd_key, int create)
+{
+ struct unix_file_info *data;
+
+ data = unix_file_inode_data(inode);
+ data->container = create ? UF_CONTAINER_EMPTY : UF_CONTAINER_UNKNOWN;
+ init_rwsem(&data->latch);
+ data->tplug = inode_formatting_plugin(inode);
+ data->exclusive_use = 0;
+#if REISER4_DEBUG
+ data->ea_owner = NULL;
+ atomic_set(&data->nr_neas, 0);
+#endif
+ init_inode_ordering(inode, crd, sd_key, create);
+}
+
+/**
+ * delete_object_unix_file - delete_object method of file_plugin
+ * @inode: inode to be deleted
+ *
+ * Truncates file to length 0, removes stat data and safe link.
+ */
+int delete_object_unix_file(struct inode *inode)
+{
+ struct unix_file_info *uf_info;
+ int result;
+
+ if (reiser4_inode_get_flag(inode, REISER4_NO_SD))
+ return 0;
+
+ /* truncate file body first */
+ uf_info = unix_file_inode_data(inode);
+ get_exclusive_access(uf_info);
+ result = shorten_file(inode, 0 /* size */ );
+ drop_exclusive_access(uf_info);
+
+ if (result)
+ warning("edward-1556",
+ "failed to truncate file (%llu) on removal: %d",
+			(unsigned long long)get_inode_oid(inode), result);
+
+ /* remove stat data and safe link */
+ return reiser4_delete_object_common(inode);
+}
+
+int reiser4_write_begin_common(struct file *file, struct page *page,
+ loff_t pos, unsigned len,
+ int(*readpage_fn)(struct file *, struct page *))
+{
+ int ret;
+ if (len == PAGE_SIZE || PageUptodate(page))
+ return 0;
+
+ ret = readpage_fn(file, page);
+ if (ret) {
+ SetPageError(page);
+ ClearPageUptodate(page);
+ /* All reiser4 readpage() implementations should return the
+ * page locked in case of error. */
+ assert("nikita-3472", PageLocked(page));
+ return ret;
+ }
+ /*
+ * ->readpage() either:
+ *
+ * 1. starts IO against @page. @page is locked for IO in
+ * this case.
+ *
+ * 2. doesn't start IO. @page is unlocked.
+ *
+ * In either case, page should be locked.
+ */
+ lock_page(page);
+ /*
+ * IO (if any) is completed at this point. Check for IO
+ * errors.
+ */
+ if (!PageUptodate(page))
+ return RETERR(-EIO);
+ return ret;
+}
+
+/**
+ * Estimate and reserve space needed for write_end_unix_file():
+ * one block for the page itself, and one item insertion which may
+ * happen if the page corresponds to a hole extent and an unallocated
+ * one will have to be created.
+ */
+static int reserve_write_begin(void)
+{
+ grab_space_enable();
+ return reiser4_grab_space(1 +
+ estimate_one_insert_into_item(meta_subvol_tree()),
+ BA_CAN_COMMIT, get_meta_subvol());
+}
+
+/**
+ * implementation of ->write_begin() address space operation
+ * for unix-file plugin
+ */
+int write_begin_unix_file(struct file *file, struct page *page,
+ loff_t pos, unsigned len, void **fsdata)
+{
+ int ret;
+ struct inode * inode;
+ struct unix_file_info *info;
+
+ inode = file_inode(file);
+ info = unix_file_inode_data(inode);
+
+ ret = reserve_write_begin();
+ if (ret)
+ return ret;
+ get_exclusive_access(info);
+ ret = find_file_state(file_inode(file), info);
+ if (unlikely(ret != 0)) {
+ drop_exclusive_access(info);
+ return ret;
+ }
+ if (info->container == UF_CONTAINER_TAILS) {
+ ret = tail2extent(info);
+ if (ret) {
+ warning("edward-1575",
+ "tail conversion failed: %d", ret);
+ drop_exclusive_access(info);
+ return ret;
+ }
+ }
+ ret = reiser4_write_begin_common(file, page, pos, len,
+ readpage_unix_file);
+ if (unlikely(ret != 0))
+ drop_exclusive_access(info);
+ /* else exclusive access will be dropped in ->write_end() */
+ return ret;
+}
+
+/**
+ * ->write_end() address space operation for unix-files
+ */
+int write_end_unix_file(struct file *file, struct page *page,
+ loff_t pos, unsigned copied, void *fsdata)
+{
+ int ret;
+ struct inode *inode;
+ struct unix_file_info *info;
+
+ inode = file_inode(file);
+ info = unix_file_inode_data(inode);
+
+ unlock_page(page);
+ ret = find_or_create_extent_unix_file(page);
+ if (ret) {
+ SetPageError(page);
+ goto exit;
+ }
+ if (pos + copied > inode->i_size) {
+ INODE_SET_FIELD(inode, i_size, pos + copied);
+ ret = reiser4_update_sd(inode);
+ if (unlikely(ret != 0))
+ warning("edward-1604",
+ "Can not update stat-data: %i. FSCK?",
+ ret);
+ }
+ exit:
+ drop_exclusive_access(unix_file_inode_data(file_inode(file)));
+ return ret;
+}
+
+/*
+ * Local variables:
+ * c-indentation-style: "K&R"
+ * mode-name: "LC"
+ * c-basic-offset: 8
+ * tab-width: 8
+ * fill-column: 79
+ * scroll-step: 1
+ * End:
+ */
diff -urN --no-dereference linux-5.10.2.orig/fs/reiser4/plugin/file/file_conversion.c linux-5.10.2/fs/reiser4/plugin/file/file_conversion.c
--- linux-5.10.2.orig/fs/reiser4/plugin/file/file_conversion.c 1970-01-01 01:00:00.000000000 +0100
+++ linux-5.10.2/fs/reiser4/plugin/file/file_conversion.c 2020-12-23 16:07:46.124813217 +0100
@@ -0,0 +1,762 @@
+/* Copyright 2001, 2002, 2003 by Hans Reiser,
+ licensing governed by reiser4/README */
+
+/**
+ * This file contains dispatching hooks, and conversion methods, which
+ * implement transitions in the FILE interface.
+ *
+ * A dispatching hook makes a decision (at a dispatching point) about the
+ * most reasonable plugin. Such a decision is made in accordance with some
+ * O(1)-heuristic.
+ *
+ * We implement a transition CRYPTCOMPRESS -> UNIX_FILE for files with
+ * incompressible data. The current heuristic to estimate compressibility
+ * is very simple: if the first complete logical cluster (64K by default)
+ * of a file is incompressible, then we decide that the whole file is
+ * incompressible.
+ *
+ * To enable dispatching we install a special "magic" compression mode
+ * plugin CONVX_COMPRESSION_MODE_ID at file creation time.
+ *
+ * Note that we don't perform back conversion (UNIX_FILE->CRYPTCOMPRESS)
+ * for compatibility reasons.
+ *
+ * In conversion time we protect CS, the conversion set (file's (meta)data
+ * and plugin table (pset)) via special per-inode rw-semaphore (conv_sem).
+ * The methods which implement conversion are CS writers. The methods of FS
+ * interface (file_operations, inode_operations, address_space_operations)
+ * are CS readers.
+ */
+
+#include <linux/uio.h>
+#include "../../inode.h"
+#include "../cluster.h"
+#include "file.h"
+
+#define conversion_enabled(inode) \
+ (inode_compression_mode_plugin(inode) == \
+ compression_mode_plugin_by_id(CONVX_COMPRESSION_MODE_ID))
+
+/**
+ * Protected sections (readers and writers of @pset) are not permanently
+ * critical: a cryptcompress file can be converted only if the conversion
+ * is enabled (see the macro above). Also we don't perform back
+ * conversion. The following helper macro is a sanity check to decide
+ * whether we need the protection (locks are always additional overhead).
+ */
+#define should_protect(inode) \
+ (inode_file_plugin(inode) == \
+ file_plugin_by_id(CRYPTCOMPRESS_FILE_PLUGIN_ID) && \
+ conversion_enabled(inode))
+/**
+ * To avoid confusion with read/write file operations, we'll speak about
+ * "passive" protection for FCS readers and "active" protection for FCS
+ * writers. All methods with active or passive protection have suffix
+ * "careful".
+ */
+/**
+ * Macros for passive protection.
+ *
+ * Construct invariant operation to be supplied to VFS.
+ * The macro accepts the following lexemes:
+ * @type - type of the value represented by the compound statement;
+ * @method - name of an operation to be supplied to VFS (reiser4 file
+ * plugin also should contain a method with such name).
+ */
+#define PROT_PASSIVE(type, method, args) \
+({ \
+ type _result; \
+ struct rw_semaphore * guard = \
+ &reiser4_inode_data(inode)->conv_sem; \
+ \
+ if (should_protect(inode)) { \
+ down_read(guard); \
+ if (!should_protect(inode)) \
+ up_read(guard); \
+ } \
+ _result = inode_file_plugin(inode)->method args; \
+ if (should_protect(inode)) \
+ up_read(guard); \
+ _result; \
+})
+
+#define PROT_PASSIVE_VOID(method, args) \
+({ \
+ struct rw_semaphore * guard = \
+ &reiser4_inode_data(inode)->conv_sem; \
+ \
+ if (should_protect(inode)) { \
+ down_read(guard); \
+ if (!should_protect(inode)) \
+ up_read(guard); \
+ } \
+ inode_file_plugin(inode)->method args; \
+ \
+ if (should_protect(inode)) \
+ up_read(guard); \
+})
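+
+/*
+ * Example (see the dispatchers at the end of this file):
+ * PROT_PASSIVE(int, open, (inode, file)) expands to a call of
+ * inode_file_plugin(inode)->open(inode, file) performed under conv_sem
+ * taken in read mode while conversion of this file is still possible.
+ */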
+
+/* Pass management to the unix-file plugin with "notail" policy */
+static int __cryptcompress2unixfile(struct file *file, struct inode * inode)
+{
+ int result;
+ reiser4_inode *info;
+ struct unix_file_info * uf;
+ info = reiser4_inode_data(inode);
+
+ result = aset_set_unsafe(&info->pset,
+ PSET_FILE,
+ (reiser4_plugin *)
+ file_plugin_by_id(UNIX_FILE_PLUGIN_ID));
+ if (result)
+ return result;
+ result = aset_set_unsafe(&info->pset,
+ PSET_FORMATTING,
+ (reiser4_plugin *)
+ formatting_plugin_by_id(NEVER_TAILS_FORMATTING_ID));
+ if (result)
+ return result;
+ /* get rid of non-standard plugins */
+ info->plugin_mask &= ~cryptcompress_mask;
+ /* get rid of plugin stat-data extension */
+ info->extmask &= ~(1 << PLUGIN_STAT);
+
+ reiser4_inode_clr_flag(inode, REISER4_SDLEN_KNOWN);
+
+ /* Init unix-file specific part of inode */
+ uf = unix_file_inode_data(inode);
+ uf->container = UF_CONTAINER_UNKNOWN;
+ init_rwsem(&uf->latch);
+ uf->tplug = inode_formatting_plugin(inode);
+ uf->exclusive_use = 0;
+#if REISER4_DEBUG
+ uf->ea_owner = NULL;
+ atomic_set(&uf->nr_neas, 0);
+#endif
+ /**
+	 * we were careful to keep file_ops, inode_ops and as_ops
+	 * invariant across plugin conversion, so there is no need
+	 * to update the ones already installed in the VFS.
+ */
+ return 0;
+}
+
+#if REISER4_DEBUG
+static int disabled_conversion_inode_ok(struct inode * inode)
+{
+ __u64 extmask = reiser4_inode_data(inode)->extmask;
+ __u16 plugin_mask = reiser4_inode_data(inode)->plugin_mask;
+
+ return ((extmask & (1 << LIGHT_WEIGHT_STAT)) &&
+ (extmask & (1 << UNIX_STAT)) &&
+ (extmask & (1 << LARGE_TIMES_STAT)) &&
+ (extmask & (1 << PLUGIN_STAT)) &&
+ (plugin_mask & (1 << PSET_COMPRESSION_MODE)));
+}
+#endif
+
+/**
+ * Disable future attempts to schedule/convert file plugin.
+ * This function is called by plugin schedule hooks.
+ *
+ * To disable conversion we assign any compression mode plugin id
+ * different from CONVX_COMPRESSION_MODE_ID.
+ */
+static int disable_conversion(struct inode * inode)
+{
+ int result;
+ result =
+ force_plugin_pset(inode,
+ PSET_COMPRESSION_MODE,
+ (reiser4_plugin *)compression_mode_plugin_by_id
+ (LATTD_COMPRESSION_MODE_ID));
+ assert("edward-1500",
+ ergo(!result, disabled_conversion_inode_ok(inode)));
+ return result;
+}
+
+/**
+ * Check if we really have achieved plugin scheduling point
+ */
+static int check_dispatch_point(struct inode * inode,
+ loff_t pos /* position in the
+ file to write from */,
+ struct cluster_handle * clust,
+ struct dispatch_context * cont)
+{
+ assert("edward-1505", conversion_enabled(inode));
+ /*
+	 * if file size is more than cluster size, then compressible
+ * status must be figured out (i.e. compression was disabled,
+ * or file plugin was converted to unix_file)
+ */
+ assert("edward-1506", inode->i_size <= inode_cluster_size(inode));
+
+ if (pos > inode->i_size)
+ /* first logical cluster will contain a (partial) hole */
+ return disable_conversion(inode);
+ if (pos < inode_cluster_size(inode))
+ /* writing to the first logical cluster */
+ return 0;
+ /*
+ * here we have:
+ * cluster_size <= pos <= i_size <= cluster_size,
+ * and, hence, pos == i_size == cluster_size
+ */
+ assert("edward-1498",
+ pos == inode->i_size &&
+ pos == inode_cluster_size(inode));
+ assert("edward-1539", cont != NULL);
+ assert("edward-1540", cont->state == DISPATCH_INVAL_STATE);
+
+ cont->state = DISPATCH_POINT;
+ return 0;
+}
+
+static void start_check_compressibility(struct inode * inode,
+ struct cluster_handle * clust,
+ hint_t * hint)
+{
+ assert("edward-1507", clust->index == 1);
+ assert("edward-1508", !tfm_cluster_is_uptodate(&clust->tc));
+ assert("edward-1509", cluster_get_tfm_act(&clust->tc) == TFMA_READ);
+
+ hint_init_zero(hint);
+ clust->hint = hint;
+ clust->index --;
+ clust->nr_pages = size_in_pages(lbytes(clust->index, inode));
+
+ /* first logical cluster (of index #0) must be complete */
+ assert("edward-1510", lbytes(clust->index, inode) ==
+ inode_cluster_size(inode));
+}
+
+static void finish_check_compressibility(struct inode * inode,
+ struct cluster_handle * clust,
+ hint_t * hint)
+{
+ reiser4_unset_hint(clust->hint);
+ clust->hint = hint;
+ clust->index ++;
+}
+
+#if REISER4_DEBUG
+static int prepped_dclust_ok(hint_t * hint)
+{
+ reiser4_key key;
+ coord_t * coord = &hint->ext_coord.coord;
+
+ item_key_by_coord(coord, &key);
+ return (item_id_by_coord(coord) == CTAIL_ID &&
+ !coord_is_unprepped_ctail(coord) &&
+ (get_key_offset(&key) + nr_units_ctail(coord) ==
+ dclust_get_extension_dsize(hint)));
+}
+#endif
+
+#define fifty_persent(size) (size >> 1)
+/* evaluation of data compressibility */
+#define data_is_compressible(osize, isize) \
+ (osize < fifty_persent(isize))
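+
+/*
+ * With the definitions above a logical cluster is considered compressible
+ * when its compressed size is smaller than half of its original size.
+ */
+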
+
+/**
+ * A simple O(1)-heuristic for compressibility.
+ * This is called at most once in a file's lifetime.
+ * Read the first logical cluster (of index #0) and estimate its
+ * compressibility.
+ * Save estimation result in @cont.
+ */
+static int read_check_compressibility(struct inode * inode,
+ struct cluster_handle * clust,
+ struct dispatch_context * cont)
+{
+ int i;
+ int result;
+ size_t dst_len;
+ hint_t tmp_hint;
+ hint_t * cur_hint = clust->hint;
+ assert("edward-1541", cont->state == DISPATCH_POINT);
+
+ start_check_compressibility(inode, clust, &tmp_hint);
+
+ reset_cluster_pgset(clust, cluster_nrpages(inode));
+ result = grab_page_cluster(inode, clust, READ_OP);
+ if (result)
+ return result;
+ /* Read page cluster here */
+ for (i = 0; i < clust->nr_pages; i++) {
+ struct page *page = clust->pages[i];
+ lock_page(page);
+ result = do_readpage_ctail(inode, clust, page,
+ ZNODE_READ_LOCK);
+ unlock_page(page);
+ if (result)
+ goto error;
+ }
+ tfm_cluster_clr_uptodate(&clust->tc);
+
+ cluster_set_tfm_act(&clust->tc, TFMA_WRITE);
+
+ if (hint_is_valid(&tmp_hint) && !hint_is_unprepped_dclust(&tmp_hint)) {
+		/* length of compressed data is known, no need to compress */
+ assert("edward-1511",
+ znode_is_any_locked(tmp_hint.lh.node));
+ assert("edward-1512",
+ WITH_DATA(tmp_hint.ext_coord.coord.node,
+ prepped_dclust_ok(&tmp_hint)));
+ dst_len = dclust_get_extension_dsize(&tmp_hint);
+ }
+ else {
+ struct tfm_cluster * tc = &clust->tc;
+ compression_plugin * cplug = inode_compression_plugin(inode);
+ result = grab_tfm_stream(inode, tc, INPUT_STREAM);
+ if (result)
+ goto error;
+ for (i = 0; i < clust->nr_pages; i++) {
+ char *data;
+ lock_page(clust->pages[i]);
+ BUG_ON(!PageUptodate(clust->pages[i]));
+ data = kmap(clust->pages[i]);
+ memcpy(tfm_stream_data(tc, INPUT_STREAM) + pg_to_off(i),
+ data, PAGE_SIZE);
+ kunmap(clust->pages[i]);
+ unlock_page(clust->pages[i]);
+ }
+ result = grab_tfm_stream(inode, tc, OUTPUT_STREAM);
+ if (result)
+ goto error;
+ result = grab_coa(tc, cplug);
+ if (result)
+ goto error;
+ tc->len = tc->lsize = lbytes(clust->index, inode);
+ assert("edward-1513", tc->len == inode_cluster_size(inode));
+ dst_len = tfm_stream_size(tc, OUTPUT_STREAM);
+ cplug->compress(get_coa(tc, cplug->h.id, tc->act),
+ tfm_input_data(clust), tc->len,
+ tfm_output_data(clust), &dst_len);
+ assert("edward-1514",
+ dst_len <= tfm_stream_size(tc, OUTPUT_STREAM));
+ }
+ finish_check_compressibility(inode, clust, cur_hint);
+ cont->state =
+ (data_is_compressible(dst_len, inode_cluster_size(inode)) ?
+ DISPATCH_REMAINS_OLD :
+ DISPATCH_ASSIGNED_NEW);
+ return 0;
+ error:
+ put_page_cluster(clust, inode, READ_OP);
+ return result;
+}
+
+/* Cut disk cluster of index @idx */
+static int cut_disk_cluster(struct inode * inode, cloff_t idx)
+{
+ reiser4_key from, to;
+ assert("edward-1515", inode_file_plugin(inode) ==
+ file_plugin_by_id(CRYPTCOMPRESS_FILE_PLUGIN_ID));
+ build_body_key_cryptcompress(inode, clust_to_off(idx, inode), &from);
+ to = from;
+ set_key_offset(&to,
+ get_key_offset(&from) + inode_cluster_size(inode) - 1);
+ return reiser4_cut_tree(meta_subvol_tree(), &from, &to, inode, 0);
+}
+
+static int reserve_cryptcompress2unixfile(struct inode *inode)
+{
+ int ret;
+ reiser4_block_nr num_unformatted = cluster_nrpages(inode);
+ reiser4_subvol *subv = get_meta_subvol();
+ /*
+	 * space required for one iteration of cryptcompress->unix-file conversion:
+ *
+ * 1. kill ctail items
+ *
+ * 2. insert N unformatted nodes
+ *
+ * 3. insert N (worst-case single-block
+ * extents) extent units.
+ *
+ * 4. drilling to the leaf level by coord_by_key()
+ *
+ * 5. possible update of stat-data
+ *
+ * reserve for 2
+ */
+ grab_space_enable();
+ ret = reiser4_grab_space(num_unformatted, BA_CAN_COMMIT, subv);
+ if (ret)
+ return ret;
+ /*
+ * reserve for 1,3,4,5
+ */
+ grab_space_enable();
+ return reiser4_grab_space(2 * subv->tree.height +
+ num_unformatted *
+ estimate_one_insert_into_item(&subv->tree) +
+ 1 + estimate_one_insert_item(&subv->tree) +
+ inode_file_plugin(inode)->estimate.update(inode),
+ BA_CAN_COMMIT, subv);
+}
+
+/**
+ * Convert cryptcompress file plugin to unix_file plugin.
+ */
+static int cryptcompress2unixfile(struct file *file, struct inode *inode,
+ struct dispatch_context *cont)
+{
+ int i;
+ int result = 0;
+ struct cryptcompress_info *cr_info;
+ struct unix_file_info *uf_info;
+ assert("edward-1516", cont->pages[0]->index == 0);
+
+ /* release all cryptcompress-specific resources */
+ cr_info = cryptcompress_inode_data(inode);
+ result = reserve_cryptcompress2unixfile(inode);
+ if (result)
+ goto out;
+ /* tell kill_hook to not truncate pages */
+ reiser4_inode_set_flag(inode, REISER4_FILE_IN_CONVERSION);
+ result = cut_disk_cluster(inode, 0);
+ if (result)
+ goto out;
+	/* captured jnode of cluster and associated resources (pages,
+ reserved disk space) were released by ->kill_hook() method
+ of the item plugin */
+
+ result = __cryptcompress2unixfile(file, inode);
+ if (result)
+ goto out;
+ /* At this point file is managed by unix file plugin */
+
+ uf_info = unix_file_inode_data(inode);
+
+ assert("edward-1518",
+ ergo(jprivate(cont->pages[0]),
+ !jnode_is_cluster_page(jprivate(cont->pages[0]))));
+ for(i = 0; i < cont->nr_pages; i++) {
+ assert("edward-1519", cont->pages[i]);
+ assert("edward-1520", PageUptodate(cont->pages[i]));
+
+ result = find_or_create_extent_unix_file(cont->pages[i]);
+ if (result)
+ break;
+ }
+ if (unlikely(result))
+ goto out;
+ uf_info->container = UF_CONTAINER_EXTENTS;
+ result = reiser4_update_sd(inode);
+ out:
+ all_grabbed2free();
+ return result;
+}
+
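+/*
+ * Only one plugin transition is implemented at the moment
+ * (CRYPTCOMPRESS -> UNIX_FILE, see the head of this file), so the generic
+ * name convert_file_plugin is simply an alias for it.
+ */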
+#define convert_file_plugin cryptcompress2unixfile
+
+/**
+ * This is called by ->write() method of a cryptcompress file plugin.
+ * Make a decision about the most reasonable file plugin id to manage
+ * the file.
+ */
+int write_dispatch_hook(struct file *file, struct inode *inode,
+ loff_t pos, struct cluster_handle *clust,
+ struct dispatch_context *cont)
+{
+ int result;
+ if (!conversion_enabled(inode))
+ return 0;
+ result = check_dispatch_point(inode, pos, clust, cont);
+ if (result || cont->state != DISPATCH_POINT)
+ return result;
+ result = read_check_compressibility(inode, clust, cont);
+ if (result)
+ return result;
+ if (cont->state == DISPATCH_REMAINS_OLD) {
+ put_page_cluster(clust, inode, READ_OP);
+ return disable_conversion(inode);
+ }
+ assert("edward-1543", cont->state == DISPATCH_ASSIGNED_NEW);
+ /*
+ * page cluster is grabbed and uptodate. It will be
+ * released with a pgset after plugin conversion is
+	 * finished, see done_dispatch_context().
+ */
+ reiser4_unset_hint(clust->hint);
+ move_cluster_pgset(clust, &cont->pages, &cont->nr_pages);
+ return 0;
+}
+
+/**
+ * This is called by ->setattr() method of cryptcompress file plugin.
+ */
+int setattr_dispatch_hook(struct inode * inode)
+{
+ if (conversion_enabled(inode))
+ return disable_conversion(inode);
+ return 0;
+}
+
+static inline void init_dispatch_context(struct dispatch_context * cont)
+{
+ memset(cont, 0, sizeof(*cont));
+}
+
+static inline void done_dispatch_context(struct dispatch_context * cont,
+ struct inode * inode)
+{
+ if (cont->pages) {
+ __put_page_cluster(0, cont->nr_pages, cont->pages, inode);
+ kfree(cont->pages);
+ }
+}
+
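+/*
+ * Adapt generic_write_checks(), which operates on a kiocb/iov_iter pair,
+ * to the (buf, count, off) interface used by the dispatching ->write():
+ * build a temporary single-segment iov_iter, run the checks and copy the
+ * possibly adjusted position back to *off.
+ */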
+static inline ssize_t reiser4_write_checks(struct file *file,
+ const char __user *buf,
+ size_t count, loff_t *off)
+{
+ ssize_t result;
+ struct iovec iov = { .iov_base = (void __user *)buf, .iov_len = count };
+ struct kiocb iocb;
+ struct iov_iter iter;
+
+ init_sync_kiocb(&iocb, file);
+ iocb.ki_pos = *off;
+ iov_iter_init(&iter, WRITE, &iov, 1, count);
+
+ result = generic_write_checks(&iocb, &iter);
+ *off = iocb.ki_pos;
+ return result;
+}
+
+/*
+ * ->write() VFS file operation
+ *
+ * performs "intelligent" conversion in the FILE interface.
+ * Write a file in 3 steps (the 2nd and 3rd steps are optional).
+ */
+ssize_t reiser4_write_dispatch(struct file *file, const char __user *buf,
+ size_t count, loff_t *off)
+{
+ ssize_t result;
+ reiser4_context *ctx;
+ ssize_t written_old = 0; /* bytes written with initial plugin */
+ ssize_t written_new = 0; /* bytes written with new plugin */
+ struct dispatch_context cont;
+ struct inode * inode = file_inode(file);
+
+ ctx = reiser4_init_context(inode->i_sb);
+ if (IS_ERR(ctx))
+ return PTR_ERR(ctx);
+ current->backing_dev_info = inode_to_bdi(inode);
+ init_dispatch_context(&cont);
+ inode_lock(inode);
+
+ result = reiser4_write_checks(file, buf, count, off);
+ if (unlikely(result <= 0))
+ goto exit;
+ /**
+ * First step.
+ * Start write with initial file plugin.
+ * Keep a plugin schedule status at @cont (if any).
+ */
+ written_old = inode_file_plugin(inode)->write(file,
+ buf,
+ count,
+ off,
+ &cont);
+ if (cont.state != DISPATCH_ASSIGNED_NEW || written_old < 0)
+ goto exit;
+ /**
+ * Second step.
+ * New file plugin has been scheduled.
+ * Commit respective atom and pass management to the new plugin.
+ */
+ assert("edward-181", cont.pages[0] != NULL);
+ /*
+ * this will commit the whole logical cluster
+ * the file consists of
+ */
+ reiser4_sync_page(cont.pages[0]);
+
+ down_read(&reiser4_inode_data(inode)->conv_sem);
+ result = convert_file_plugin(file, inode, &cont);
+ up_read(&reiser4_inode_data(inode)->conv_sem);
+ if (result) {
+ warning("edward-1544",
+ "Inode %llu: file plugin conversion failed (%d)",
+ (unsigned long long)get_inode_oid(inode),
+ (int)result);
+ goto exit;
+ }
+ reiser4_txn_restart(ctx);
+ /**
+ * Third step:
+ * Finish write with the new file plugin.
+ */
+ assert("edward-1536",
+ inode_file_plugin(inode) ==
+ file_plugin_by_id(UNIX_FILE_PLUGIN_ID));
+
+ written_new = inode_file_plugin(inode)->write(file,
+ buf + written_old,
+ count - written_old,
+ off,
+ NULL);
+ exit:
+ inode_unlock(inode);
+ done_dispatch_context(&cont, inode);
+ current->backing_dev_info = NULL;
+ context_set_commit_async(ctx);
+ reiser4_exit_context(ctx);
+
+ return written_old + (written_new < 0 ? 0 : written_new);
+}
+
+/*
+ * Dispatchers with "passive" protection for:
+ *
+ * ->open();
+ * ->read();
+ * ->ioctl();
+ * ->mmap();
+ * ->release();
+ * ->bmap().
+ */
+
+int reiser4_open_dispatch(struct inode *inode, struct file *file)
+{
+ return PROT_PASSIVE(int, open, (inode, file));
+}
+
+ssize_t reiser4_read_dispatch(struct file * file, char __user * buf,
+ size_t size, loff_t * off)
+{
+ struct inode * inode = file_inode(file);
+ return PROT_PASSIVE(ssize_t, read, (file, buf, size, off));
+}
+
+long reiser4_ioctl_dispatch(struct file *filp, unsigned int cmd,
+ unsigned long arg)
+{
+ struct inode * inode = file_inode(filp);
+ return PROT_PASSIVE(int, ioctl, (filp, cmd, arg));
+}
+
+int reiser4_mmap_dispatch(struct file *file, struct vm_area_struct *vma)
+{
+ struct inode *inode = file_inode(file);
+ return PROT_PASSIVE(int, mmap, (file, vma));
+}
+
+int reiser4_release_dispatch(struct inode *inode, struct file *file)
+{
+ return PROT_PASSIVE(int, release, (inode, file));
+}
+
+sector_t reiser4_bmap_dispatch(struct address_space * mapping, sector_t lblock)
+{
+ struct inode *inode = mapping->host;
+ return PROT_PASSIVE(sector_t, bmap, (mapping, lblock));
+}
+
+/**
+ * NOTE: The following two methods are
+ * used only for loopback functionality.
+ * reiser4_write_end() can not cope with
+ * short writes for now.
+ */
+int reiser4_write_begin_dispatch(struct file *file,
+ struct address_space *mapping,
+ loff_t pos,
+ unsigned len,
+ unsigned flags,
+ struct page **pagep,
+ void **fsdata)
+{
+ int ret = 0;
+ struct page *page;
+ pgoff_t index;
+ reiser4_context *ctx;
+ struct inode * inode = file_inode(file);
+
+ index = pos >> PAGE_SHIFT;
+ page = grab_cache_page_write_begin(mapping, index,
+ flags & AOP_FLAG_NOFS);
+ *pagep = page;
+ if (!page)
+ return -ENOMEM;
+
+ ctx = reiser4_init_context(file_inode(file)->i_sb);
+ if (IS_ERR(ctx)) {
+ ret = PTR_ERR(ctx);
+ goto err2;
+ }
+ /*
+ * reserve space to update stat-data:
+ * one when updating file size and one when updating mtime/ctime
+ */
+ ret = reiser4_grab_space_force(2 * estimate_update_common(inode),
+ BA_CAN_COMMIT, get_meta_subvol());
+ if (ret)
+ goto err1;
+ ret = PROT_PASSIVE(int, write_begin, (file, page, pos, len, fsdata));
+ if (unlikely(ret))
+ goto err1;
+	/* Success. Resources will be released in write_end_dispatch */
+ return 0;
+ err1:
+ reiser4_exit_context(ctx);
+ err2:
+ unlock_page(page);
+ put_page(page);
+ return ret;
+}
+
+int reiser4_write_end_dispatch(struct file *file,
+ struct address_space *mapping,
+ loff_t pos,
+ unsigned len,
+ unsigned copied,
+ struct page *page,
+ void *fsdata)
+{
+ int ret;
+ reiser4_context *ctx;
+ struct inode *inode = page->mapping->host;
+
+ assert("umka-3101", file != NULL);
+ assert("umka-3102", page != NULL);
+ assert("umka-3093", PageLocked(page));
+
+ ctx = get_current_context();
+
+ SetPageUptodate(page);
+ set_page_dirty_notag(page);
+
+ ret = PROT_PASSIVE(int, write_end, (file, page, pos, copied, fsdata));
+ put_page(page);
+
+ /* don't commit transaction under inode semaphore */
+ context_set_commit_async(ctx);
+ reiser4_exit_context(ctx);
+ return ret == 0 ? copied : ret;
+}
+
+/*
+ * Dispatchers without protection
+ */
+int reiser4_setattr_dispatch(struct dentry *dentry, struct iattr *attr)
+{
+ return inode_file_plugin(dentry->d_inode)->setattr(dentry, attr);
+}
+
+/*
+ Local variables:
+ c-indentation-style: "K&R"
+ mode-name: "LC"
+ c-basic-offset: 8
+ tab-width: 8
+ fill-column: 80
+ scroll-step: 1
+ End:
+*/
diff -urN --no-dereference linux-5.10.2.orig/fs/reiser4/plugin/file/file.h linux-5.10.2/fs/reiser4/plugin/file/file.h
--- linux-5.10.2.orig/fs/reiser4/plugin/file/file.h 1970-01-01 01:00:00.000000000 +0100
+++ linux-5.10.2/fs/reiser4/plugin/file/file.h 2020-12-23 16:07:46.124813217 +0100
@@ -0,0 +1,382 @@
+/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by
+ * reiser4/README */
+
+/* this file contains declarations of methods implementing
+ file plugins (UNIX_FILE_PLUGIN_ID, CRYPTCOMPRESS_FILE_PLUGIN_ID,
+ STRIPED_FILE_PLUGIN_ID and SYMLINK_FILE_PLUGIN_ID) */
+
+#if !defined( __REISER4_FILE_H__ )
+#define __REISER4_FILE_H__
+
+/* possible states in dispatching process */
+typedef enum {
+ DISPATCH_INVAL_STATE, /* invalid state */
+ DISPATCH_POINT, /* dispatching point has been achieved */
+ DISPATCH_REMAINS_OLD, /* made a decision to manage by old plugin */
+ DISPATCH_ASSIGNED_NEW /* a new plugin has been assigned */
+} dispatch_state;
+
+/* UPdate eXtent flags */
+#define UPX_TRUNCATE (1 << 0)
+#define UPX_PROXY_FULL (1 << 1)
+
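+/*
+ * Per-call context of the dispatching process: the pages involved
+ * and the current dispatch_state (see the enum above)
+ */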
+struct dispatch_context {
+ int nr_pages;
+ struct page **pages;
+ dispatch_state state;
+};
+
+/*
+ * Declarations of methods provided for VFS.
+ */
+
+/* inode operations */
+int reiser4_setattr_dispatch(struct dentry *, struct iattr *);
+
+/* file operations */
+ssize_t reiser4_read_dispatch(struct file *, char __user *buf,
+ size_t count, loff_t *off);
+ssize_t reiser4_write_dispatch(struct file *, const char __user *buf,
+ size_t count, loff_t * off);
+long reiser4_ioctl_dispatch(struct file *filp, unsigned int cmd,
+ unsigned long arg);
+int reiser4_mmap_dispatch(struct file *, struct vm_area_struct *);
+int reiser4_open_dispatch(struct inode *inode, struct file *file);
+int reiser4_release_dispatch(struct inode *, struct file *);
+int reiser4_sync_file_common(struct file *, loff_t, loff_t, int datasync);
+int reiser4_sync_page(struct page *page);
+
+/* address space operations */
+int reiser4_readpage_dispatch(struct file *, struct page *);
+int reiser4_readpages_dispatch(struct file *, struct address_space *,
+ struct list_head *, unsigned);
+int reiser4_writepages_dispatch(struct address_space *,
+ struct writeback_control *);
+int reiser4_write_begin_dispatch(struct file *file,
+ struct address_space *mapping,
+ loff_t pos, unsigned len, unsigned flags,
+ struct page **pagep, void **fsdata);
+int reiser4_write_end_dispatch(struct file *file,
+ struct address_space *mapping,
+ loff_t pos, unsigned len, unsigned copied,
+ struct page *page, void *fsdata);
+sector_t reiser4_bmap_dispatch(struct address_space *, sector_t lblock);
+
+/*
+ * Private methods of unix-file plugin
+ * (UNIX_FILE_PLUGIN_ID)
+ */
+
+int build_body_key_unix_file(struct inode *inode, loff_t off,
+ reiser4_key *key);
+/* private inode operations */
+int setattr_unix_file(struct dentry *, struct iattr *);
+
+/* private file operations */
+
+ssize_t read_unix_file(struct file *, char __user *buf, size_t read_amount,
+ loff_t *off);
+ssize_t write_unix_file(struct file *, const char __user *buf, size_t write_amount,
+ loff_t * off, struct dispatch_context * cont);
+int ioctl_unix_file(struct file *, unsigned int cmd, unsigned long arg);
+int mmap_unix_file(struct file *, struct vm_area_struct *);
+int open_unix_file(struct inode *, struct file *);
+int release_unix_file(struct inode *, struct file *);
+
+/* private address space operations */
+int readpage_unix_file(struct file *, struct page *);
+int readpages_unix_file(struct file*, struct address_space*, struct list_head*,
+ unsigned);
+int reiser4_writepages_generic(struct address_space *mapping,
+ struct writeback_control *wbc,
+ int(*capture_anon_page_fn)(struct page *),
+ int(*commit_file_atoms_fn)(struct inode *));
+int writepages_unix_file(struct address_space *, struct writeback_control *);
+int write_begin_unix_file(struct file *file, struct page *page,
+ loff_t pos, unsigned len, void **fsdata);
+int write_end_unix_file(struct file *file, struct page *page,
+ loff_t pos, unsigned copied, void *fsdata);
+sector_t bmap_unix_file(struct address_space *, sector_t lblock);
+
+/* other private methods */
+int delete_object_unix_file(struct inode *);
+int flow_by_inode_unix_file(struct inode *, const char __user *buf,
+ int user, loff_t, loff_t, rw_op, flow_t *);
+int owns_item_unix_file(const struct inode *, const coord_t *);
+void init_inode_data_unix_file(struct inode *, reiser4_object_create_data *,
+ const reiser4_key *sd_key, int create);
+
+/*
+ * Private methods of cryptcompress file plugin
+ * (CRYPTCOMPRESS_FILE_PLUGIN_ID)
+ */
+
+/* private inode operations */
+int setattr_cryptcompress(struct dentry *, struct iattr *);
+
+/* private file operations */
+ssize_t read_cryptcompress(struct file *, char __user *buf,
+ size_t count, loff_t *off);
+ssize_t write_cryptcompress(struct file *, const char __user *buf,
+ size_t count, loff_t * off,
+ struct dispatch_context *cont);
+int ioctl_cryptcompress(struct file *, unsigned int cmd, unsigned long arg);
+int mmap_cryptcompress(struct file *, struct vm_area_struct *);
+int open_cryptcompress(struct inode *, struct file *);
+int release_cryptcompress(struct inode *, struct file *);
+
+/* private address space operations */
+int readpage_cryptcompress(struct file *, struct page *);
+int readpages_cryptcompress(struct file*, struct address_space*,
+ struct list_head*, unsigned);
+int writepages_cryptcompress(struct address_space *,
+ struct writeback_control *);
+int write_begin_cryptcompress(struct file *file, struct page *page,
+ loff_t pos, unsigned len, void **fsdata);
+int write_end_cryptcompress(struct file *file, struct page *page,
+ loff_t pos, unsigned copied, void *fsdata);
+sector_t bmap_cryptcompress(struct address_space *, sector_t lblock);
+
+/* other private methods */
+
+int flow_by_inode_cryptcompress(struct inode *, const char __user *buf,
+ int user, loff_t, loff_t, rw_op, flow_t *);
+int build_body_key_cryptcompress(struct inode *, loff_t off, reiser4_key *);
+int create_object_cryptcompress(struct inode *, struct inode *,
+ reiser4_object_create_data *, oid_t *);
+int delete_object_cryptcompress(struct inode *);
+void init_inode_data_cryptcompress(struct inode *, reiser4_object_create_data *,
+ const reiser4_key *sd_key, int create);
+int cut_tree_worker_cryptcompress(tap_t *, const reiser4_key * from_key,
+ const reiser4_key * to_key,
+ reiser4_key * smallest_removed,
+ struct inode *object, int truncate,
+ int *progress);
+void destroy_inode_cryptcompress(struct inode *);
+
+/*
+ * Private methods of striped-file plugin
+ * (STRIPED_FILE_PLUGIN_ID)
+ */
+int build_body_key_stripe(struct inode *inode, loff_t off,
+ reiser4_key *key);
+int flow_by_inode_stripe(struct inode *inode, const char __user *buf, int user,
+ loff_t size, loff_t off, rw_op op, flow_t *flow);
+int create_object_stripe(struct inode *object, struct inode *parent,
+ reiser4_object_create_data *data, oid_t *oid);
+void init_inode_data_stripe(struct inode *, reiser4_object_create_data *,
+ const reiser4_key *sd_key, int create);
+int open_stripe(struct inode *, struct file *);
+int release_stripe(struct inode *inode, struct file *file);
+ssize_t read_stripe(struct file *file, char __user *buf,
+ size_t read_amount, loff_t *off);
+ssize_t write_stripe(struct file *file, const char __user *buf, size_t count,
+ loff_t *pos, struct dispatch_context *cont);
+int readpages_stripe(struct file *file, struct address_space *mapping,
+ struct list_head *pages, unsigned nr_pages);
+int readpage_stripe(struct file *file, struct page *page);
+int writepages_stripe(struct address_space *, struct writeback_control *);
+int setattr_stripe(struct dentry *, struct iattr *);
+int delete_object_stripe(struct inode *);
+int cut_tree_worker_stripe(tap_t *, const reiser4_key * from_key,
+ const reiser4_key * to_key,
+ reiser4_key * smallest_removed,
+ struct inode *object, int truncate,
+ int *progress);
+int write_begin_stripe(struct file *file, struct page *page,
+ loff_t pos, unsigned len, void **fsdata);
+int write_end_stripe(struct file *file, struct page *page,
+ loff_t pos, unsigned copied, void *fsdata);
+int ioctl_stripe(struct file *filp, unsigned int cmd, unsigned long arg);
+int migrate_stripe(struct inode *object, u64 *dst_id);
+
+/*
+ * Private methods of symlink file plugin
+ * (SYMLINK_FILE_PLUGIN_ID)
+ */
+int reiser4_create_symlink(struct inode *symlink, struct inode *dir,
+ reiser4_object_create_data *, oid_t *);
+void destroy_inode_symlink(struct inode *);
+
+/*
+ * All writes into a unix file are performed by the item write method. The
+ * write method of the unix-file plugin only decides which item plugin
+ * (extent or tail) to call and in which mode (one of the enum below)
+ */
+typedef enum {
+ FIRST_ITEM = 1,
+ APPEND_ITEM = 2,
+ OVERWRITE_ITEM = 3
+} write_mode_t;
+
+/* unix file may be in one of the following states */
+typedef enum {
+ UF_CONTAINER_UNKNOWN = 0,
+ UF_CONTAINER_TAILS = 1,
+ UF_CONTAINER_EXTENTS = 2,
+ UF_CONTAINER_EMPTY = 3
+} file_container_t;
+
+struct formatting_plugin;
+struct inode;
+
+/* unix file plugin specific part of reiser4 inode */
+struct unix_file_info {
+ /*
+ * this read-write lock protects file containerization change. Accesses
+ * which do not change file containerization (see file_container_t)
+ * (read, readpage, writepage, write (until tail conversion is
+ * involved)) take read-lock. Accesses which modify file
+ * containerization (truncate, conversion from tail to extent and back)
+ * take write-lock.
+ */
+ struct rw_semaphore latch;
+ /* this enum specifies which items are used to build the file */
+ file_container_t container;
+ /*
+ * plugin which controls when file is to be converted to extents and
+ * back to tail
+ */
+ struct formatting_plugin *tplug;
+ /* if this is set, file is in exclusive use */
+ int exclusive_use;
+#if REISER4_DEBUG
+ /* pointer to task struct of thread owning exclusive access to file */
+ void *ea_owner;
+ atomic_t nr_neas;
+ void *last_reader;
+#endif
+};
+
+struct unix_file_info *unix_file_inode_data(const struct inode *inode);
+void get_exclusive_access(struct unix_file_info *);
+void drop_exclusive_access(struct unix_file_info *);
+void get_nonexclusive_access(struct unix_file_info *);
+void drop_nonexclusive_access(struct unix_file_info *);
+int try_to_get_nonexclusive_access(struct unix_file_info *);
+int find_file_item(hint_t *, const reiser4_key *, znode_lock_mode,
+ struct inode *);
+int find_file_item_nohint(coord_t *, lock_handle *,
+ const reiser4_key *, znode_lock_mode,
+ struct inode *);
+
+int load_file_hint(struct file *, hint_t *);
+void save_file_hint(struct file *, const hint_t *);
+
+#include "../item/extent.h"
+#include "../item/tail.h"
+#include "../item/ctail.h"
+
+struct uf_coord {
+ coord_t coord;
+ lock_handle *lh;
+ int valid;
+ union {
+ struct extent_coord_extension extent;
+ struct tail_coord_extension tail;
+ struct ctail_coord_extension ctail;
+ } extension;
+};
+
+#include "../../forward.h"
+#include "../../seal.h"
+#include "../../lock.h"
+
+/*
+ * This structure is used to speed up file operations (reads and writes). A
+ * hint is a suggestion about where a key resolved to last time. A seal
+ * indicates whether a node has been modified since a hint was last recorded.
+ * You check the seal, and if the seal is still valid, you can use the hint
+ * without traversing the tree again.
+ */
+struct hint {
+ seal_t seal; /* a seal over last file item accessed */
+ uf_coord_t ext_coord;
+ loff_t offset;
+ znode_lock_mode mode;
+ lock_handle lh;
+};
+
+static inline int hint_is_valid(hint_t * hint)
+{
+ return hint->ext_coord.valid;
+}
+
+static inline void hint_set_valid(hint_t * hint)
+{
+ hint->ext_coord.valid = 1;
+}
+
+static inline void hint_clr_valid(hint_t * hint)
+{
+ hint->ext_coord.valid = 0;
+}
+
+int load_file_hint(struct file *, hint_t *);
+void save_file_hint(struct file *, const hint_t *);
+void hint_init_zero(hint_t *);
+void reiser4_set_hint(hint_t *, const reiser4_key *, znode_lock_mode);
+int hint_is_set(const hint_t *);
+void reiser4_unset_hint(hint_t *);
+
+int reiser4_sync_page_list(struct inode *inode);
+int reiser4_update_file_size(struct inode *, loff_t, int update_sd);
+int reserve_cut_iteration(struct inode *inode);
+int cut_file_items(struct inode *, loff_t new_size,
+ int update_sd, loff_t cur_size,
+ int (*update_actor) (struct inode *, loff_t, int));
+#if REISER4_DEBUG
+
+/* return 1 if exclusive access is obtained, 0 otherwise */
+static inline int ea_obtained(struct unix_file_info * uf_info)
+{
+ int ret;
+
+ ret = down_read_trylock(&uf_info->latch);
+ if (ret)
+ up_read(&uf_info->latch);
+ return !ret;
+}
+
+#endif
+
+int tail2extent(struct unix_file_info *);
+int extent2tail(struct file *, struct unix_file_info *);
+
+int goto_right_neighbor(coord_t *, lock_handle *);
+int find_or_create_extent_stripe(struct page *page, unsigned flags);
+int find_or_create_extent_unix_file(struct page *);
+int reiser4_setattr_generic(struct dentry *dentry, struct iattr *attr,
+ int (*truncate_file_body_fn)(struct inode *,
+ struct iattr *));
+int reiser4_readpages_filler_generic(void *data,
+ struct page *page, int striped);
+int reiser4_readpages_generic(struct file *file, struct address_space *mapping,
+ struct list_head *pages, unsigned nr_pages,
+ int (*filler)(void *data, struct page *page));
+int reiser4_write_begin_common(struct file *file, struct page *page,
+ loff_t pos, unsigned len,
+ int(*readpage_fn)(struct file *, struct page *));
+int equal_to_ldk(znode *, const reiser4_key *);
+
+void init_uf_coord(uf_coord_t *uf_coord, lock_handle *lh);
+
+static inline int cbk_errored(int cbk_result)
+{
+ return (cbk_result != CBK_COORD_NOTFOUND
+ && cbk_result != CBK_COORD_FOUND);
+}
+
+/* __REISER4_FILE_H__ */
+#endif
+
+/*
+ * Local variables:
+ * c-indentation-style: "K&R"
+ * mode-name: "LC"
+ * c-basic-offset: 8
+ * tab-width: 8
+ * fill-column: 79
+ * scroll-step: 1
+ * End:
+*/
diff -urN --no-dereference linux-5.10.2.orig/fs/reiser4/plugin/file/invert.c linux-5.10.2/fs/reiser4/plugin/file/invert.c
--- linux-5.10.2.orig/fs/reiser4/plugin/file/invert.c 1970-01-01 01:00:00.000000000 +0100
+++ linux-5.10.2/fs/reiser4/plugin/file/invert.c 2020-12-23 16:07:46.124813217 +0100
@@ -0,0 +1,493 @@
+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
+
+/* Suppose you want to conveniently read and write a large variety of small files within a single emacs
+ buffer, without having a separate buffer for each 8 byte or so file. Inverts are the way to do that. An invert
+ provides you with the contents of a set of subfiles plus its own contents. It is a file which inherits other files
+ when you read it, and allows you to write to it and through it to the files that it inherits from. In order for it
+ to know which subfiles each part of your write should go into, there must be delimiters indicating that. It tries to
+ make that easy for you by providing those delimiters in what you read from it.
+
+ When you read it, an invert performs an inverted assignment. Instead of taking an assignment command and writing a
+ bunch of files, it takes a bunch of files and composes an assignment command for you to read from it that if executed
+ would create those files. But which files? Well, that must be specified in the body of the invert using a special
+ syntax, and that specification is called the invert of the assignment.
+
+ When written to, an invert performs the assignment command that is written
+ to it, and modifies its own body to contain the invert of that
+ assignment.
+
+ In other words, writing to an invert file what you have read from it
+ is the identity operation.
+
+ Malformed assignments cause write errors. Partial writes are not
+ supported in v4.0, but will be.
+
+ Example:
+
+ If an invert contains:
+
+ /filenameA/<>+"(some text stored in the invert)+/filenameB/<>
+
+======================
+Each element in this definition should be an invert, and all files
+should be handled recursively too. This is bad. If one of the
+included files is not a regular or invert file, then we can't read
+the main file.
+
+I think it is possible to make this easier:
+
+the internal structure of an invert file should be like that of a symlink
+file, but the read and write method should be explicitly indicated in the
+i/o operation.
+
+By default we read and write (if possible) as a symlink, and just as we
+specify ..invert at reading time, we can specify it at write time too.
+
+example:
+/my_invert_file/..invert<- ( (/filenameA<-"(The contents of filenameA))+"(some text stored in the invert)+(/filenameB<-"(The contents of filenameB) ) )
+will create /my_invert_file as an invert, and will create /filenameA and /filenameB with the specified bodies.
+
+read of /my_invert_file/..invert will be
+/filenameA<-"(The contents of filenameA)+"(some text stored in the invert)+/filenameB<-"(The contents of filenameB)
+
+but read of /my_invert_file/ will be
+The contents of filenameAsome text stored in the invertThe contents of filenameB
+
+we can also create this file as
+/my_invert_file/<-/filenameA+"(some text stored in the invert)+/filenameB
+will create /my_invert_file , and use existing files /filenameA and /filenameB.
+
+and when we read it, it will behave as the invert file described previously.
+
+Is this correct?
+
+ vv
+DEMIDOV-FIXME-HANS:
+
+Maybe you are right, but then you must disable writes to /my_invert_file/ and only allow writes to /my_invert_file/..invert
+
+Do you agree? Discuss it on reiserfs-list....
+
+-Hans
+=======================
+
+ Then a read will return:
+
+ /filenameA<-"(The contents of filenameA)+"(some text stored in the invert)+/filenameB<-"(The contents of filenameB)
+
+ and a write of the line above to the invert will set the contents of
+ the invert and filenameA and filenameB to their original values.
+
+ Note that the contents of an invert have no influence on the effect
+ of a write unless the write is a partial write (and a write of a
+ shorter file without using truncate first is a partial write).
+
+ truncate() has no effect on filenameA and filenameB, it merely
+ resets the value of the invert.
+
+ Writes to subfiles via the invert are implemented by preceding them
+ with truncates.
+
+ Parse failures cause write failures.
+
+ Questions to ponder: should the invert be acted on prior to file
+ close when writing to an open filedescriptor?
+
+ Example:
+
+ If an invert contains:
+
+ "(This text and a pair of quotes are all that is here.)
+
+Then a read will return:
+
+ "(This text and a pair of quotes are all that is here.)
+
+*/
+
+/* OPEN method places a struct file in memory associated with invert body
+ and returns something like file descriptor to the user for the future access
+ to the invert file.
+   During opening we parse the body of the invert, get a list of the 'entries'
+   (that describe all its subfiles) and place a pointer to the first struct in
+   the reiserfs-specific part of the invert inode (arbitrary decision).
+
+   Each subfile is described by the struct inv_entry that has a pointer @sd to
+   in-core stat-data and a pointer @f to a struct file (if we find that the
+   subfile uses more than one unformatted node (arbitrary decision), we load
+   a struct file in memory, otherwise we load base stat-data (and maybe 1-2 bytes
+   of some other information we need)
+
+ Since READ and WRITE methods for inverts were formulated in assignment
+ language, they don't contain arguments 'size' and 'offset' that make sense
+ only in ordinary read/write methods.
+
+ READ method is a combination of two methods:
+   1) ordinary read method (with offset=0, length = @f->...->i_size) for entries
+   with @f != 0; this method uses a pointer to struct file as an argument
+   2) read method for inode-less files with @sd != 0; this method uses
+   in-core stat-data instead of struct file as an argument.
+   In the first case we don't use the pagecache, we just copy data that we got
+   after cbk() into userspace.
+
+ WRITE method for invert files is more complex.
+   Besides the WRITE interface declared in the assignment language above we
+   need to have an opportunity to edit the unwrapped body of an invert file
+   with some text editor, which means we need a GENERIC WRITE METHOD for the
+   invert file:
+
+   my_invert_file/..invert <- "string"
+
+   this method parses "string" and looks for correct subfile signatures; the
+   parsing process also splits this "string" into a set of flows in accordance
+   with the set of subfiles specified by this signature.
+   The found list of signatures #S is compared with the opened one #I of the
+   invert file. If it doesn't have this one (#I==0, which will be so for
+   instance if we have just created this invert file) the write method assigns
+   the found signature (#I=#S;) to the invert file. Then if #I==#S, the generic
+   write method splits itself into write methods for ordinary or light-weight
+   files, or calls itself recursively for invert files with corresponding flows.
+   I am not sure, but the list of signatures looks like what mr.Demidov means
+   by 'delimiters'.
+
+   The cases when #S<#I (#I<#S) (in the sense of set theory) are also available
+   and cause deleting (creating new) subfiles (arbitrary decision - it may look
+   too complex, but this interface will be the most complete). The order of
+   entries of list #S (#I) and the inherited order on #I (#S) must coincide.
+   Any other parsing result gives a malformed signature that aborts the READ
+   method and releases all resources.
+
+ Format of subfile (entry) signature:
+
+ "START_MAGIC"<>(TYPE="...",LOOKUP_ARG="...")SUBFILE_BODY"END_MAGIC"
+
+ Legend:
+
+ START_MAGIC - keyword indicates the start of subfile signature;
+
+ <> indicates the start of 'subfile metadata', that is the pair
+ (TYPE="...",LOOKUP_ARG="...") in parenthesis separated by comma.
+
+ TYPE - the string "type" indicates the start of one of the three words:
+ - ORDINARY_FILE,
+ - LIGHT_WEIGHT_FILE,
+ - INVERT_FILE;
+
+ LOOKUP_ARG - lookup argument depends on previous type:
+ */
+
+ /************************************************************/
+ /* TYPE * LOOKUP ARGUMENT */
+ /************************************************************/
+ /* LIGH_WEIGHT_FILE * stat-data key */
+ /************************************************************/
+ /* ORDINARY_FILE * filename */
+ /************************************************************/
+ /* INVERT_FILE * filename */
+ /************************************************************/
+
+ /* where:
+ *stat-data key - the string contains stat data key of this subfile, it will be
+ passed to fast-access lookup method for light-weight files;
+   *filename - pathname of this subfile, it will be passed to VFS lookup methods
+ for ordinary and invert files;
+
+ SUBFILE_BODY - data of this subfile (it will go to the flow)
+ END_MAGIC - the keyword indicates the end of subfile signature.
+
+   Other symbols inside the signature are interpreted as 'unformatted content',
+   which is available with VFS's read_link() (arbitrary decision).
+
+ NOTE: Parse method for a body of invert file uses mentioned signatures _without_
+ subfile bodies.
+
+   Now the only unclear thing is WRITE to a regular light-weight subfile A that we
+ can describe only in assignment language:
+
+ A <- "some_string"
+
+ I guess we don't want to change stat-data and body items of file A
+   if this file exists and size(A) != size("some_string"), because this operation
+   is expensive, so we only do the partial write if size(A) > size("some_string")
+ and do truncate of the "some_string", and then do A <- "truncated string", if
+ size(A) < size("some_string"). This decision is also arbitrary..
+ */
+
+/* here is the infrastructure for formatted flows */
+
+#define SUBFILE_HEADER_MAGIC 0x19196605
+#define FLOW_HEADER_MAGIC 0x01194304
+
+#include "../plugin.h"
+#include "../../debug.h"
+#include "../../forward.h"
+#include "../object.h"
+#include "../item/item.h"
+#include "../item/static_stat.h"
+#include "../../dformat.h"
+#include "../znode.h"
+#include "../inode.h"
+
+#include <linux/types.h>
+#include <linux/fs.h> /* for struct file */
+#include <linux/list.h> /* for struct list_head */
+
+typedef enum {
+ LIGHT_WEIGHT_FILE,
+ ORDINARY_FILE,
+ INVERT_FILE
+} inv_entry_type;
+
+typedef struct flow_header {
+	d32 fh_magic;
+	d16 fh_nr; /* number of subfiles in the flow */
+} flow_header;
+
+typedef struct subfile_header {
+	d32 sh_magic; /* subfile magic */
+	d16 sh_type; /* type of subfile: light-weight, ordinary, invert */
+	d16 sh_arg_len; /* length of lookup argument (filename, key) */
+	d32 sh_body_len; /* length of subfile body */
+} subfile_header;
+
+/* functions to get/set fields of flow header */
+
+static void fl_set_magic(flow_header * fh, __u32 value)
+{
+ cputod32(value, &fh->fh_magic);
+}
+
+static __u32 fl_get_magic(flow_header * fh)
+{
+ return d32tocpu(&fh->fh_magic);
+}
+static void fl_set_number(flow_header * fh, __u16 value)
+{
+ cputod16(value, &fh->fh_nr);
+}
+static unsigned fl_get_number(flow_header * fh)
+{
+ return d16tocpu(&fh->fh_nr);
+}
+
+/* functions to get/set fields of subfile header */
+
+static void sh_set_magic(subfile_header * sh, __u32 value)
+{
+ cputod32(value, &sh->sh_magic);
+}
+
+static __u32 sh_get_magic(subfile_header * sh)
+{
+ return d32tocpu(&sh->sh_magic);
+}
+static void sh_set_type(subfile_header * sh, __u16 value)
+{
+	cputod16(value, &sh->sh_type);
+}
+static unsigned sh_get_type(subfile_header * sh)
+{
+	return d16tocpu(&sh->sh_type);
+}
+static void sh_set_arg_len(subfile_header * sh, __u16 value)
+{
+ cputod16(value, &sh->sh_arg_len);
+}
+static unsigned sh_get_arg_len(subfile_header * sh)
+{
+ return d16tocpu(&sh->sh_arg_len);
+}
+static void sh_set_body_len(subfile_header * sh, __u32 value)
+{
+ cputod32(value, &sh->sh_body_len);
+}
+
+static __u32 sh_get_body_len(subfile_header * sh)
+{
+ return d32tocpu(&sh->sh_body_len);
+}
+
+/* in-core minimal stat-data, light-weight analog of inode */
+
+struct incore_sd_base {
+ umode_t isd_mode;
+ nlink_t isd_nlink;
+ loff_t isd_size;
+ char *isd_data; /* 'subflow' to write */
+};
+
+/* open invert create a list of invert entries,
+ every entry is represented by structure inv_entry */
+
+struct inv_entry {
+	struct list_head ie_list;
+	struct file *ie_file; /* this is NULL if the file doesn't
+				 have unformatted nodes */
+ struct incore_sd_base *ie_sd; /* inode-less analog of struct file */
+};
+
+/* allocate and init invert entry */
+
+static struct inv_entry *allocate_inv_entry(void)
+{
+ struct inv_entry *inv_entry;
+
+ inv_entry = reiser4_kmalloc(sizeof(struct inv_entry), GFP_KERNEL);
+ if (!inv_entry)
+ return ERR_PTR(RETERR(-ENOMEM));
+ inv_entry->ie_file = NULL;
+ inv_entry->ie_sd = NULL;
+ INIT_LIST_HEAD(&inv_entry->ie_list);
+ return inv_entry;
+}
+
+static int put_inv_entry(struct inv_entry *ientry)
+{
+ int result = 0;
+
+ assert("edward-96", ientry != NULL);
+
+	list_del(&ientry->ie_list);
+	if (ientry->ie_sd != NULL)
+		kfree(ientry->ie_sd);
+	if (ientry->ie_file != NULL)
+		result = filp_close(ientry->ie_file, NULL);
+	kfree(ientry);
+	return result;
+}
+
+static int allocate_incore_sd_base(struct inv_entry *inv_entry)
+{
+	struct incore_sd_base *isd_base;
+
+	assert("edward-98", inv_entry != NULL);
+	assert("edward-99", inv_entry->ie_file == NULL);
+	assert("edward-100", inv_entry->ie_sd == NULL);
+
+ isd_base = reiser4_kmalloc(sizeof(struct incore_sd_base), GFP_KERNEL);
+ if (!isd_base)
+ return RETERR(-ENOMEM);
+ inv_entry->ie_sd = isd_base;
+ return 0;
+}
+
+/* this can be installed as ->init_inv_entry () method of
+ item_plugins[ STATIC_STAT_DATA_IT ] (fs/reiser4/plugin/item/item.c).
+   Copies data from the on-disk stat-data format into a light-weight analog of
+   an inode. Doesn't handle stat-data extensions. */
+
+static void sd_base_load(struct inv_entry *inv_entry, char *sd)
+{
+ reiser4_stat_data_base *sd_base;
+
+ assert("edward-101", inv_entry != NULL);
+ assert("edward-101", inv_entry->ie_sd != NULL);
+ assert("edward-102", sd != NULL);
+
+ sd_base = (reiser4_stat_data_base *) sd;
+	inv_entry->ie_sd->isd_mode = d16tocpu(&sd_base->mode);
+	inv_entry->ie_sd->isd_nlink = d32tocpu(&sd_base->nlink);
+	inv_entry->ie_sd->isd_size = d64tocpu(&sd_base->size);
+	inv_entry->ie_sd->isd_data = NULL;
+}
+
+/* initialise incore stat-data */
+
+static void init_incore_sd_base(struct inv_entry *inv_entry, coord_t * coord)
+{
+ reiser4_plugin *plugin = item_plugin_by_coord(coord);
+ void *body = item_body_by_coord(coord);
+
+ assert("edward-103", inv_entry != NULL);
+ assert("edward-104", plugin != NULL);
+ assert("edward-105", body != NULL);
+
+ sd_base_load(inv_entry, body);
+}
+
+/* takes a key or filename, allocates a new invert entry,
+   initializes it and adds it to the list;
+   we use lookup_sd_by_key() for light-weight files and VFS lookup by filename */
+
+int get_inv_entry(struct inode *invert_inode, /* inode of invert's body */
+ inv_entry_type type, /* LIGHT-WEIGHT or ORDINARY */
+ const reiser4_key * key, /* key of invert entry stat-data */
+ char *filename, /* filename of the file to be opened */
+ int flags, int mode)
+{
+ int result;
+ struct inv_entry *ientry;
+
+ assert("edward-107", invert_inode != NULL);
+
+ ientry = allocate_inv_entry();
+ if (IS_ERR(ientry))
+ return (PTR_ERR(ientry));
+
+ if (type == LIGHT_WEIGHT_FILE) {
+ coord_t coord;
+ lock_handle lh;
+
+ assert("edward-108", key != NULL);
+
+ init_coord(&coord);
+ init_lh(&lh);
+ result =
+ lookup_sd_by_key(meta_subvol_tree(),
+ ZNODE_READ_LOCK, &coord, &lh, key);
+ if (result == 0)
+			init_incore_sd_base(ientry, &coord);
+
+ done_lh(&lh);
+ done_coord(&coord);
+ return (result);
+ } else {
+ struct file *file = filp_open(filename, flags, mode);
+		/* FIXME_EDWARD here we need to check if we
+		   didn't follow any mount point */
+
+ assert("edward-108", filename != NULL);
+
+ if (IS_ERR(file))
+ return (PTR_ERR(file));
+ ientry->ie_file = file;
+ return 0;
+ }
+}
+
+/* takes the inode of an invert, reads the body of this invert, parses it,
+   opens all invert entries and returns a pointer to the first inv_entry */
+
+struct inv_entry *open_invert(struct file *invert_file)
+{
+
+}
+
+ssize_t subfile_read(struct inv_entry *invert_entry, flow * f)
+{
+
+}
+
+ssize_t subfile_write(struct inv_entry *invert_entry, flow * f)
+{
+
+}
+
+ssize_t invert_read(struct file *file, flow * f)
+{
+
+}
+
+ssize_t invert_write(struct file *file, flow * f)
+{
+
+}
+
+/* Make Linus happy.
+ Local variables:
+ c-indentation-style: "K&R"
+ mode-name: "LC"
+ c-basic-offset: 8
+ tab-width: 8
+ fill-column: 120
+ scroll-step: 1
+ End:
+*/
diff -urN --no-dereference linux-5.10.2.orig/fs/reiser4/plugin/file/Makefile linux-5.10.2/fs/reiser4/plugin/file/Makefile
--- linux-5.10.2.orig/fs/reiser4/plugin/file/Makefile 1970-01-01 01:00:00.000000000 +0100
+++ linux-5.10.2/fs/reiser4/plugin/file/Makefile 2020-12-23 16:07:46.124813217 +0100
@@ -0,0 +1,8 @@
+obj-$(CONFIG_REISER4_FS) += file_plugins.o
+
+file_plugins-objs := \
+ file.o \
+ tail_conversion.o \
+ symlink.o \
+ cryptcompress.o \
+ stripe.o
diff -urN --no-dereference linux-5.10.2.orig/fs/reiser4/plugin/file/stripe.c linux-5.10.2/fs/reiser4/plugin/file/stripe.c
--- linux-5.10.2.orig/fs/reiser4/plugin/file/stripe.c 1970-01-01 01:00:00.000000000 +0100
+++ linux-5.10.2/fs/reiser4/plugin/file/stripe.c 2020-12-23 16:07:46.125813231 +0100
@@ -0,0 +1,1313 @@
+/*
+ Copyright (c) 2018-2020 Eduard O. Shishkin
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+/*
+ * Implementation of regular files with distributed bodies.
+ *
+ * Logical unit of distribution in such file is called "stripe".
+ * Every stripe, which got physical addresses, is composed of extents
+ * (IO units), and every extent is a set of filesystem blocks (allocation
+ * units) with contiguous disk addresses.
+ * Neighboring extents of any two adjacent (in the logical order) stripes,
+ * which got to the same device, get merged at the stripe boundary if
+ * their physical addresses are adjacent.
+ * In the storage tree extents are represented by extent pointers (items)
+ * of EXTENT41_POINTER_ID. Extent pointer's key is calculated like for
+ * classic unix files (UNIX_FILE_PLUGIN_ID) except the ordering component,
+ * which in our case contains the ID of the brick (subvolume) where that
+ * extent should be stored.
+ * Holes in a striped file are not represented by any items.
+ */
+
+#include "../../inode.h"
+#include "../../super.h"
+#include "../../tree_walk.h"
+#include "../../carry.h"
+#include "../../page_cache.h"
+#include "../object.h"
+#include "../cluster.h"
+#include "../../safe_link.h"
+#include "../volume/volume.h"
+
+#include <linux/writeback.h>
+#include <linux/pagevec.h>
+#include <linux/syscalls.h>
+
+reiser4_block_nr estimate_migration_iter(void);
+reiser4_block_nr estimate_write_stripe_meta(int count);
+int readpages_filler_generic(void *data, struct page *page, int striped);
+
+static inline reiser4_extent *ext_by_offset(const znode *node, int offset)
+{
+ reiser4_extent *ext;
+
+ ext = (reiser4_extent *) (zdata(node) + offset);
+ return ext;
+}
+
+int build_body_key_stripe(struct inode *inode, loff_t off, reiser4_key *key)
+{
+ build_body_key_common(inode, key);
+ set_key_ordering(key, KEY_ORDERING_MASK /* max value */);
+ set_key_offset(key, (__u64) off);
+ return 0;
+}
+
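+/*
+ * Initialize a flow: remember the user buffer and the amount of data
+ * to be processed, and build the key of position @off in the striped
+ * file body
+ */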
+int flow_by_inode_stripe(struct inode *inode,
+ const char __user *buf, int user,
+ loff_t size, loff_t off,
+ rw_op op, flow_t *flow)
+{
+ flow->length = size;
+ memcpy(&flow->data, &buf, sizeof(buf));
+ flow->user = user;
+ flow->op = op;
+ /*
+ * calculate key of write position and insert it into flow->key
+ */
+ return build_body_key_stripe(inode, off, &flow->key);
+}
+
+/*
+ * Tree search with a sealing technique for striped files
+ *
+ * To save CPU resources, every time before releasing a longterm lock
+ * we "seal" a position in the tree, which represents an existing object.
+ * Next time, when we want to lock a position in the tree, we check the seal.
+ * If it is unbroken, and it was created for a suitable object, we don't
+ * perform an expensive tree traversal. Instead, we lock the "sealed" node
+ * and perform fast lookup from the "sealed" position.
+ */
+
+#if REISER4_DEBUG
+
+static inline int equal_to_ldk_nonprec(znode *node, const reiser4_key *key)
+{
+ int ret;
+
+ read_lock_dk(znode_get_tree(node));
+ ret = all_but_ordering_keyeq(key, znode_get_ld_key(node));
+ read_unlock_dk(znode_get_tree(node));
+ return ret;
+}
+#endif
+
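+/*
+ * Check whether @key coincides with the right delimiting key of
+ * @node, ignoring the "ordering" component of the key
+ */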
+static inline int equal_to_rdk_nonprec(znode *node, const reiser4_key *key)
+{
+ int ret;
+
+ read_lock_dk(znode_get_tree(node));
+ ret = all_but_ordering_keyeq(key, znode_get_rd_key(node));
+ read_unlock_dk(znode_get_tree(node));
+ return ret;
+}
+
+/**
+ * Check if the seal was created against the previous block pointer
+ * and, if so, validate it.
+ */
+static int hint_validate(hint_t *hint, reiser4_tree *tree,
+ const reiser4_key *key, znode_lock_mode lock_mode)
+{
+ reiser4_key vkey;
+
+ if (!hint || !hint_is_set(hint) || hint->mode != lock_mode ||
+ get_key_offset(key) != hint->offset + PAGE_SIZE)
+ return RETERR(-E_REPEAT);
+
+ assert("edward-2377", hint->ext_coord.lh == &hint->lh);
+
+ memcpy(&vkey, key, sizeof(vkey));
+ set_key_offset(&vkey, hint->offset);
+
+ return reiser4_seal_validate(&hint->seal, tree,
+ &hint->ext_coord.coord, &vkey,
+ hint->ext_coord.lh, lock_mode,
+ ZNODE_LOCK_LOPRI);
+}
+
+/**
+ * Search-by-key procedure optimized for sequential operations
+ * by using "sealing" technique.
+ *
+ * @key: key of a block pointer we are looking for. That key is
+ * not precise, that is we don't know its "ordering" component.
+ */
+int find_stripe_item(hint_t *hint, const reiser4_key *key,
+ znode_lock_mode lock_mode, struct inode *inode)
+{
+ int ret;
+ coord_t *coord;
+ coord_t rcoord;
+ lock_handle *lh;
+ struct extent_coord_extension *ext_coord;
+
+ assert("edward-2378", hint != NULL);
+ assert("edward-2379", inode != NULL);
+ assert("edward-2380", reiser4_schedulable());
+ assert("edward-2381", (get_key_offset(key) & (PAGE_SIZE - 1)) == 0);
+ assert("edward-2382", inode_file_plugin(inode) ==
+ file_plugin_by_id(STRIPED_FILE_PLUGIN_ID));
+
+ coord = &hint->ext_coord.coord;
+ lh = hint->ext_coord.lh;
+ init_lh(lh);
+
+ ret = hint_validate(hint, meta_subvol_tree(), key, lock_mode);
+ if (ret)
+ goto nohint;
+ /*
+ * we always seal only valid coord of existing block pointer
+ */
+ assert("edward-2385",
+ WITH_DATA(coord->node, coord_is_existing_unit(coord)));
+ hint->ext_coord.valid = 1;
+ ext_coord = &hint->ext_coord.extension.extent;
+
+ /* fast lookup in the sealed coord locality */
+
+ ext_coord->pos_in_unit ++;
+ if (ext_coord->pos_in_unit < ext_coord->width)
+ /*
+ * found within the unit specified by @coord
+ */
+ return CBK_COORD_FOUND;
+ /*
+ * end of unit is reached. Try to move to next unit
+ */
+ ext_coord->pos_in_unit = 0;
+ coord->unit_pos ++;
+ if (coord->unit_pos < ext_coord->nr_units) {
+ /*
+ * found within next unit. Update coord extension
+ */
+ ext_coord->ext_offset += sizeof(reiser4_extent);
+ ext_coord->width =
+ extent_get_width(ext_by_offset(coord->node,
+ ext_coord->ext_offset));
+ ON_DEBUG(ext_coord->extent =
+ *ext_by_offset(coord->node, ext_coord->ext_offset));
+ return CBK_COORD_FOUND;
+ }
+ /*
+ * end of item reached. Try to find in the next item at the right
+ */
+ coord->unit_pos --;
+ coord->between = AFTER_UNIT;
+ hint->ext_coord.valid = 0; /* moving to the next item invalidates
+ the coord extension */
+ ret = zload(lh->node);
+ if (ret) {
+ done_lh(lh);
+ return ret;
+ }
+ coord_dup(&rcoord, coord);
+ if (!coord_next_item(&rcoord)) {
+ /*
+ * rcoord is set to next item
+ */
+ reiser4_key rkey;
+ if (!item_is_extent(&rcoord) ||
+ !all_but_ordering_keyeq(key,
+ item_key_by_coord(&rcoord, &rkey))) {
+ zrelse(lh->node);
+ assert("edward-2386", coord->between == AFTER_UNIT);
+ return CBK_COORD_NOTFOUND;
+ }
+ coord_dup(coord, &rcoord);
+ zrelse(lh->node);
+ coord->between = AT_UNIT;
+ return CBK_COORD_FOUND;
+ }
+ zrelse(lh->node);
+ /*
+ * end of node reached. Try to find in the next node at the right
+ */
+ if (equal_to_rdk_nonprec(coord->node, key)) {
+ ret = goto_right_neighbor(coord, lh);
+ if (unlikely(ret)) {
+ done_lh(lh);
+ assert("edward-2387", ret != CBK_COORD_NOTFOUND);
+ return ret == -E_NO_NEIGHBOR ? RETERR(-EIO) : ret;
+ }
+ ret = zload(lh->node);
+ if (unlikely(ret)) {
+ done_lh(lh);
+ return ret;
+ }
+ if (unlikely(node_is_empty(coord->node))) {
+ /*
+ * for simplicity we don't go further to
+ * the right. Instead we call slow lookup.
+ */
+ zrelse(lh->node);
+ done_lh(lh);
+ goto nohint;
+ }
+ /*
+ * it is guaranteed that the first item at the
+ * right neighbor has @key, because exclusive, or
+ * non-exclusive lock of the file is held by us.
+ */
+ assert("edward-2384", equal_to_ldk_nonprec(coord->node, key));
+ zrelse(lh->node);
+ assert("edward-2388", coord->between == AT_UNIT);
+ return CBK_COORD_FOUND;
+ }
+ assert("edward-2389", coord->between == AFTER_UNIT);
+ return CBK_COORD_NOTFOUND;
+ nohint:
+ /* full-fledged lookup */
+ coord_init_zero(coord);
+ hint->ext_coord.valid = 0;
+ return find_file_item_nohint(coord, lh, key, lock_mode, inode);
+}
+
+ssize_t read_stripe(struct file *file, char __user *buf,
+ size_t read_amount, loff_t *off)
+{
+ ssize_t result;
+ struct inode *inode;
+ reiser4_context *ctx;
+ struct unix_file_info *uf_info;
+
+ if (unlikely(read_amount == 0))
+ return 0;
+
+ inode = file_inode(file);
+ assert("edward-2029", !reiser4_inode_get_flag(inode, REISER4_NO_SD));
+
+ ctx = reiser4_init_context(inode->i_sb);
+ if (IS_ERR(ctx))
+ return PTR_ERR(ctx);
+
+ result = reserve_update_sd_common(inode);
+ if (unlikely(result != 0))
+ goto out;
+ uf_info = unix_file_inode_data(inode);
+
+ get_nonexclusive_access(uf_info);
+ result = new_sync_read(file, buf, read_amount, off);
+ drop_nonexclusive_access(uf_info);
+ out:
+ context_set_commit_async(ctx);
+ reiser4_exit_context(ctx);
+ return result;
+}
+
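+/*
+ * Granularity of write_stripe(): number of units (each of PAGE_SIZE,
+ * see the caller) in one logical chunk. This is the number of blocks
+ * in a stripe, if the stripe size is set, but never more than
+ * DEFAULT_WRITE_GRANULARITY
+ */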
+static inline size_t write_granularity(void)
+{
+ if (current_stripe_bits) {
+ int ret = 1 << (current_stripe_bits - current_blocksize_bits);
+ if (ret > DEFAULT_WRITE_GRANULARITY)
+ ret = DEFAULT_WRITE_GRANULARITY;
+ return ret;
+ } else
+ return DEFAULT_WRITE_GRANULARITY;
+}
+
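+/*
+ * Write one chunk under non-exclusive access, retrying on -ENOSPC:
+ * first force commit of all atoms to reclaim space and retry; if
+ * that fails again and proxy IO is enabled, retry the write with
+ * the UPX_PROXY_FULL flag
+ */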
+static inline ssize_t write_extent_stripe_handle_enospc(struct file *file,
+ struct inode *inode,
+ const char __user *buf,
+ size_t count,
+ loff_t *pos)
+{
+ int ret;
+ struct unix_file_info *uf_info = unix_file_inode_data(inode);
+
+ get_nonexclusive_access(uf_info);
+ ret = write_extent_stripe(file, inode, buf, count, pos, 0);
+ if (ret == -ENOSPC) {
+ drop_nonexclusive_access(uf_info);
+ txnmgr_force_commit_all(inode->i_sb, 0);
+ get_nonexclusive_access(uf_info);
+ ret = write_extent_stripe(file, inode, buf, count, pos, 0);
+ if (ret == -ENOSPC &&
+ reiser4_is_set(reiser4_get_current_sb(),
+ REISER4_PROXY_IO)) {
+ drop_nonexclusive_access(uf_info);
+ reiser4_txn_restart_current();
+ get_nonexclusive_access(uf_info);
+ ret = write_extent_stripe(file, inode, buf, count, pos,
+ UPX_PROXY_FULL);
+ if (0 && ret == -ENOSPC) {
+ drop_nonexclusive_access(uf_info);
+ txnmgr_force_commit_all(inode->i_sb, 0);
+ get_nonexclusive_access(uf_info);
+ ret = write_extent_stripe(file, inode, buf,
+ count, pos,
+ UPX_PROXY_FULL);
+ }
+ }
+ }
+ drop_nonexclusive_access(uf_info);
+ return ret;
+}
+
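+/*
+ * ->write() method of striped-file plugin. The data is written in
+ * portions of at most one logical chunk per iteration (see
+ * write_granularity()); after every portion the file size and
+ * mtime/ctime are updated in the stat-data if needed
+ */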
+ssize_t write_stripe(struct file *file,
+ const char __user *buf,
+ size_t count, loff_t *pos,
+ struct dispatch_context *cont)
+{
+ int ret;
+ reiser4_context *ctx = get_current_context();
+ struct inode *inode = file_inode(file);
+ ssize_t written = 0;
+ int to_write;
+ int chunk_size = PAGE_SIZE * write_granularity();
+ size_t left = count;
+
+ assert("edward-2030", !reiser4_inode_get_flag(inode, REISER4_NO_SD));
+
+ ret = file_remove_privs(file);
+ if (ret) {
+ context_set_commit_async(ctx);
+ return ret;
+ }
+ /* remove_suid might create a transaction */
+ reiser4_txn_restart(ctx);
+
+ while (left) {
+ int update_sd = 0;
+ /*
+		 * write not more than one logical chunk per iteration
+ */
+ to_write = chunk_size - (*pos & (chunk_size - 1));
+ if (left < to_write)
+ to_write = left;
+
+ written = write_extent_stripe_handle_enospc(file, inode, buf,
+ to_write, pos);
+ if (written < 0)
+ break;
+ /*
+ * something is written
+ */
+ if (*pos + written > inode->i_size) {
+ INODE_SET_FIELD(inode, i_size, *pos + written);
+ update_sd = 1;
+ }
+ if (!IS_NOCMTIME(inode)) {
+ inode->i_ctime = inode->i_mtime = current_time(inode);
+ update_sd = 1;
+ }
+ if (update_sd) {
+ /*
+ * space for update_sd was reserved
+ * in write_extent()
+ */
+ ret = reiser4_update_sd(inode);
+ if (ret) {
+ warning("edward-1574",
+ "Can not update stat-data: %i. FSCK?",
+ ret);
+ context_set_commit_async(ctx);
+ break;
+ }
+ }
+ /*
+ * tell VM how many pages were dirtied. Maybe number of pages
+ * which were dirty already should not be counted
+ */
+ reiser4_throttle_write(inode);
+ left -= written;
+ buf += written;
+ *pos += written;
+ }
+ if (ret == 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
+ reiser4_txn_restart_current();
+ grab_space_enable();
+ ret = reiser4_sync_file_common(file, 0, LONG_MAX,
+ 0 /* data and stat data */);
+ if (ret)
+ warning("edward-2367", "failed to sync file %llu",
+ (unsigned long long)get_inode_oid(inode));
+ }
+ /*
+	 * return the number of written bytes, or an error code if nothing
+	 * was written. Note that this does not work correctly in the case
+	 * when the sync above returns an error
+ */
+ return (count - left) ? (count - left) : ret;
+}
+
+static inline int readpages_filler_stripe(void *data, struct page *page)
+{
+ return reiser4_readpages_filler_generic(data, page, 1);
+}
+
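+/*
+ * ->readpages() method of address space operations for striped-file
+ * plugin: delegates to the generic implementation with a
+ * stripe-aware filler
+ */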
+int readpages_stripe(struct file *file, struct address_space *mapping,
+ struct list_head *pages, unsigned nr_pages)
+{
+ return reiser4_readpages_generic(file, mapping, pages, nr_pages,
+ readpages_filler_stripe);
+}
+
+void validate_extended_coord(uf_coord_t *uf_coord, loff_t offset);
+/**
+ * ->readpage() method of address space operations for striped-file plugin
+ */
+int readpage_stripe(struct file *file, struct page *page)
+{
+ reiser4_context *ctx;
+ int result;
+ struct inode *inode;
+ reiser4_key key;
+ hint_t *hint;
+ lock_handle *lh;
+ coord_t *coord;
+
+ assert("vs-1062", PageLocked(page));
+ assert("vs-976", !PageUptodate(page));
+ assert("vs-1061", page->mapping && page->mapping->host);
+
+ inode = page->mapping->host;
+
+ if (inode->i_size <= page_offset(page)) {
+ /* page is out of file */
+ zero_user(page, 0, PAGE_SIZE);
+ SetPageUptodate(page);
+ unlock_page(page);
+ return 0;
+ }
+ ctx = reiser4_init_context(inode->i_sb);
+ if (IS_ERR(ctx)) {
+ unlock_page(page);
+ return PTR_ERR(ctx);
+ }
+ hint = kmalloc(sizeof(*hint), reiser4_ctx_gfp_mask_get());
+ if (hint == NULL) {
+ unlock_page(page);
+ reiser4_exit_context(ctx);
+ return RETERR(-ENOMEM);
+ }
+
+ result = load_file_hint(file, hint);
+ if (result) {
+ kfree(hint);
+ unlock_page(page);
+ reiser4_exit_context(ctx);
+ return result;
+ }
+ lh = &hint->lh;
+ /*
+ * construct key of the page's first byte
+ */
+ build_body_key_stripe(inode, page_offset(page), &key);
+ /*
+ * look for file metadata corresponding to the page's first byte
+ */
+ get_page(page);
+ unlock_page(page);
+ result = find_file_item_nohint(&hint->ext_coord.coord,
+ hint->ext_coord.lh, &key,
+ ZNODE_READ_LOCK, inode);
+ lock_page(page);
+ put_page(page);
+
+ if (page->mapping == NULL) {
+ /*
+ * readpage allows truncate to run concurrently.
+ * Page was truncated while it was not locked
+ */
+ done_lh(lh);
+ kfree(hint);
+ unlock_page(page);
+ reiser4_txn_restart(ctx);
+ reiser4_exit_context(ctx);
+ return -EINVAL;
+ }
+ if (IS_CBKERR(result)) {
+ done_lh(lh);
+ kfree(hint);
+ unlock_page(page);
+ reiser4_txn_restart(ctx);
+ reiser4_exit_context(ctx);
+ return result;
+ }
+ if (PageUptodate(page)) {
+ done_lh(lh);
+ kfree(hint);
+ unlock_page(page);
+ reiser4_txn_restart(ctx);
+ reiser4_exit_context(ctx);
+ return 0;
+ }
+ coord = &hint->ext_coord.coord;
+ result = zload(coord->node);
+ if (result) {
+ done_lh(lh);
+ kfree(hint);
+ unlock_page(page);
+ reiser4_txn_restart(ctx);
+ reiser4_exit_context(ctx);
+ return result;
+ }
+ validate_extended_coord(&hint->ext_coord, page_offset(page));
+
+ if (coord_is_existing_unit(coord)) {
+ result = reiser4_readpage_extent(coord, page);
+ assert("edward-2032", result == 0);
+ } else {
+ /* hole in the file */
+ result = __reiser4_readpage_extent(NULL, NULL, 0, page);
+ assert("edward-2033", result == 0);
+ }
+ if (result) {
+ unlock_page(page);
+ reiser4_unset_hint(hint);
+ } else {
+ build_body_key_stripe(inode,
+ (loff_t)(page->index + 1) << PAGE_SHIFT,
+ &key);
+ /* FIXME should call reiser4_set_hint() */
+ reiser4_unset_hint(hint);
+ }
+ assert("edward-2034",
+ ergo(result == 0, (PageLocked(page) || PageUptodate(page))));
+ assert("edward-2035",
+ ergo(result != 0, !PageLocked(page)));
+ zrelse(coord->node);
+ done_lh(lh);
+
+ kfree(hint);
+ /*
+ * FIXME: explain why it is needed. HINT: page allocation in write can
+ * not be done when atom is not NULL because reiser4_writepage can not
+ * kick entd and have to eflush
+ */
+ reiser4_txn_restart(ctx);
+ reiser4_exit_context(ctx);
+ return result;
+}
+
+#define CUT_TREE_MIN_ITERATIONS 64
+
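+/*
+ * Cut items with keys in [@from_key, @to_key] node by node, moving
+ * from right to left. A long truncate is interrupted with -E_REPEAT
+ * once more than CUT_TREE_MIN_ITERATIONS nodes have been processed
+ * and the current atom wants to commit
+ */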
+int cut_tree_worker_stripe(tap_t *tap, const reiser4_key *from_key,
+ const reiser4_key *to_key,
+ reiser4_key *smallest_removed, struct inode *object,
+ int truncate, int *progress)
+{
+ int ret;
+ coord_t left_coord;
+ reiser4_key left_key;
+ reiser4_key right_key;
+ lock_handle next_node_lock;
+
+ assert("edward-2287", tap->coord->node != NULL);
+ assert("edward-2288", znode_is_write_locked(tap->coord->node));
+
+ *progress = 0;
+ init_lh(&next_node_lock);
+
+ while (1) {
+ znode *node = tap->coord->node;
+
+ ret = reiser4_get_left_neighbor(&next_node_lock, node,
+ ZNODE_WRITE_LOCK,
+ GN_CAN_USE_UPPER_LEVELS);
+ if (ret != 0 && ret != -E_NO_NEIGHBOR)
+ break;
+ ret = reiser4_tap_load(tap);
+ if (ret)
+ break;
+ if (*progress)
+ /* prepare right point */
+ coord_init_last_unit(tap->coord, node);
+ /* prepare left point */
+ ret = node_plugin_by_node(node)->lookup(node, from_key,
+ FIND_MAX_NOT_MORE_THAN,
+ &left_coord);
+ if (IS_CBKERR(ret))
+ break;
+ /*
+ * adjust coordinates so that they are set to existing units
+ */
+ if (coord_set_to_right(&left_coord) ||
+ coord_set_to_left(tap->coord)) {
+ ret = CBK_COORD_NOTFOUND;
+ break;
+ }
+ if (coord_compare(&left_coord, tap->coord) ==
+ COORD_CMP_ON_RIGHT) {
+ /* no keys of [from_key, @to_key] in the tree */
+ ret = CBK_COORD_NOTFOUND;
+ break;
+ }
+ /*
+ * Make keys precise.
+ * Set right_key to last byte of the item at tap->coord
+ */
+ item_key_by_coord(tap->coord, &right_key);
+ set_key_offset(&right_key,
+ get_key_offset(&right_key) +
+ reiser4_extent_size(tap->coord) - 1);
+ assert("edward-2289",
+ get_key_offset(&right_key) <= get_key_offset(to_key));
+ assert("edward-2290",
+ get_key_offset(from_key) <= get_key_offset(&right_key));
+ /*
+ * @from_key may not exist in the tree
+ */
+ unit_key_by_coord(&left_coord, &left_key);
+
+ if (get_key_offset(&left_key) < get_key_offset(from_key))
+ set_key_offset(&left_key, get_key_offset(from_key));
+
+ /* cut data from one node */
+ ret = kill_node_content(&left_coord, tap->coord,
+ &left_key, &right_key,
+ smallest_removed,
+ next_node_lock.node /* left neighbor */,
+ object, truncate);
+ reiser4_tap_relse(tap);
+ if (ret)
+ break;
+ (*progress)++;
+ if (keyle(smallest_removed, from_key))
+ break;
+ if (next_node_lock.node == NULL)
+ break;
+ ret = reiser4_tap_move(tap, &next_node_lock);
+ done_lh(&next_node_lock);
+ if (ret)
+ break;
+ /* break long truncate if atom requires commit */
+
+ if (*progress > CUT_TREE_MIN_ITERATIONS &&
+ current_atom_should_commit()) {
+ ret = -E_REPEAT;
+ break;
+ }
+ }
+ done_lh(&next_node_lock);
+ return ret;
+}
+
+/**
+ * Cut body of a striped file
+ */
+static int cut_file_items_stripe(struct inode *inode, loff_t new_size,
+ int update_sd, loff_t cur_size,
+ reiser4_key *smallest_removed,
+ int (*update_size_fn) (struct inode *,
+ loff_t, int))
+{
+ int ret = 0;
+ reiser4_key from_key, to_key;
+ reiser4_tree *tree = meta_subvol_tree();
+
+ assert("edward-2021", inode_file_plugin(inode) ==
+ file_plugin_by_id(STRIPED_FILE_PLUGIN_ID));
+
+ build_body_key_stripe(inode, cur_size - 1, &to_key);
+ from_key = to_key;
+ set_key_offset(&from_key, new_size);
+
+ while (1) {
+ int progress = 0;
+ /*
+ * this takes sbinfo->delete_mutex
+ */
+ ret = reserve_cut_iteration(inode);
+ if (ret)
+ return ret;
+
+ ret = reiser4_cut_tree_object(tree,
+ &from_key, &to_key,
+ smallest_removed, inode,
+ 1 /* truncate */, &progress);
+ assert("edward-2291", ret != -E_NO_NEIGHBOR);
+
+ if (ret == -E_REPEAT) {
+ if (progress) {
+ ret = update_size_fn(inode,
+ get_key_offset(smallest_removed),
+ update_sd);
+ if (ret)
+ break;
+ }
+ /* this releases sbinfo->delete_mutex */
+
+ reiser4_release_reserved(inode->i_sb);
+ reiser4_txn_restart_current();
+ continue;
+ } else if (ret == 0 || ret == CBK_COORD_NOTFOUND)
+ ret = update_size_fn(inode, new_size, update_sd);
+ break;
+ }
+ /* this releases sbinfo->delete_mutex */
+
+ reiser4_release_reserved(inode->i_sb);
+ return ret;
+}
+
+#if 0
+static void check_partial_page_truncate(struct inode *inode,
+ reiser4_key *smallest_removed)
+{
+ int ret;
+ reiser4_key key;
+ lock_handle lh;
+ coord_t coord;
+
+ init_lh(&lh);
+ memcpy(&key, smallest_removed, sizeof(key));
+ set_key_offset(&key, round_down(inode->i_size, PAGE_SIZE));
+
+ ret = find_file_item_nohint(&coord, &lh, &key,
+ ZNODE_READ_LOCK, inode);
+ done_lh(&lh);
+ assert("edward-2369", ret == 0);
+ assert("edward-2370", coord.between == AT_UNIT);
+}
+#endif
+
+#if REISER4_DEBUG
+static void check_truncate_jnodes(struct inode *inode, pgoff_t start)
+{
+ int ret;
+ jnode *node = NULL;
+
+ read_lock_tree();
+ ret = radix_tree_gang_lookup(jnode_tree_by_reiser4_inode(reiser4_inode_data(inode)),
+				     (void **)&node, start, 1);
+ read_unlock_tree();
+ if (ret)
+ warning("edward-2467", "found jnode index=%lu, file_size=%llu",
+ index_jnode(node), inode->i_size);
+}
+#endif
+
+/**
+ * Exclusive access to the file must be acquired
+ */
+static int shorten_stripe(struct inode *inode, loff_t new_size)
+{
+ int result;
+ struct page *page;
+ int padd_from;
+ unsigned long index;
+ reiser4_key smallest_removed;
+
+ memcpy(&smallest_removed,
+ reiser4_max_key(), sizeof(smallest_removed));
+ /*
+ * cut file body
+ */
+ result = cut_file_items_stripe(inode, new_size,
+ 1, /* update_sd */
+ get_key_offset(reiser4_max_key()),
+ &smallest_removed,
+ reiser4_update_file_size);
+ if (result)
+ return result;
+ assert("vs-1105", new_size == inode->i_size);
+ /*
+ * drop all the pages that don't have jnodes (i.e. pages
+	 * drop all the pages that don't have jnodes (i.e. pages
+	 * which can not be truncated by cut_file_items() because
+	 * of holes, which are not represented by any items), so
+	 * that we can't call kill hooks to truncate them like in
+	 * the case of classic unix-files
+ truncate_inode_pages(inode->i_mapping, round_up(new_size, PAGE_SIZE));
+ ON_DEBUG(check_truncate_jnodes(inode,
+ round_up(new_size, PAGE_SIZE) >> PAGE_SHIFT));
+
+ padd_from = inode->i_size & (PAGE_SIZE - 1);
+ if (!padd_from)
+ /* file is truncated to page boundary */
+ return 0;
+ if (get_key_offset(&smallest_removed) != new_size)
+ /*
+ * the cut offset is in the logical block, which is
+ * not represented by a block pointer in the tree -
+ * there is no need to handle partial page truncate
+ */
+ return 0;
+ /*
+ * Handle partial page truncate.
+ * Reserve space on meta-data brick
+ */
+ grab_space_enable();
+ result = reiser4_grab_reserved(reiser4_get_current_sb(),
+ estimate_write_stripe_meta(1),
+ BA_CAN_COMMIT,
+ get_meta_subvol());
+ if (result) {
+ assert("edward-2295",
+ get_current_super_private()->delete_mutex_owner == NULL);
+ return result;
+ }
+ /*
+	 * reserve space on the data brick where the partially
+	 * truncated page should be stored
+ */
+ grab_space_enable();
+ result = reiser4_grab_reserved(reiser4_get_current_sb(),
+ 1, /* count */
+ BA_CAN_COMMIT,
+ subvol_by_key(&smallest_removed));
+ if (result)
+ return result;
+ /*
+ * zero content of partially truncated page
+ */
+ index = (inode->i_size >> PAGE_SHIFT);
+
+ page = read_mapping_page(inode->i_mapping, index, NULL);
+ if (IS_ERR(page)) {
+ reiser4_release_reserved(inode->i_sb);
+ return PTR_ERR(page);
+ }
+ wait_on_page_locked(page);
+ if (!PageUptodate(page)) {
+ put_page(page);
+ reiser4_release_reserved(inode->i_sb);
+ return RETERR(-EIO);
+ }
+ lock_page(page);
+ assert("edward-2036", PageLocked(page));
+ zero_user_segment(page, padd_from, PAGE_SIZE);
+ unlock_page(page);
+
+ result = find_or_create_extent_stripe(page, UPX_TRUNCATE);
+
+ reiser4_release_reserved(inode->i_sb);
+ put_page(page);
+ return result;
+}
+
+static int truncate_body_stripe(struct inode *inode, struct iattr *attr)
+{
+ loff_t new_size = attr->ia_size;
+
+ if (inode->i_size < new_size) {
+ /* expand */
+ return reiser4_update_file_size(inode, new_size, 1);
+ } else if (inode->i_size > new_size)
+ /* shrink */
+ return shorten_stripe(inode, new_size);
+ return 0;
+}
+
+int setattr_stripe(struct dentry *dentry, struct iattr *attr)
+{
+ return reiser4_setattr_generic(dentry, attr, truncate_body_stripe);
+}
+
+int delete_object_stripe(struct inode *inode)
+{
+ struct unix_file_info *uf_info;
+ int result;
+
+ if (reiser4_inode_get_flag(inode, REISER4_NO_SD))
+ return 0;
+
+ /* truncate file body first */
+ uf_info = unix_file_inode_data(inode);
+
+ get_exclusive_access(uf_info);
+ result = shorten_stripe(inode, 0 /* new size */);
+ drop_exclusive_access(uf_info);
+
+ if (unlikely(result != 0))
+ warning("edward-2037",
+ "failed to truncate striped file (%llu) on removal: %d",
+ get_inode_oid(inode), result);
+
+ /* remove stat data and safe link */
+ return reiser4_delete_object_common(inode);
+}
+
+int create_object_stripe(struct inode *object, struct inode *parent,
+ reiser4_object_create_data *data, oid_t *oid)
+{
+ reiser4_inode *info;
+
+ assert("edward-2038", object != NULL);
+ assert("edward-2039", parent != NULL);
+ assert("edward-2040", data != NULL);
+ assert("edward-2041", reiser4_inode_get_flag(object, REISER4_NO_SD));
+ assert("edward-2042", data->id == STRIPED_FILE_PLUGIN_ID);
+
+ info = reiser4_inode_data(object);
+
+ assert("edward-2043", info != NULL);
+ /*
+	 * Since the striped file plugin is not the default, we
+	 * need to store its id in a stat-data extension
+ */
+ info->plugin_mask |= (1 << PSET_FILE);
+
+ return write_sd_by_inode_common(object, oid);
+}
+
+int open_stripe(struct inode *inode, struct file *file)
+{
+ /*
+ * nothing to do at open time
+ */
+ return 0;
+}
+
+int release_stripe(struct inode *inode, struct file *file)
+{
+ reiser4_free_file_fsdata(file);
+ return 0;
+}
+
+/**
+ * Capture one anonymous page.
+ * Exclusive, or non-exclusive access to the file must be acquired.
+ */
+static int capture_anon_page(struct page *page)
+{
+ int ret;
+ struct inode *inode;
+
+ if (PageWriteback(page))
+ /*
+ * FIXME: do nothing?
+ */
+ return 0;
+ assert("edward-2044", page->mapping && page->mapping->host);
+
+ inode = page->mapping->host;
+
+ assert("edward-2045", inode->i_size > page_offset(page));
+ /*
+ * reserve space on meta-data brick
+ */
+ grab_space_enable();
+ ret = reiser4_grab_space(estimate_write_stripe_meta(1),
+ 0, /* flags */
+ get_meta_subvol() /* where */);
+ if (ret)
+ return ret;
+ ret = find_or_create_extent_stripe(page, 0);
+ if (ret == -ENOSPC &&
+ reiser4_is_set(reiser4_get_current_sb(), REISER4_PROXY_IO))
+ ret = find_or_create_extent_stripe(page, UPX_PROXY_FULL);
+ if (ret) {
+ SetPageError(page);
+ warning("edward-2046",
+ "Failed to capture anon page of striped file: %i", ret);
+ } else
+ ret = 1;
+ return ret;
+}
+
+int sync_jnode(jnode *node)
+{
+ int result;
+
+ assert("edward-2452", node != NULL);
+ assert("edward-2453", get_current_context() != NULL);
+ assert("edward-2454", get_current_context()->trans != NULL);
+
+ do {
+ txn_atom *atom;
+
+ spin_lock_jnode(node);
+ atom = jnode_get_atom(node);
+ spin_unlock_jnode(node);
+ result = reiser4_sync_atom(atom);
+
+ } while (result == -E_REPEAT);
+
+ assert("edward-2455",
+ ergo(result == 0,
+ get_current_context()->trans->atom == NULL));
+ return result;
+}
+
+int sync_jnode_list(struct inode *inode)
+{
+ int result = 0;
+ unsigned long from; /* start index for radix_tree_gang_lookup */
+ unsigned int found; /* return value for radix_tree_gang_lookup */
+
+ from = 0;
+ read_lock_tree();
+ while (result == 0) {
+ jnode *node = NULL;
+
+ found = radix_tree_gang_lookup(jnode_tree_by_inode(inode),
+ (void **)&node, from, 1);
+ if (found == 0)
+ break;
+ assert("edward-2456", node != NULL);
+ /**
+		 * the node cannot leave the radix tree because it is
+		 * protected from truncation by the exclusive lock
+ */
+ jref(node);
+ read_unlock_tree();
+
+ from = node->key.j.index + 1;
+
+ result = sync_jnode(node);
+
+ jput(node);
+ read_lock_tree();
+ }
+ read_unlock_tree();
+ return result;
+}
+
+static int commit_stripe_atoms(struct inode *inode)
+{
+ int ret;
+
+ reiser4_txn_restart_current();
+ ret =
+ /*
+ * when we are called by
+ * filemap_fdatawrite->
+ * do_writepages()->
+ * reiser4_writepages_dispatch()
+ *
+	 * inode->i_mapping->dirty_pages are spliced into
+ * ->io_pages, leaving ->dirty_pages dirty.
+ *
+ * When we are called from
+ * reiser4_fsync()->sync_unix_file(), we have to
+ * commit atoms of all pages on the ->dirty_list.
+ *
+ * So for simplicity we just commit ->io_pages and
+ * ->dirty_pages.
+ */
+ reiser4_sync_page_list(inode);
+ /*
+ * commit current transaction: there can be captured nodes from
+ * find_file_state() and finish_conversion().
+ */
+ reiser4_txn_restart_current();
+ return ret;
+}
+
+int writepages_stripe(struct address_space *mapping,
+ struct writeback_control *wbc)
+{
+ return reiser4_writepages_generic(mapping, wbc,
+ capture_anon_page,
+ commit_stripe_atoms);
+}
+
+int ioctl_stripe(struct file *filp, unsigned int cmd, unsigned long arg)
+{
+ return reiser4_ioctl_volume(filp, cmd, arg, reiser4_volume_op_file);
+}
+
+/**
+ * implementation of ->write_begin() address space operation
+ * for striped-file plugin
+ */
+int write_begin_stripe(struct file *file, struct page *page,
+ loff_t pos, unsigned len, void **fsdata)
+{
+ int ret;
+ /*
+ * Reserve space on meta-data brick.
+ * In particular, it is needed to "drill" the leaf level
+ * by search procedure.
+ */
+ grab_space_enable();
+ ret = reiser4_grab_space(estimate_write_stripe_meta(1),
+ 0, /* flags */
+ get_meta_subvol() /* where */);
+ if (ret)
+ return ret;
+
+ get_nonexclusive_access(unix_file_inode_data(file_inode(file)));
+ ret = reiser4_write_begin_common(file, page, pos, len,
+ readpage_stripe);
+ if (unlikely(ret != 0))
+ drop_nonexclusive_access(unix_file_inode_data(file_inode(file)));
+ return ret;
+}
+
+/**
+ * Implementation of ->write_end() address space operation
+ * for striped-file plugin
+ */
+int write_end_stripe(struct file *file, struct page *page,
+ loff_t pos, unsigned copied, void *fsdata)
+{
+ int ret;
+ struct inode *inode;
+ struct unix_file_info *info;
+
+ inode = file_inode(file);
+ info = unix_file_inode_data(inode);
+
+ unlock_page(page);
+ ret = find_or_create_extent_stripe(page, 0);
+ if (ret == -ENOSPC && reiser4_is_set(reiser4_get_current_sb(),
+ REISER4_PROXY_IO))
+ ret = find_or_create_extent_stripe(page, UPX_PROXY_FULL);
+ if (ret) {
+ SetPageError(page);
+ goto exit;
+ }
+ if (pos + copied > inode->i_size) {
+ INODE_SET_FIELD(inode, i_size, pos + copied);
+ ret = reiser4_update_sd(inode);
+ if (unlikely(ret != 0))
+ warning("edward-2431",
+ "Can not update stat-data: %i. FSCK?",
+ ret);
+ }
+ exit:
+ drop_nonexclusive_access(unix_file_inode_data(file_inode(file)));
+ return ret;
+}
+
+/**
+ * Migrate data blocks of a regular file specified by @inode
+ * Exclusive access to the file should be acquired by caller.
+ *
+ * Implementation details:
+ * Scan file body from right to left, read all pages which should
+ * get location on other bricks, and make them dirty. In flush time
+ * those pages will get disk addresses on the new bricks.
+ *
+ * IMPORTANT: This implementation assumes that logical order on
+ * the file coincides with the physical order.
+ */
+static int __migrate_stripe(struct inode *inode, u64 *dst_id)
+{
+ int ret;
+ reiser4_key key; /* search key */
+ reiser4_key ikey; /* item key */
+ struct unix_file_info *uf;
+ coord_t coord;
+ lock_handle lh;
+ item_plugin *iplug;
+
+ /*
+ * commit all file atoms before migration!
+ */
+ reiser4_txn_restart_current();
+ ret = sync_jnode_list(inode);
+ reiser4_txn_restart_current();
+ if (ret)
+ return ret;
+ all_grabbed2free();
+ /*
+ * Reserve space for the first iteration of the migration
+ * procedure. We grab from reserved area, as rebalancing can
+ * be launched on a volume with no free space.
+ */
+ ret = reserve_migration_iter();
+ if (ret)
+ return ret;
+ uf = unix_file_inode_data(inode);
+
+ reiser4_inode_set_flag(inode, REISER4_FILE_IN_MIGRATION);
+
+ build_body_key_stripe(inode, get_key_offset(reiser4_max_key()),
+ &key);
+ while (1) {
+ znode *loaded;
+ loff_t done_off;
+
+ init_lh(&lh);
+ ret = coord_by_key(meta_subvol_tree(), &key,
+ &coord, &lh, ZNODE_WRITE_LOCK,
+ FIND_MAX_NOT_MORE_THAN,
+ TWIG_LEVEL, TWIG_LEVEL,
+ CBK_UNIQUE, NULL);
+ if (IS_CBKERR(ret)) {
+ done_lh(&lh);
+ reiser4_release_reserved(inode->i_sb);
+ return ret;
+ }
+ ret = zload(coord.node);
+ if (ret) {
+ done_lh(&lh);
+ reiser4_release_reserved(inode->i_sb);
+ return ret;
+ }
+ loaded = coord.node;
+
+ coord_set_to_left(&coord);
+ if (!coord_is_existing_item(&coord)) {
+ /*
+ * nothing to migrate
+ */
+ zrelse(loaded);
+ goto done;
+ }
+ /*
+ * check that found item belongs to the file
+ */
+ if (!inode_file_plugin(inode)->owns_item(inode, &coord)) {
+ zrelse(loaded);
+ goto done;
+ }
+ item_key_by_coord(&coord, &ikey);
+ iplug = item_plugin_by_coord(&coord);
+ assert("edward-2349", iplug->v.migrate != NULL);
+ zrelse(loaded);
+ /*
+ * Migrate data blocks (from right to left) pointed
+ * out by the found extent item at @coord.
+ * On success (ret == 0 || ret == -E_REPEAT) at least
+ * one of the mentioned blocks has to be migrated. In
+ * this case @done_off contains offset of the leftmost
+ * migrated byte
+ */
+ ret = iplug->v.migrate(&coord, &ikey, &lh, inode, &done_off,
+ dst_id);
+ done_lh(&lh);
+ reiser4_release_reserved(inode->i_sb);
+ if (ret && ret != -E_REPEAT)
+ return ret;
+ if (done_off == 0)
+ /* nothing to migrate any more */
+ break;
+ /*
+ * look for the item, which points out to the
+ * rightmost not processed block
+ */
+ set_key_offset(&key, done_off - 1);
+ }
+ done:
+ /*
+ * The whole file has been successfully migrated.
+ * Clean up unbalanced status
+ */
+ assert("edward-2104", reiser4_lock_counters()->d_refs == 0);
+ done_lh(&lh);
+ reiser4_inode_clr_flag(inode, REISER4_FILE_IN_MIGRATION);
+ return 0;
+}
+
+int migrate_stripe(struct inode *inode, u64 *dst_id)
+{
+ int ret;
+
+ get_exclusive_access(unix_file_inode_data(inode));
+ ret = __migrate_stripe(inode, dst_id);
+ drop_exclusive_access(unix_file_inode_data(inode));
+ return ret;
+}
+
+/*
+ * Local variables:
+ * c-indentation-style: "K&R"
+ * mode-name: "LC"
+ * c-basic-offset: 8
+ * tab-width: 8
+ * fill-column: 80
+ * scroll-step: 1
+ * End:
+ */
diff -urN --no-dereference linux-5.10.2.orig/fs/reiser4/plugin/file/symfile.c linux-5.10.2/fs/reiser4/plugin/file/symfile.c
--- linux-5.10.2.orig/fs/reiser4/plugin/file/symfile.c 1970-01-01 01:00:00.000000000 +0100
+++ linux-5.10.2/fs/reiser4/plugin/file/symfile.c 2020-12-23 16:07:46.125813231 +0100
@@ -0,0 +1,87 @@
+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
+
+/* Symfiles are a generalization of Unix symlinks.
+
+ A symfile when read behaves as though you took its contents and
+ substituted them into the reiser4 naming system as the right hand side
+ of an assignment, and then read that which you had assigned to it.
+
+ A key issue for symfiles is how to implement writes through to
+ subfiles. In general, one must have some method of determining what
+ of that which is written to the symfile is written to what subfile.
+ This can be done by use of custom plugin methods written by users, or
+ by using a few general methods we provide for those willing to endure
+ the insertion of delimiters into what is read.
+
+ Writing to symfiles without delimiters to denote what is written to
+ what subfile is not supported by any plugins we provide in this
+ release. Our most sophisticated support for writes is that embodied
+ by the invert plugin (see invert.c).
+
+ A read only version of the /etc/passwd file might be
+ constructed as a symfile whose contents are as follows:
+
+ /etc/passwd/userlines/*
+
+ or
+
+ /etc/passwd/userlines/demidov+/etc/passwd/userlines/edward+/etc/passwd/userlines/reiser+/etc/passwd/userlines/root
+
+ or
+
+ /etc/passwd/userlines/(demidov+edward+reiser+root)
+
+ A symfile with contents
+
+ /filenameA+"(some text stored in the uninvertable symfile)+/filenameB
+
+ will return when read
+
+ The contents of filenameAsome text stored in the uninvertable symfileThe contents of filenameB
+
+ and write of what has been read will not be possible to implement as
+ an identity operation because there are no delimiters denoting the
+ boundaries of what is to be written to what subfile.
+
+ Note that one could make this a read/write symfile if one specified
+ delimiters, and the write method understood those delimiters delimited
+ what was written to subfiles.
+
+ So, specifying the symfile in a manner that allows writes:
+
+ /etc/passwd/userlines/demidov+"(
+ )+/etc/passwd/userlines/edward+"(
+ )+/etc/passwd/userlines/reiser+"(
+ )+/etc/passwd/userlines/root+"(
+ )
+
+ or
+
+ /etc/passwd/userlines/(demidov+"(
+ )+edward+"(
+ )+reiser+"(
+ )+root+"(
+ ))
+
+ and the file demidov might be specified as:
+
+ /etc/passwd/userlines/demidov/username+"(:)+/etc/passwd/userlines/demidov/password+"(:)+/etc/passwd/userlines/demidov/userid+"(:)+/etc/passwd/userlines/demidov/groupid+"(:)+/etc/passwd/userlines/demidov/gecos+"(:)+/etc/passwd/userlines/demidov/home+"(:)+/etc/passwd/userlines/demidov/shell
+
+ or
+
+ /etc/passwd/userlines/demidov/(username+"(:)+password+"(:)+userid+"(:)+groupid+"(:)+gecos+"(:)+home+"(:)+shell)
+
+ Notice that if the file demidov has a carriage return in it, the
+ parsing fails, but then if you put carriage returns in the wrong place
+ in a normal /etc/passwd file it breaks things also.
+
+ Note that it is forbidden to have no text between two interpolations
+ if one wants to be able to define what parts of a write go to what
+ subfiles referenced in an interpolation.
+
+ If one wants to be able to add new lines by writing to the file, one
+ must either write a custom plugin for /etc/passwd that knows how to
+ name an added line, or one must use an invert, or one must use a more
+ sophisticated symfile syntax that we are not planning to write for
+ version 4.0.
+*/
diff -urN --no-dereference linux-5.10.2.orig/fs/reiser4/plugin/file/symlink.c linux-5.10.2/fs/reiser4/plugin/file/symlink.c
--- linux-5.10.2.orig/fs/reiser4/plugin/file/symlink.c 1970-01-01 01:00:00.000000000 +0100
+++ linux-5.10.2/fs/reiser4/plugin/file/symlink.c 2020-12-23 16:07:46.125813231 +0100
@@ -0,0 +1,92 @@
+/* Copyright 2002, 2003, 2005 by Hans Reiser, licensing governed by reiser4/README */
+
+#include "../../inode.h"
+
+#include <linux/types.h>
+#include <linux/fs.h>
+
+/* file plugin methods specific for symlink files
+ (SYMLINK_FILE_PLUGIN_ID) */
+
+/* this is implementation of create_object method of file plugin for
+ SYMLINK_FILE_PLUGIN_ID
+ */
+
+/**
+ * reiser4_create_symlink - create a symlink (object managed by
+ * file plugin with SYMLINK_FILE_PLUGIN_ID).
+ * Inserts stat-data with the symlink extension into the tree.
+ *
+ * @symlink: inode of symlink object
+ * @dir: inode of parent directory
+ * @data: parameters of new object (in particular, filled by reiser4() syscall)
+ */
+int reiser4_create_symlink(struct inode *symlink,
+ struct inode *dir UNUSED_ARG,
+ reiser4_object_create_data *data, oid_t *oid)
+{
+ int result;
+
+ assert("nikita-680", symlink != NULL);
+ assert("nikita-681", S_ISLNK(symlink->i_mode));
+ assert("nikita-685", reiser4_inode_get_flag(symlink, REISER4_NO_SD));
+ assert("nikita-682", dir != NULL);
+ assert("nikita-684", data != NULL);
+ assert("nikita-686", data->id == SYMLINK_FILE_PLUGIN_ID);
+ /*
+ * stat data of symlink has symlink extension in which we store
+	 * symlink content, that is, the path the symlink points to.
+ */
+ reiser4_inode_data(symlink)->extmask |= (1 << SYMLINK_STAT);
+
+ assert("vs-838", symlink->i_private == NULL);
+ symlink->i_private = (void *)data->name;
+
+ assert("vs-843", symlink->i_size == 0);
+ INODE_SET_FIELD(symlink, i_size, strlen(data->name));
+
+ /* insert stat data appended with data->name */
+ result = inode_file_plugin(symlink)->write_sd_by_inode(symlink, oid);
+ if (result) {
+ /* FIXME-VS: Make sure that symlink->i_private is not attached
+ to kmalloced data */
+ INODE_SET_FIELD(symlink, i_size, 0);
+ } else {
+ assert("vs-849", symlink->i_private
+ && reiser4_inode_get_flag(symlink,
+ REISER4_GENERIC_PTR_USED));
+ assert("vs-850",
+ !memcmp((char *)symlink->i_private, data->name,
+ (size_t) symlink->i_size + 1));
+ }
+ return result;
+}
+
+/* this is implementation of destroy_inode method of file plugin for
+ SYMLINK_FILE_PLUGIN_ID
+ */
+void destroy_inode_symlink(struct inode *inode)
+{
+ assert("edward-799",
+ inode_file_plugin(inode) ==
+ file_plugin_by_id(SYMLINK_FILE_PLUGIN_ID));
+ assert("edward-800", !is_bad_inode(inode) && is_inode_loaded(inode));
+ assert("edward-801", reiser4_inode_get_flag(inode,
+ REISER4_GENERIC_PTR_USED));
+ assert("vs-839", S_ISLNK(inode->i_mode));
+
+ kfree(inode->i_private);
+ inode->i_private = NULL;
+ reiser4_inode_clr_flag(inode, REISER4_GENERIC_PTR_USED);
+}
+
+/*
+ Local variables:
+ c-indentation-style: "K&R"
+ mode-name: "LC"
+ c-basic-offset: 8
+ tab-width: 8
+ fill-column: 80
+ scroll-step: 1
+ End:
+*/
diff -urN --no-dereference linux-5.10.2.orig/fs/reiser4/plugin/file/tail_conversion.c linux-5.10.2/fs/reiser4/plugin/file/tail_conversion.c
--- linux-5.10.2.orig/fs/reiser4/plugin/file/tail_conversion.c 1970-01-01 01:00:00.000000000 +0100
+++ linux-5.10.2/fs/reiser4/plugin/file/tail_conversion.c 2020-12-23 16:07:46.125813231 +0100
@@ -0,0 +1,810 @@
+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
+
+#include "../../inode.h"
+#include "../../super.h"
+#include "../../page_cache.h"
+#include "../../carry.h"
+#include "../../safe_link.h"
+#include "../../vfs_ops.h"
+
+#include <linux/writeback.h>
+
+/* this file contains:
+ tail2extent and extent2tail */
+
+/**
+ * exclusive access to a file is acquired when file state changes:
+ * tail2extent, empty2tail, extent2tail, etc
+ */
+void get_exclusive_access(struct unix_file_info * uf_info)
+{
+ assert("nikita-3028", reiser4_schedulable());
+ assert("nikita-3047", LOCK_CNT_NIL(inode_sem_w));
+ assert("nikita-3048", LOCK_CNT_NIL(inode_sem_r));
+ /*
+ * "deadlock avoidance": sometimes we commit a transaction under
+ * rw-semaphore on a file. Such commit can deadlock with another
+ * thread that captured some block (hence preventing atom from being
+ * committed) and waits on rw-semaphore.
+ */
+ reiser4_txn_restart_current();
+ LOCK_CNT_INC(inode_sem_w);
+ down_write(&uf_info->latch);
+ uf_info->exclusive_use = 1;
+ assert("vs-1713", uf_info->ea_owner == NULL);
+ assert("vs-1713", atomic_read(&uf_info->nr_neas) == 0);
+ ON_DEBUG(uf_info->ea_owner = current);
+}
+
+void drop_exclusive_access(struct unix_file_info * uf_info)
+{
+ assert("vs-1714", uf_info->ea_owner == current);
+ assert("vs-1715", atomic_read(&uf_info->nr_neas) == 0);
+ ON_DEBUG(uf_info->ea_owner = NULL);
+ uf_info->exclusive_use = 0;
+ up_write(&uf_info->latch);
+ assert("nikita-3049", LOCK_CNT_NIL(inode_sem_r));
+ assert("nikita-3049", LOCK_CNT_GTZ(inode_sem_w));
+ LOCK_CNT_DEC(inode_sem_w);
+ reiser4_txn_restart_current();
+}
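+
+/*
+ * A minimal usage sketch (illustration only, not part of the interface
+ * above): operations that change file state typically bracket their work
+ * with the exclusive-access helpers, e.g.
+ *
+ *	struct unix_file_info *uf_info = unix_file_inode_data(inode);
+ *
+ *	get_exclusive_access(uf_info);
+ *	result = tail2extent(uf_info);
+ *	drop_exclusive_access(uf_info);
+ *
+ * Readers and writers that only need the file state to stay stable take the
+ * non-exclusive variants defined below instead.
+ */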
+
+/**
+ * nea_grabbed - do something when file semaphore is down_read-ed
+ * @uf_info:
+ *
+ * This is called when nonexclusive access is obtained on a file. All it does
+ * is for debugging purposes.
+ */
+static void nea_grabbed(struct unix_file_info *uf_info)
+{
+#if REISER4_DEBUG
+ LOCK_CNT_INC(inode_sem_r);
+ assert("vs-1716", uf_info->ea_owner == NULL);
+ atomic_inc(&uf_info->nr_neas);
+ uf_info->last_reader = current;
+#endif
+}
+
+/**
+ * get_nonexclusive_access - get nonexclusive access to a file
+ * @uf_info: unix file specific part of inode to obtain access to
+ *
+ * Nonexclusive access is obtained on a file before read, write, readpage.
+ */
+void get_nonexclusive_access(struct unix_file_info *uf_info)
+{
+ assert("nikita-3029", reiser4_schedulable());
+ assert("nikita-3361", get_current_context()->trans->atom == NULL);
+
+ down_read(&uf_info->latch);
+ nea_grabbed(uf_info);
+}
+
+/**
+ * try_to_get_nonexclusive_access - try to get nonexclusive access to a file
+ * @uf_info: unix file specific part of inode to obtain access to
+ *
+ * Non-blocking version of nonexclusive access obtaining.
+ */
+int try_to_get_nonexclusive_access(struct unix_file_info *uf_info)
+{
+ int result;
+
+ result = down_read_trylock(&uf_info->latch);
+ if (result)
+ nea_grabbed(uf_info);
+ return result;
+}
+
+void drop_nonexclusive_access(struct unix_file_info * uf_info)
+{
+ assert("vs-1718", uf_info->ea_owner == NULL);
+ assert("vs-1719", atomic_read(&uf_info->nr_neas) > 0);
+ ON_DEBUG(atomic_dec(&uf_info->nr_neas));
+
+ up_read(&uf_info->latch);
+
+ LOCK_CNT_DEC(inode_sem_r);
+ reiser4_txn_restart_current();
+}
+
+/**
+ * part of tail2extent.
+ * Cut all items covering @count bytes starting from @offset
+ */
+static int cut_formatting_items(struct inode *inode, loff_t offset, int count)
+{
+ reiser4_key from, to;
+ /*
+	 * AUDIT: How about putting an assertion here that would check
+	 * that the whole provided range is covered by tail items only?
+ */
+ /*
+ * key of first byte in the range to be cut
+ */
+ build_body_key_unix_file(inode, offset, &from);
+ /*
+ * key of last byte in that range
+ */
+ to = from;
+ set_key_offset(&to, (__u64) (offset + count - 1));
+ /*
+ * cut everything between those keys
+ */
+ return reiser4_cut_tree(meta_subvol_tree(), &from, &to,
+ inode, 0);
+}
+
+static void release_all_pages(struct page **pages, unsigned nr_pages)
+{
+ unsigned i;
+
+ for (i = 0; i < nr_pages; i++) {
+ if (pages[i] == NULL) {
+#if REISER4_DEBUG
+ unsigned j;
+ for (j = i + 1; j < nr_pages; j++)
+ assert("vs-1620", pages[j] == NULL);
+#endif
+ break;
+ }
+ put_page(pages[i]);
+ pages[i] = NULL;
+ }
+}
+
+/**
+ * Part of tail2extent. Replace tail items with an extent item.
+ * Content of the tail items being cut (@count bytes) has already been copied
+ * into pages. Extents corresponding to those pages are then created and
+ * inserted into the tree.
+ */
+static int replace(struct inode *inode, struct page **pages, unsigned nr_pages, int count)
+{
+ int result;
+ unsigned i;
+ STORE_COUNTERS;
+
+ if (nr_pages == 0)
+ return 0;
+
+ assert("vs-596", pages[0]);
+ /*
+ * cut copied items
+ */
+ result = cut_formatting_items(inode, page_offset(pages[0]), count);
+ if (result)
+ return result;
+
+ CHECK_COUNTERS;
+ /*
+	 * put into the tree a replacement for the just removed items:
+	 * an extent item
+ */
+ for (i = 0; i < nr_pages; i++) {
+ result = add_to_page_cache_lru(pages[i], inode->i_mapping,
+ pages[i]->index,
+ mapping_gfp_mask(inode->
+ i_mapping));
+ if (result)
+ break;
+ SetPageUptodate(pages[i]);
+ set_page_dirty_notag(pages[i]);
+ unlock_page(pages[i]);
+ result = find_or_create_extent_unix_file(pages[i]);
+ if (result) {
+ /*
+			 * Failure at a critical point:
+			 * the tail has been removed,
+			 * but the extent hasn't been created
+ */
+ warning("edward-1572",
+ "Report the error code %i to developers. Run FSCK",
+ result);
+ break;
+ }
+ }
+ return result;
+}
+
+#define TAIL2EXTENT_PAGE_NUM 3 /* number of pages to fill before cutting tail
+ * items */
+
+/**
+ * @offset - offset of portion of data to be converted to extent
+ */
+static int reserve_tail2extent_iteration(struct inode *inode, loff_t offset)
+{
+ int ret;
+ reiser4_subvol *subv = get_meta_subvol();
+ /*
+	 * space required for one iteration of tail->extent conversion:
+ *
+ * 1. kill N tail items
+ *
+ * 2. insert TAIL2EXTENT_PAGE_NUM unformatted nodes
+ *
+ * 3. insert TAIL2EXTENT_PAGE_NUM (worst-case single-block
+ * extents) extent units.
+ *
+ * 4. drilling to the leaf level by coord_by_key() - see
+ * comment in the carry_extent()
+ *
+ * 5. possible update of stat-data
+ *
+ * reserve for 2 on data subvolume
+ */
+ grab_space_enable();
+ ret = reiser4_grab_space(TAIL2EXTENT_PAGE_NUM, BA_CAN_COMMIT, subv);
+ if (ret)
+ return ret;
+ /*
+ * reserve for 1,3,4,5 on meta-data subvolume
+ */
+ grab_space_enable();
+ ret = reiser4_grab_space(2 * subv->tree.height +
+ TAIL2EXTENT_PAGE_NUM *
+ estimate_one_insert_into_item(&subv->tree) +
+ 1 + estimate_one_insert_item(&subv->tree) +
+ inode_file_plugin(inode)->estimate.update(inode),
+ BA_CAN_COMMIT,
+ subv);
+ return ret;
+}
+
+/**
+ * Clear stat data's flag indicating that conversion is not completed
+ * and update stat-data
+ */
+static int complete_conversion(struct inode *inode)
+{
+ int result;
+
+ grab_space_enable();
+ result =
+ reiser4_grab_space(inode_file_plugin(inode)->estimate.update(inode),
+ BA_CAN_COMMIT, get_meta_subvol());
+ if (result) {
+ warning("vs-1696", "Failed to clear converting bit of %llu: %i",
+ (unsigned long long)get_inode_oid(inode), result);
+ return result;
+ }
+ reiser4_inode_clr_flag(inode, REISER4_PART_MIXED);
+ return reiser4_update_sd(inode);
+}
+
+/**
+ * find_start
+ * @inode: inode of the file being converted
+ * @id: item plugin id to look for (FORMATTING_ID or EXTENT40_POINTER_ID)
+ * @offset: in/out: offset to start the search from, updated to the offset
+ *          where the search stopped
+ *
+ * This is used by tail2extent and extent2tail to detect where a previous
+ * uncompleted conversion stopped.
+ */
+static int find_start(struct inode *inode, reiser4_plugin_id id, __u64 *offset)
+{
+ int result;
+ lock_handle lh;
+ coord_t coord;
+ struct unix_file_info *ufo;
+ int found;
+ reiser4_key key;
+
+ ufo = unix_file_inode_data(inode);
+ init_lh(&lh);
+ result = 0;
+ found = 0;
+ build_body_key_unix_file(inode, *offset, &key);
+ do {
+ init_lh(&lh);
+ result = find_file_item_nohint(&coord, &lh, &key,
+ ZNODE_READ_LOCK, inode);
+
+ if (result == CBK_COORD_FOUND) {
+ if (coord.between == AT_UNIT) {
+ /*coord_clear_iplug(&coord); */
+ result = zload(coord.node);
+ if (result == 0) {
+ if (item_id_by_coord(&coord) == id)
+ found = 1;
+ else
+ item_plugin_by_coord(&coord)->s.
+ file.append_key(&coord,
+ &key);
+ zrelse(coord.node);
+ }
+ } else
+ result = RETERR(-ENOENT);
+ }
+ done_lh(&lh);
+ } while (result == 0 && !found);
+ *offset = get_key_offset(&key);
+ return result;
+}
+
+int tail2extent(struct unix_file_info *uf_info)
+{
+ int result;
+ reiser4_key key; /* key of next byte to be moved to page */
+ char *p_data; /* data of page */
+ unsigned page_off = 0, /* offset within the page where to copy data */
+ count; /* number of bytes of item which can be
+ * copied to page */
+ struct page *pages[TAIL2EXTENT_PAGE_NUM];
+ struct page *page;
+ int done; /* set to 1 when all file is read */
+ char *item;
+ int i;
+ struct inode *inode;
+ int first_iteration;
+ int bytes;
+ __u64 offset;
+
+ assert("nikita-3362", ea_obtained(uf_info));
+ inode = unix_file_info_to_inode(uf_info);
+ assert("nikita-3412", !IS_RDONLY(inode));
+ assert("vs-1649", uf_info->container != UF_CONTAINER_EXTENTS);
+ assert("", !reiser4_inode_get_flag(inode, REISER4_PART_IN_CONV));
+
+ offset = 0;
+ first_iteration = 1;
+ result = 0;
+ get_current_super_private()->ctx = get_current_context();
+
+ if (reiser4_inode_get_flag(inode, REISER4_PART_MIXED)) {
+ /*
+		 * the file is marked on disk as having a conversion which
+		 * did not complete due to either a crash or some error. Find
+		 * the offset at which the conversion stopped
+ */
+ result = find_start(inode, FORMATTING_ID, &offset);
+ if (result == -ENOENT) {
+ /*
+ * no tail items found, everything is converted
+ */
+ uf_info->container = UF_CONTAINER_EXTENTS;
+ complete_conversion(inode);
+ return 0;
+ } else if (result != 0)
+ /*
+ * some other error
+ */
+ return result;
+ first_iteration = 0;
+ }
+ reiser4_inode_set_flag(inode, REISER4_PART_IN_CONV);
+ /*
+ * get key of first byte of a file
+ */
+ build_body_key_unix_file(inode, offset, &key);
+
+ done = 0;
+ while (done == 0) {
+ memset(pages, 0, sizeof(pages));
+ result = reserve_tail2extent_iteration(inode,
+ get_key_offset(&key));
+ if (result != 0) {
+ reiser4_inode_clr_flag(inode, REISER4_PART_IN_CONV);
+ goto out;
+ }
+ if (first_iteration) {
+ reiser4_inode_set_flag(inode, REISER4_PART_MIXED);
+ reiser4_update_sd(inode);
+ first_iteration = 0;
+ }
+ bytes = 0;
+ for (i = 0; i < sizeof_array(pages) && done == 0; i++) {
+ assert("vs-598",
+ (get_key_offset(&key) & ~PAGE_MASK) == 0);
+ page = alloc_page(reiser4_ctx_gfp_mask_get());
+ if (!page) {
+ result = RETERR(-ENOMEM);
+ goto error;
+ }
+ page->index =
+ (unsigned long)(get_key_offset(&key) >>
+ PAGE_SHIFT);
+ /*
+			 * usually a thread that is going to take a long-term
+			 * lock on a znode (as find_file_item does, for
+			 * instance) must not hold locked pages. However,
+			 * tail2extent is an exception: pages appearing here
+			 * are not reachable by anyone else, they are clean
+			 * and have no jnodes attached, so keeping them locked
+			 * does not risk a deadlock
+ */
+ assert("vs-983", !PagePrivate(page));
+ reiser4_invalidate_pages(inode->i_mapping, page->index,
+ 1, 0);
+ for (page_off = 0; page_off < PAGE_SIZE;) {
+ coord_t coord;
+ lock_handle lh;
+ /*
+ * get next item
+ * FIXME: we might want to readahead here
+ */
+ init_lh(&lh);
+ result = find_file_item_nohint(&coord,
+ &lh, &key,
+ ZNODE_READ_LOCK,
+ inode);
+ if (result != CBK_COORD_FOUND) {
+ /*
+					 * an error happened or no items of
+					 * the file were found
+ */
+ done_lh(&lh);
+ put_page(page);
+ goto error;
+ }
+ if (coord.between == AFTER_UNIT) {
+ /*
+					 * end of file is reached. Pad the
+					 * page with zeros
+ */
+ done_lh(&lh);
+ done = 1;
+ p_data = kmap_atomic(page);
+ memset(p_data + page_off, 0,
+ PAGE_SIZE - page_off);
+ kunmap_atomic(p_data);
+ break;
+ }
+ result = zload(coord.node);
+ if (result) {
+ put_page(page);
+ done_lh(&lh);
+ goto error;
+ }
+ assert("vs-856", coord.between == AT_UNIT);
+ item = ((char *)item_body_by_coord(&coord)) +
+ coord.unit_pos;
+ /*
+ * how many bytes to copy
+ */
+ count = item_length_by_coord(&coord) -
+ coord.unit_pos;
+ /*
+ * limit length of copy to end of page
+ */
+ if (count > PAGE_SIZE - page_off)
+ count = PAGE_SIZE - page_off;
+ /*
+ * copy item (as much as will fit starting from
+ * the beginning of the item) into the page
+ */
+ p_data = kmap_atomic(page);
+ memcpy(p_data + page_off, item, count);
+ kunmap_atomic(p_data);
+
+ page_off += count;
+ bytes += count;
+ set_key_offset(&key,
+ get_key_offset(&key) + count);
+
+ zrelse(coord.node);
+ done_lh(&lh);
+ /*
+ * end of loop which fills one page by
+ * content of formatting items
+ */
+ }
+ if (page_off) {
+ /*
+ * something was copied into page
+ */
+ pages[i] = page;
+ } else {
+ put_page(page);
+ assert("vs-1648", done == 1);
+ break;
+ }
+ /* end of loop through pages of one conversion iteration */
+ }
+ if (i > 0) {
+ result = replace(inode, pages, i, bytes);
+ release_all_pages(pages, sizeof_array(pages));
+ if (result)
+ goto error;
+ /*
+			 * We have to drop exclusive access to avoid a
+			 * deadlock which may happen because capture_unix_file,
+			 * called by reiser4_writepages, requires non-exclusive
+			 * access to the file. It is safe to drop EA in the
+			 * middle of tail2extent conversion because
+			 * write_unix_file, setattr_unix_file(truncate),
+			 * mmap_unix_file and release_unix_file(extent2tail)
+			 * check that conversion is not in progress (see
+			 * comments before get_exclusive_access_careful()).
+ * Other processes that acquire non-exclusive access
+ * (read_unix_file, reiser4_writepages, etc) should work
+ * on partially converted files.
+ */
+ drop_exclusive_access(uf_info);
+ /*
+ * throttle the conversion
+ */
+ reiser4_throttle_write(inode);
+ get_exclusive_access(uf_info);
+ /*
+ * nobody is allowed to complete conversion but a
+ * process which started it
+ */
+ assert("", reiser4_inode_get_flag(inode,
+ REISER4_PART_MIXED));
+ }
+ }
+ if (result == 0) {
+ /*
+ * file is converted to extent items
+ */
+ reiser4_inode_clr_flag(inode, REISER4_PART_IN_CONV);
+ assert("vs-1697", reiser4_inode_get_flag(inode,
+ REISER4_PART_MIXED));
+ uf_info->container = UF_CONTAINER_EXTENTS;
+ complete_conversion(inode);
+ } else {
+ /*
+ * conversion is not complete. Inode was already marked as
+ * REISER4_PART_MIXED and stat-data were updated at the first
+ * iteration of the loop above.
+ */
+ error:
+ release_all_pages(pages, sizeof_array(pages));
+ reiser4_inode_clr_flag(inode, REISER4_PART_IN_CONV);
+ warning("edward-1548", "Partial conversion of %llu: %i",
+ (unsigned long long)get_inode_oid(inode), result);
+ }
+ out:
+ /*
+ * this flag should be cleared, otherwise get_exclusive_access_careful()
+ * will fall into infinite loop
+ */
+ assert("edward-1549", !reiser4_inode_get_flag(inode,
+ REISER4_PART_IN_CONV));
+ return result;
+}
+
+static int reserve_extent2tail_iteration(struct inode *inode)
+{
+ reiser4_subvol *subv = get_meta_subvol();
+ reiser4_tree *tree = &subv->tree;
+ /*
+ * reserve blocks for (in this order):
+ *
+ * 1. removal of extent item
+ *
+ * 2. insertion of tail by insert_flow()
+ *
+ * 3. drilling to the leaf level by coord_by_key()
+ *
+ * 4. possible update of stat-data
+ */
+ grab_space_enable();
+ return reiser4_grab_space(estimate_one_item_removal(tree) +
+ estimate_insert_flow(tree->height) +
+ 1 + estimate_one_insert_item(tree) +
+ inode_file_plugin(inode)->estimate.update(inode),
+ BA_CAN_COMMIT, subv);
+}
+
+/**
+ * for every page of file: read page, cut part of extent pointing to this page,
+ * put data of page tree by tail item
+ */
+int extent2tail(struct file * file, struct unix_file_info *uf_info)
+{
+ int result;
+ struct inode *inode;
+ struct page *page;
+ unsigned long num_pages, i;
+ unsigned long start_page;
+ reiser4_key from;
+ reiser4_key to;
+ unsigned count;
+ __u64 offset;
+
+ assert("nikita-3362", ea_obtained(uf_info));
+ inode = unix_file_info_to_inode(uf_info);
+ assert("nikita-3412", !IS_RDONLY(inode));
+ assert("vs-1649", uf_info->container != UF_CONTAINER_TAILS);
+ assert("", !reiser4_inode_get_flag(inode, REISER4_PART_IN_CONV));
+
+ offset = 0;
+ if (reiser4_inode_get_flag(inode, REISER4_PART_MIXED)) {
+ /*
+		 * the file is marked on disk as having a conversion which
+		 * did not complete due to either a crash or some error. Find
+		 * the offset at which the conversion stopped
+ */
+ result = find_start(inode, EXTENT40_POINTER_ID, &offset);
+ if (result == -ENOENT) {
+ /*
+ * no extent found, everything is converted
+ */
+ uf_info->container = UF_CONTAINER_TAILS;
+ complete_conversion(inode);
+ return 0;
+ } else if (result != 0)
+ /*
+ * some other error
+ */
+ return result;
+ }
+ reiser4_inode_set_flag(inode, REISER4_PART_IN_CONV);
+ /*
+ * number of pages in the file
+ */
+	num_pages =
+	    (inode->i_size - offset + PAGE_SIZE - 1) >> PAGE_SHIFT;
+ start_page = offset >> PAGE_SHIFT;
+
+ build_body_key_unix_file(inode, offset, &from);
+ to = from;
+
+ result = 0;
+ for (i = 0; i < num_pages; i++) {
+ __u64 start_byte;
+
+ result = reserve_extent2tail_iteration(inode);
+ if (result != 0)
+ break;
+ if (i == 0 && offset == 0) {
+ reiser4_inode_set_flag(inode, REISER4_PART_MIXED);
+ reiser4_update_sd(inode);
+ }
+ page = read_mapping_page(inode->i_mapping,
+ (unsigned)(i + start_page), NULL);
+ if (IS_ERR(page)) {
+ result = PTR_ERR(page);
+ warning("edward-1569",
+ "Can not read page %lu of %lu: %i",
+ i, num_pages, result);
+ break;
+ }
+ wait_on_page_locked(page);
+
+ if (!PageUptodate(page)) {
+ put_page(page);
+ result = RETERR(-EIO);
+ break;
+ }
+ /*
+ * cut part of file we have read
+ */
+ start_byte = (__u64) ((i + start_page) << PAGE_SHIFT);
+ set_key_offset(&from, start_byte);
+ set_key_offset(&to, start_byte + PAGE_SIZE - 1);
+ /*
+ * reiser4_cut_tree_object() returns -E_REPEAT to allow atom
+ * commits during over-long truncates. But
+ * extent->tail conversion should be performed in one
+ * transaction.
+ */
+ result = reiser4_cut_tree(meta_subvol_tree(),
+ &from, &to, inode, 0);
+ if (result) {
+ put_page(page);
+ warning("edward-1570",
+ "Can not delete converted chunk: %i",
+ result);
+ break;
+ }
+ /*
+ * put page data into tree via tail_write
+ */
+ count = PAGE_SIZE;
+ if ((i == (num_pages - 1)) &&
+ (inode->i_size & ~PAGE_MASK))
+ /*
+			 * the last page can be incomplete
+ */
+ count = (inode->i_size & ~PAGE_MASK);
+ while (count) {
+ loff_t pos = start_byte;
+
+ assert("edward-1537",
+ file != NULL && file->f_path.dentry != NULL);
+ assert("edward-1538",
+ file_inode(file) == inode);
+
+ result = write_tail_noreserve(file, inode,
+ (char __user *)kmap(page),
+ count, &pos);
+ kunmap(page);
+ /*
+ * FIXME: may be put_file_hint() instead ?
+ */
+ reiser4_free_file_fsdata(file);
+ if (result <= 0) {
+ /*
+				 * Failure at a critical point:
+				 * the extent has been removed,
+				 * but the tail hasn't been created
+ */
+ warning("edward-1571",
+ "Report the error code %i to developers. Run FSCK",
+ result);
+ put_page(page);
+ reiser4_inode_clr_flag(inode,
+ REISER4_PART_IN_CONV);
+ return result;
+ }
+ count -= result;
+ }
+ /*
+ * release page
+ */
+ lock_page(page);
+ /*
+ * page is already detached from jnode and mapping
+ */
+ assert("vs-1086", page->mapping == NULL);
+ assert("nikita-2690",
+ (!PagePrivate(page) && jprivate(page) == 0));
+ /*
+ * waiting for writeback completion with page lock held is
+ * perfectly valid
+ */
+ wait_on_page_writeback(page);
+ reiser4_drop_page(page);
+ /*
+ * release reference taken by read_cache_page() above
+ */
+ put_page(page);
+
+ drop_exclusive_access(uf_info);
+ /*
+ * throttle the conversion
+ */
+ reiser4_throttle_write(inode);
+ get_exclusive_access(uf_info);
+ /*
+ * nobody is allowed to complete conversion but a process which
+ * started it
+ */
+ assert("", reiser4_inode_get_flag(inode, REISER4_PART_MIXED));
+ }
+ reiser4_inode_clr_flag(inode, REISER4_PART_IN_CONV);
+
+ if (i == num_pages) {
+ /*
+ * file is converted to formatting items
+ */
+ assert("vs-1698", reiser4_inode_get_flag(inode,
+ REISER4_PART_MIXED));
+ assert("vs-1260",
+ inode_has_no_jnodes(reiser4_inode_data(inode)));
+ uf_info->container = UF_CONTAINER_TAILS;
+ complete_conversion(inode);
+ return 0;
+ }
+ /*
+ * conversion is not complete. Inode was already marked as
+ * REISER4_PART_MIXED and stat-data were updated at the first
+ * iteration of the loop above.
+ */
+ warning("nikita-2282",
+ "Partial conversion of %llu: %lu of %lu: %i",
+ (unsigned long long)get_inode_oid(inode), i,
+ num_pages, result);
+ /*
+ * this flag should be cleared, otherwise get_exclusive_access_careful()
+ * will fall into infinite loop
+ */
+ assert("edward-1550", !reiser4_inode_get_flag(inode,
+ REISER4_PART_IN_CONV));
+ return result;
+}
+
+/*
+ * Local variables:
+ * c-indentation-style: "K&R"
+ * mode-name: "LC"
+ * c-basic-offset: 8
+ * tab-width: 8
+ * fill-column: 79
+ * scroll-step: 1
+ * End:
+ */
diff -urN --no-dereference linux-5.10.2.orig/fs/reiser4/plugin/file_ops.c linux-5.10.2/fs/reiser4/plugin/file_ops.c
--- linux-5.10.2.orig/fs/reiser4/plugin/file_ops.c 1970-01-01 01:00:00.000000000 +0100
+++ linux-5.10.2/fs/reiser4/plugin/file_ops.c 2020-12-23 16:07:46.125813231 +0100
@@ -0,0 +1,120 @@
+/* Copyright 2005 by Hans Reiser, licensing governed by
+ reiser4/README */
+
+/* this file contains typical implementations for some of methods of
+ struct file_operations and of struct address_space_operations
+*/
+
+#include "../inode.h"
+#include "object.h"
+
+/* file operations */
+
+/* implementation of vfs's llseek method of struct file_operations for
+ typical directory can be found in file_ops_readdir.c
+*/
+loff_t reiser4_llseek_dir_common(struct file *, loff_t, int origin);
+
+/* implementation of vfs's iterate method of struct file_operations for
+ typical directory can be found in file_ops_readdir.c
+*/
+int reiser4_iterate_common(struct file *, struct dir_context *);
+
+/**
+ * reiser4_release_dir_common - release method of struct file_operations
+ * @inode: inode of released file
+ * @file: file to release
+ *
+ * Implementation of release method of struct file_operations for typical
+ * directory. All it does is freeing of reiser4 specific file data.
+*/
+int reiser4_release_dir_common(struct inode *inode, struct file *file)
+{
+ reiser4_context *ctx;
+
+ ctx = reiser4_init_context(inode->i_sb);
+ if (IS_ERR(ctx))
+ return PTR_ERR(ctx);
+ reiser4_free_file_fsdata(file);
+ reiser4_exit_context(ctx);
+ return 0;
+}
+
+/* this is common implementation of vfs's fsync method of struct
+ file_operations
+*/
+int reiser4_sync_common(struct file *file, loff_t start,
+ loff_t end, int datasync)
+{
+ reiser4_context *ctx;
+ int result;
+ struct dentry *dentry = file->f_path.dentry;
+
+ ctx = reiser4_init_context(dentry->d_inode->i_sb);
+ if (IS_ERR(ctx))
+ return PTR_ERR(ctx);
+ result = txnmgr_force_commit_all(dentry->d_inode->i_sb, 0);
+
+ context_set_commit_async(ctx);
+ reiser4_exit_context(ctx);
+ return result;
+}
+
+/*
+ * common sync method for regular files.
+ *
+ * We are trying to be smart here. Instead of committing all atoms (original
+ * solution), we scan dirty pages of this file and commit all atoms they are
+ * part of.
+ *
+ * Situation is complicated by anonymous pages: i.e., extent-less pages
+ * dirtied through mmap. Fortunately sys_fsync() first calls
+ * filemap_fdatawrite() that will ultimately call reiser4_writepages_dispatch,
+ * insert all missing extents and capture anonymous pages.
+ */
+int reiser4_sync_file_common(struct file *file, loff_t start, loff_t end, int datasync)
+{
+ int ret;
+ reiser4_context *ctx;
+ struct dentry *dentry = file->f_path.dentry;
+ struct inode *inode = file->f_mapping->host;
+
+ int err = filemap_write_and_wait_range(inode->i_mapping, start, end);
+ if (err)
+ return err;
+
+ ctx = reiser4_init_context(dentry->d_inode->i_sb);
+ if (IS_ERR(ctx))
+ return PTR_ERR(ctx);
+
+ inode_lock(inode);
+ ret = reserve_update_sd_common(inode);
+ if (ret) {
+ reiser4_exit_context(ctx);
+ inode_unlock(inode);
+ return RETERR(-ENOSPC);
+ }
+ write_sd_by_inode_common(dentry->d_inode, NULL);
+ ret = force_commit_current_atom();
+ if (ret)
+ warning("", "Failed to sync file %s", dentry->d_name.name);
+ reiser4_exit_context(ctx);
+ inode_unlock(inode);
+ return ret;
+}
+
+long reiser4_ioctl_dir_common(struct file *file, unsigned int cmd, unsigned long arg)
+{
+ return reiser4_ioctl_volume(file, cmd, arg, reiser4_volume_op_dir);
+}
+
+/*
+ * Local variables:
+ * c-indentation-style: "K&R"
+ * mode-name: "LC"
+ * c-basic-offset: 8
+ * tab-width: 8
+ * fill-column: 79
+ * scroll-step: 1
+ * End:
+ */
diff -urN --no-dereference linux-5.10.2.orig/fs/reiser4/plugin/file_ops_readdir.c linux-5.10.2/fs/reiser4/plugin/file_ops_readdir.c
--- linux-5.10.2.orig/fs/reiser4/plugin/file_ops_readdir.c 1970-01-01 01:00:00.000000000 +0100
+++ linux-5.10.2/fs/reiser4/plugin/file_ops_readdir.c 2020-12-23 16:07:46.125813231 +0100
@@ -0,0 +1,660 @@
+/* Copyright 2005 by Hans Reiser, licensing governed by
+ * reiser4/README */
+
+#include <linux/iversion.h>
+#include "../inode.h"
+
+/* return true, iff @coord points to the valid directory item that is part of
+ * @inode directory. */
+static int is_valid_dir_coord(struct inode *inode, coord_t *coord)
+{
+ return plugin_of_group(item_plugin_by_coord(coord),
+ DIR_ENTRY_ITEM_TYPE) &&
+ inode_file_plugin(inode)->owns_item(inode, coord);
+}
+
+/* compare two logical positions within the same directory */
+static cmp_t dir_pos_cmp(const struct dir_pos *p1, const struct dir_pos *p2)
+{
+ cmp_t result;
+
+ assert("nikita-2534", p1 != NULL);
+ assert("nikita-2535", p2 != NULL);
+
+ result = de_id_cmp(&p1->dir_entry_key, &p2->dir_entry_key);
+ if (result == EQUAL_TO) {
+ int diff;
+
+ diff = p1->pos - p2->pos;
+ result =
+ (diff < 0) ? LESS_THAN : (diff ? GREATER_THAN : EQUAL_TO);
+ }
+ return result;
+}
+
+/* see comment before reiser4_readdir_common() for overview of why "adjustment"
+ * is necessary. */
+static void
+adjust_dir_pos(struct file *dir, struct readdir_pos *readdir_spot,
+ const struct dir_pos *mod_point, int adj)
+{
+ struct dir_pos *pos;
+
+ /*
+ * new directory entry was added (adj == +1) or removed (adj == -1) at
+ * the @mod_point. Directory file descriptor @dir is doing readdir and
+ * is currently positioned at @readdir_spot. Latter has to be updated
+ * to maintain stable readdir.
+ */
+ /* directory is positioned to the beginning. */
+ if (readdir_spot->entry_no == 0)
+ return;
+
+ pos = &readdir_spot->position;
+ switch (dir_pos_cmp(mod_point, pos)) {
+ case LESS_THAN:
+ /* @mod_pos is _before_ @readdir_spot, that is, entry was
+ * added/removed on the left (in key order) of current
+ * position. */
+ /* logical number of directory entry readdir is "looking" at
+ * changes */
+ readdir_spot->entry_no += adj;
+ assert("nikita-2577",
+ ergo(dir != NULL,
+ reiser4_get_dir_fpos(dir, dir->f_pos) + adj >= 0));
+ if (de_id_cmp(&pos->dir_entry_key,
+ &mod_point->dir_entry_key) == EQUAL_TO) {
+ assert("nikita-2575", mod_point->pos < pos->pos);
+ /*
+ * if entry added/removed has the same key as current
+ * for readdir, update counter of duplicate keys in
+ * @readdir_spot.
+ */
+ pos->pos += adj;
+ }
+ break;
+ case GREATER_THAN:
+ /* directory is modified after @pos: nothing to do. */
+ break;
+ case EQUAL_TO:
+ /* cannot insert an entry readdir is looking at, because it
+ already exists. */
+ assert("nikita-2576", adj < 0);
+ /* directory entry to which @pos points to is being
+ removed.
+
+ NOTE-NIKITA: Right thing to do is to update @pos to point
+ to the next entry. This is complex (we are under spin-lock
+ for one thing). Just rewind it to the beginning. Next
+ readdir will have to scan the beginning of
+ directory. Proper solution is to use semaphore in
+ spin lock's stead and use rewind_right() here.
+
+ NOTE-NIKITA: now, semaphore is used, so...
+ */
+ memset(readdir_spot, 0, sizeof *readdir_spot);
+ }
+}
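+
+/*
+ * A hypothetical example of the adjustment above (entry numbers are for
+ * illustration only): a reader's readdir_spot sits at entry_no == 3 of some
+ * directory, and a new name whose key sorts before that position is inserted
+ * (adj == +1, dir_pos_cmp() returns LESS_THAN). entry_no becomes 4, so the
+ * next getdents(2) call continues right after the entry the reader saw last,
+ * neither skipping nor repeating names. If the entry being removed is the
+ * one the reader points at (EQUAL_TO, adj == -1), the position is simply
+ * rewound to the beginning, as the NOTE-NIKITA comment above explains.
+ */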
+
+/* scan all file-descriptors for this directory and adjust their
+ positions respectively. Should be used by implementations of
+ add_entry and rem_entry of dir plugin */
+void reiser4_adjust_dir_file(struct inode *dir, const struct dentry *de,
+ int offset, int adj)
+{
+ reiser4_file_fsdata *scan;
+ struct dir_pos mod_point;
+
+ assert("nikita-2536", dir != NULL);
+ assert("nikita-2538", de != NULL);
+ assert("nikita-2539", adj != 0);
+
+ build_de_id(dir, &de->d_name, &mod_point.dir_entry_key);
+ mod_point.pos = offset;
+
+ spin_lock_inode(dir);
+
+ /*
+ * new entry was added/removed in directory @dir. Scan all file
+ * descriptors for @dir that are currently involved into @readdir and
+ * update them.
+ */
+
+ list_for_each_entry(scan, get_readdir_list(dir), dir.linkage)
+ adjust_dir_pos(scan->back, &scan->dir.readdir, &mod_point, adj);
+
+ spin_unlock_inode(dir);
+}
+
+/*
+ * traverse tree to start/continue readdir from the readdir position @pos.
+ */
+static int dir_go_to(struct file *dir, struct readdir_pos *pos, tap_t *tap)
+{
+ reiser4_key key;
+ int result;
+ struct inode *inode;
+
+ assert("nikita-2554", pos != NULL);
+
+ inode = file_inode(dir);
+ result = inode_dir_plugin(inode)->build_readdir_key(dir, &key);
+ if (result != 0)
+ return result;
+ result = reiser4_object_lookup(meta_subvol_tree(),
+ inode,
+ &key,
+ tap->coord,
+ tap->lh,
+ tap->mode,
+ FIND_EXACT,
+ LEAF_LEVEL, LEAF_LEVEL,
+ 0, &tap->ra_info);
+ if (result == CBK_COORD_FOUND)
+ result = rewind_right(tap, (int)pos->position.pos);
+ else {
+ tap->coord->node = NULL;
+ done_lh(tap->lh);
+ result = RETERR(-EIO);
+ }
+ return result;
+}
+
+/*
+ * handling of non-unique keys: calculate at what ordinal position within
+ * sequence of directory items with identical keys @pos is.
+ */
+static int set_pos(struct inode *inode, struct readdir_pos *pos, tap_t *tap)
+{
+ int result;
+ coord_t coord;
+ lock_handle lh;
+ tap_t scan;
+ de_id *did;
+ reiser4_key de_key;
+
+ coord_init_zero(&coord);
+ init_lh(&lh);
+ reiser4_tap_init(&scan, &coord, &lh, ZNODE_READ_LOCK);
+ reiser4_tap_copy(&scan, tap);
+ reiser4_tap_load(&scan);
+ pos->position.pos = 0;
+
+ did = &pos->position.dir_entry_key;
+
+ if (is_valid_dir_coord(inode, scan.coord)) {
+
+ build_de_id_by_key(unit_key_by_coord(scan.coord, &de_key), did);
+
+ while (1) {
+
+ result = go_prev_unit(&scan);
+ if (result != 0)
+ break;
+
+ if (!is_valid_dir_coord(inode, scan.coord)) {
+ result = -EINVAL;
+ break;
+ }
+
+ /* get key of directory entry */
+ unit_key_by_coord(scan.coord, &de_key);
+ if (de_id_key_cmp(did, &de_key) != EQUAL_TO) {
+ /* duplicate-sequence is over */
+ break;
+ }
+ pos->position.pos++;
+ }
+ } else
+ result = RETERR(-ENOENT);
+ reiser4_tap_relse(&scan);
+ reiser4_tap_done(&scan);
+ return result;
+}
+
+/*
+ * "rewind" directory to @offset, i.e., set @pos and @tap correspondingly.
+ */
+static int dir_rewind(struct file *dir, loff_t *fpos, struct readdir_pos *pos, tap_t *tap)
+{
+ __u64 destination;
+ __s64 shift;
+ int result;
+ struct inode *inode;
+ loff_t dirpos;
+
+ assert("nikita-2553", dir != NULL);
+ assert("nikita-2548", pos != NULL);
+ assert("nikita-2551", tap->coord != NULL);
+ assert("nikita-2552", tap->lh != NULL);
+
+ dirpos = reiser4_get_dir_fpos(dir, *fpos);
+ shift = dirpos - pos->fpos;
+ /* this is logical directory entry within @dir which we are rewinding
+ * to */
+ destination = pos->entry_no + shift;
+
+ inode = file_inode(dir);
+ if (dirpos < 0)
+ return RETERR(-EINVAL);
+ else if (destination == 0ll || dirpos == 0) {
+ /* rewind to the beginning of directory */
+ memset(pos, 0, sizeof *pos);
+ return dir_go_to(dir, pos, tap);
+ } else if (destination >= inode->i_size)
+ return RETERR(-ENOENT);
+
+ if (shift < 0) {
+ /* I am afraid of negative numbers */
+ shift = -shift;
+ /* rewinding to the left */
+ if (shift <= (int)pos->position.pos) {
+ /* destination is within sequence of entries with
+ duplicate keys. */
+ result = dir_go_to(dir, pos, tap);
+ } else {
+ shift -= pos->position.pos;
+ while (1) {
+ /* repetitions: deadlock is possible when
+ going to the left. */
+ result = dir_go_to(dir, pos, tap);
+ if (result == 0) {
+ result = rewind_left(tap, shift);
+ if (result == -E_DEADLOCK) {
+ reiser4_tap_done(tap);
+ continue;
+ }
+ }
+ break;
+ }
+ }
+ } else {
+ /* rewinding to the right */
+ result = dir_go_to(dir, pos, tap);
+ if (result == 0)
+ result = rewind_right(tap, shift);
+ }
+ if (result == 0) {
+ result = set_pos(inode, pos, tap);
+ if (result == 0) {
+ /* update pos->position.pos */
+ pos->entry_no = destination;
+ pos->fpos = dirpos;
+ }
+ }
+ return result;
+}
+
+/*
+ * Function that is called by common_readdir() on each directory entry while
+ * doing readdir. ->filldir callback may block, so we had to release long term
+ * lock while calling it. To avoid repeating tree traversal, seal is used. If
+ * seal is broken, we return -E_REPEAT. Node is unlocked in this case.
+ *
+ * Whether node is unlocked in case of any other error is undefined. It is
+ * guaranteed to be still locked if success (0) is returned.
+ *
+ * When ->filldir() wants no more, feed_entry() returns 1, and node is
+ * unlocked.
+ */
+static int feed_entry(tap_t *tap, reiser4_tree *tree,
+ struct dir_context *context)
+{
+ item_plugin *iplug;
+ char *name;
+ reiser4_key sd_key;
+ int result;
+ char buf[DE_NAME_BUF_LEN];
+ char name_buf[32];
+ char *local_name;
+ unsigned file_type;
+ seal_t seal;
+ coord_t *coord;
+ reiser4_key entry_key;
+
+ coord = tap->coord;
+ iplug = item_plugin_by_coord(coord);
+
+ /* pointer to name within the node */
+ name = iplug->s.dir.extract_name(coord, buf);
+ assert("nikita-1371", name != NULL);
+
+ /* key of object the entry points to */
+ if (iplug->s.dir.extract_key(coord, &sd_key) != 0)
+ return RETERR(-EIO);
+
+ /* we must release longterm znode lock before calling filldir to avoid
+ deadlock which may happen if filldir causes page fault. So, copy
+ name to intermediate buffer */
+ if (strlen(name) + 1 > sizeof(name_buf)) {
+ local_name = kmalloc(strlen(name) + 1,
+ reiser4_ctx_gfp_mask_get());
+ if (local_name == NULL)
+ return RETERR(-ENOMEM);
+ } else
+ local_name = name_buf;
+
+ strcpy(local_name, name);
+ file_type = iplug->s.dir.extract_file_type(coord);
+
+ unit_key_by_coord(coord, &entry_key);
+ reiser4_seal_init(&seal, coord, &entry_key);
+
+ longterm_unlock_znode(tap->lh);
+
+ /*
+ * send information about directory entry to the ->filldir() filler
+ * supplied to us by caller (VFS).
+ *
+ * ->filldir is entitled to do weird things. For example, ->filldir
+ * supplied by knfsd re-enters file system. Make sure no locks are
+ * held.
+ */
+ assert("nikita-3436", lock_stack_isclean(get_current_lock_stack()));
+
+ reiser4_txn_restart_current();
+ if (!dir_emit(context, name, (int)strlen(name),
+		      /* inode number of object bound by this entry */
+ oid_to_uino(get_key_objectid(&sd_key)), file_type))
+ /* ->filldir() is satisfied. (no space in buffer, IOW) */
+ result = 1;
+ else
+ result = reiser4_seal_validate(&seal, tree, coord, &entry_key,
+ tap->lh, tap->mode,
+ ZNODE_LOCK_HIPRI);
+
+ if (local_name != name_buf)
+ kfree(local_name);
+
+ return result;
+}
+
+static void move_entry(struct readdir_pos *pos, coord_t *coord)
+{
+ reiser4_key de_key;
+ de_id *did;
+
+ /* update @pos */
+ ++pos->entry_no;
+ did = &pos->position.dir_entry_key;
+
+ /* get key of directory entry */
+ unit_key_by_coord(coord, &de_key);
+
+ if (de_id_key_cmp(did, &de_key) == EQUAL_TO)
+ /* we are within sequence of directory entries
+ with duplicate keys. */
+ ++pos->position.pos;
+ else {
+ pos->position.pos = 0;
+ build_de_id_by_key(&de_key, did);
+ }
+ ++pos->fpos;
+}
+
+/*
+ * STATELESS READDIR
+ *
+ * readdir support in reiser4 relies on ability to update readdir_pos embedded
+ * into reiser4_file_fsdata on each directory modification (name insertion and
+ * removal), see reiser4_readdir_common() function below. This obviously doesn't
+ * work when reiser4 is accessed over NFS, because NFS doesn't keep any state
+ * across client READDIR requests for the same directory.
+ *
+ * To address this we maintain a "pool" of detached reiser4_file_fsdata
+ * (d_cursor). Whenever NFS readdir request comes, we detect this, and try to
+ * find detached reiser4_file_fsdata corresponding to previous readdir
+ * request. In other words, additional state is maintained on the
+ * server. (This is somewhat contrary to the design goals of NFS protocol.)
+ *
+ * To efficiently detect when our ->readdir() method is called by NFS server,
+ * dentry is marked as "stateless" in reiser4_decode_fh() (this is checked by
+ * file_is_stateless() function).
+ *
+ * To find out d_cursor in the pool, we encode client id (cid) in the highest
+ * bits of NFS readdir cookie: when first readdir request comes to the given
+ * directory from the given client, cookie is set to 0. This situation is
+ * detected, global cid_counter is incremented, and stored in highest bits of
+ * all direntry offsets returned to the client, including last one. As the
+ * only valid readdir cookie is one obtained as direntry->offset, we are
+ * guaranteed that next readdir request (continuing current one) will have
+ * current cid in the highest bits of starting readdir cookie. All d_cursors
+ * are hashed into per-super-block hash table by (oid, cid) key.
+ *
+ * In addition d_cursors are placed into per-super-block radix tree where they
+ * are keyed by oid alone. This is necessary to efficiently remove them during
+ * rmdir.
+ *
+ * Finally, currently unused d_cursors are linked into a special list. This
+ * list is used by d_cursor_shrink to reclaim d_cursors on memory pressure.
+ *
+ */
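+
+/*
+ * A minimal sketch of the cookie layout described above. The split between
+ * cid bits and position bits below is an assumption made for illustration
+ * only; the real encoding lives in the d_cursor code:
+ *
+ *	#define ILLUSTRATIVE_CID_SHIFT	48
+ *
+ *	cookie  = (cid << ILLUSTRATIVE_CID_SHIFT) | dir_pos;
+ *	cid     = cookie >> ILLUSTRATIVE_CID_SHIFT;
+ *	dir_pos = cookie & ((1ULL << ILLUSTRATIVE_CID_SHIFT) - 1);
+ *
+ * The first request from a client arrives with cookie == 0, a fresh cid is
+ * taken from cid_counter, and every d_off handed back carries that cid, so
+ * the follow-up request can be matched to its detached d_cursor via the
+ * (oid, cid) hash key mentioned above.
+ */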
+
+/*
+ * prepare for readdir.
+ *
+ * NOTE: @f->f_pos may be out-of-date (iterate() vs readdir()).
+ * @fpos is effective position.
+ */
+static int dir_readdir_init(struct file *f, loff_t* fpos, tap_t *tap,
+ struct readdir_pos **pos)
+{
+ struct inode *inode;
+ reiser4_file_fsdata *fsdata;
+ int result;
+
+ assert("nikita-1359", f != NULL);
+ inode = file_inode(f);
+ assert("nikita-1360", inode != NULL);
+
+ if (!S_ISDIR(inode->i_mode))
+ return RETERR(-ENOTDIR);
+
+ /* try to find detached readdir state */
+ result = reiser4_attach_fsdata(f, fpos, inode);
+ if (result != 0)
+ return result;
+
+ fsdata = reiser4_get_file_fsdata(f);
+ assert("nikita-2571", fsdata != NULL);
+ if (IS_ERR(fsdata))
+ return PTR_ERR(fsdata);
+
+	/* add file descriptor to the readdir list hanging off the directory
+ * inode. This list is used to scan "readdirs-in-progress" while
+ * inserting or removing names in the directory. */
+ spin_lock_inode(inode);
+ if (list_empty_careful(&fsdata->dir.linkage))
+ list_add(&fsdata->dir.linkage, get_readdir_list(inode));
+ *pos = &fsdata->dir.readdir;
+ spin_unlock_inode(inode);
+
+ /* move @tap to the current position */
+ return dir_rewind(f, fpos, *pos, tap);
+}
+
+/* this is implementation of vfs's llseek method of struct file_operations for
+ typical directory
+ See comment before reiser4_iterate_common() for explanation.
+*/
+loff_t reiser4_llseek_dir_common(struct file *file, loff_t off, int origin)
+{
+ reiser4_context *ctx;
+ loff_t result;
+ struct inode *inode;
+
+ inode = file_inode(file);
+
+ ctx = reiser4_init_context(inode->i_sb);
+ if (IS_ERR(ctx))
+ return PTR_ERR(ctx);
+
+ inode_lock(inode);
+
+ /* update ->f_pos */
+ result = default_llseek_unlocked(file, off, origin);
+ if (result >= 0) {
+ int ff;
+ coord_t coord;
+ lock_handle lh;
+ tap_t tap;
+ struct readdir_pos *pos;
+
+ coord_init_zero(&coord);
+ init_lh(&lh);
+ reiser4_tap_init(&tap, &coord, &lh, ZNODE_READ_LOCK);
+
+ ff = dir_readdir_init(file, &file->f_pos, &tap, &pos);
+ reiser4_detach_fsdata(file);
+ if (ff != 0)
+ result = (loff_t) ff;
+ reiser4_tap_done(&tap);
+ }
+ reiser4_detach_fsdata(file);
+ inode_unlock(inode);
+
+ reiser4_exit_context(ctx);
+ return result;
+}
+
+/* this is common implementation of vfs's readdir method of struct
+ file_operations
+
+ readdir problems:
+
+ readdir(2)/getdents(2) interface is based on implicit assumption that
+ readdir can be restarted from any particular point by supplying file system
+ with off_t-full of data. That is, file system fills ->d_off field in struct
+ dirent and later user passes ->d_off to the seekdir(3), which is, actually,
+ implemented by glibc as lseek(2) on directory.
+
+   Reiser4 cannot restart readdir from 64 bits of data, because the two last
+   components of the key of a directory entry are unknown, which amounts to
+   128 bits: the locality and type fields of a directory entry key are always
+   known, but to start readdir() from a given point the objectid and offset
+   fields have to be filled in.
+
+ Traditional UNIX API for scanning through directory
+ (readdir/seekdir/telldir/opendir/closedir/rewindir/getdents) is based on the
+ assumption that directory is structured very much like regular file, in
+ particular, it is implied that each name within given directory (directory
+ entry) can be uniquely identified by scalar offset and that such offset is
+   stable across the life-time of the name it identifies.
+
+ This is manifestly not so for reiser4. In reiser4 the only stable unique
+   identifier of a directory entry is its key, which doesn't fit into the
+   seekdir/telldir API.
+
+ solution:
+
+ Within each file descriptor participating in readdir-ing of directory
+ plugin/dir/dir.h:readdir_pos is maintained. This structure keeps track of
+ the "current" directory entry that file descriptor looks at. It contains a
+ key of directory entry (plus some additional info to deal with non-unique
+   keys that we won't dwell on here) and a logical position of this
+ directory entry starting from the beginning of the directory, that is
+ ordinal number of this entry in the readdir order.
+
+ Obviously this logical position is not stable in the face of directory
+ modifications. To work around this, on each addition or removal of directory
+ entry all file descriptors for directory inode are scanned and their
+ readdir_pos are updated accordingly (adjust_dir_pos()).
+*/
+int reiser4_iterate_common(struct file *f /* directory file being read */,
+ struct dir_context *context /* callback data passed to us by VFS */)
+{
+ reiser4_context *ctx;
+ int result;
+ struct inode *inode;
+ coord_t coord;
+ lock_handle lh;
+ tap_t tap;
+ struct readdir_pos *pos;
+
+ assert("nikita-1359", f != NULL);
+ inode = file_inode(f);
+ assert("nikita-1360", inode != NULL);
+
+ if (!S_ISDIR(inode->i_mode))
+ return RETERR(-ENOTDIR);
+
+ ctx = reiser4_init_context(inode->i_sb);
+ if (IS_ERR(ctx))
+ return PTR_ERR(ctx);
+
+ coord_init_zero(&coord);
+ init_lh(&lh);
+ reiser4_tap_init(&tap, &coord, &lh, ZNODE_READ_LOCK);
+
+ reiser4_readdir_readahead_init(inode, &tap);
+
+repeat:
+ result = dir_readdir_init(f, &context->pos, &tap, &pos);
+ if (result == 0) {
+ result = reiser4_tap_load(&tap);
+ /* scan entries one by one feeding them to @filld */
+ while (result == 0) {
+ coord_t *coord;
+
+ coord = tap.coord;
+ assert("nikita-2572", coord_is_existing_unit(coord));
+ assert("nikita-3227", is_valid_dir_coord(inode, coord));
+
+ result = feed_entry(&tap, meta_subvol_tree(), context);
+ if (result > 0) {
+ break;
+ } else if (result == 0) {
+ ++context->pos;
+ result = go_next_unit(&tap);
+ if (result == -E_NO_NEIGHBOR ||
+ result == -ENOENT) {
+ result = 0;
+ break;
+ } else if (result == 0) {
+ if (is_valid_dir_coord(inode, coord))
+ move_entry(pos, coord);
+ else
+ break;
+ }
+ } else if (result == -E_REPEAT) {
+ /* feed_entry() had to restart. */
+ ++context->pos;
+ reiser4_tap_relse(&tap);
+ goto repeat;
+ } else
+ warning("vs-1617",
+					"reiser4_iterate_common: unexpected error %d",
+ result);
+ }
+ reiser4_tap_relse(&tap);
+
+ if (result >= 0)
+ f->f_version = inode_query_iversion(inode);
+ } else if (result == -E_NO_NEIGHBOR || result == -ENOENT)
+ result = 0;
+ reiser4_tap_done(&tap);
+ reiser4_detach_fsdata(f);
+ /*
+ * try to update directory's atime
+ */
+ if (reserve_update_sd_common(inode) != 0)
+ warning("", "failed to update atime on readdir: %llu",
+ get_inode_oid(inode));
+ else
+ file_accessed(f);
+
+ context_set_commit_async(ctx);
+ reiser4_exit_context(ctx);
+
+ return (result <= 0) ? result : 0;
+}
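+
+/* Illustration (not part of reiser4, and purely a userspace sketch): the
+   telldir/seekdir contract that the comment before reiser4_iterate_common()
+   refers to.  Userspace only ever gets a scalar cookie back and expects to be
+   able to resume scanning from it later; readdir_pos is what lets reiser4
+   honour that expectation even though entry keys are wider than 64 bits.
+   A minimal sketch, assuming a plain POSIX environment:
+
+	#include <dirent.h>
+	#include <stdio.h>
+
+	int main(void)
+	{
+		DIR *d = opendir(".");
+		struct dirent *de;
+		long cookie;
+
+		if (d == NULL)
+			return 1;
+		readdir(d);                  // consume the first entry
+		cookie = telldir(d);         // remember scalar position
+		while (readdir(d) != NULL)
+			;                    // scan to the end
+		seekdir(d, cookie);          // resume right after the first entry
+		de = readdir(d);
+		printf("resumed at %s\n", de ? de->d_name : "(eof)");
+		closedir(d);
+		return 0;
+	}
+*/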
+
+/*
+ * Local variables:
+ * c-indentation-style: "K&R"
+ * mode-name: "LC"
+ * c-basic-offset: 8
+ * tab-width: 8
+ * fill-column: 79
+ * End:
+ */
diff -urN --no-dereference linux-5.10.2.orig/fs/reiser4/plugin/file_plugin_common.c linux-5.10.2/fs/reiser4/plugin/file_plugin_common.c
--- linux-5.10.2.orig/fs/reiser4/plugin/file_plugin_common.c 1970-01-01 01:00:00.000000000 +0100
+++ linux-5.10.2/fs/reiser4/plugin/file_plugin_common.c 2020-12-23 16:07:46.125813231 +0100
@@ -0,0 +1,1076 @@
+/* Copyright 2005 by Hans Reiser, licensing governed by
+ reiser4/README */
+
+/* this file contains typical implementations for most methods of the
+   file plugin
+*/
+
+#include "../inode.h"
+#include "object.h"
+#include "../safe_link.h"
+
+static int insert_new_sd(struct inode *inode, oid_t oid);
+static int update_sd(struct inode *inode);
+
+void build_body_key_common(struct inode *inode, reiser4_key *key)
+{
+ reiser4_key_init(key);
+ set_key_locality(key, reiser4_inode_data(inode)->locality_id);
+ set_key_objectid(key, get_inode_oid(inode));
+ set_key_type(key, KEY_BODY_MINOR);
+}
+
+/**
+ * Common implementation of ->write_sd_by_inode() of file plugins.
+ * Either insert stat-data or update it.
+ * @inode: object to write stat-data of
+ */
+int write_sd_by_inode_common(struct inode *inode, oid_t *oid)
+{
+ int result;
+
+ assert("nikita-730", inode != NULL);
+
+	if (reiser4_inode_get_flag(inode, REISER4_NO_SD)) {
+		/*
+		 * object doesn't have stat-data yet
+		 */
+		assert("edward-1785", oid != NULL);
+		result = insert_new_sd(inode, *oid);
+	} else {
+		assert("edward-1786", oid == NULL);
+		result = update_sd(inode);
+	}
+
+ if (result != 0 &&
+ result != -ENAMETOOLONG &&
+ result != -ENOMEM)
+ /*
+ * Don't issue warnings about "name is too long"
+ */
+ warning("nikita-2221",
+ "Failed to save sd for %llu: %i",
+ (unsigned long long)get_inode_oid(inode),
+ result);
+ return result;
+}
+
+/* this is common implementation of set_plug_in_inode method of file plugin
+ */
+int set_plug_in_inode_common(struct inode *object /* inode to set plugin on */ ,
+ struct inode *parent /* parent object */ ,
+ reiser4_object_create_data * data /* creational
+ * data */ )
+{
+ __u64 mask;
+
+ object->i_mode = data->mode;
+ /* this should be plugin decision */
+ object->i_uid = current_fsuid();
+ object->i_mtime = object->i_atime = object->i_ctime = current_time(object);
+
+ /* support for BSD style group-id assignment. See mount's manual page
+ description of bsdgroups ext2 mount options for more details */
+ if (reiser4_is_set(object->i_sb, REISER4_BSD_GID))
+ object->i_gid = parent->i_gid;
+ else if (parent->i_mode & S_ISGID) {
+ /* parent directory has sguid bit */
+ object->i_gid = parent->i_gid;
+ if (S_ISDIR(object->i_mode))
+ /* sguid is inherited by sub-directories */
+ object->i_mode |= S_ISGID;
+ } else
+ object->i_gid = current_fsgid();
+
+ /* this object doesn't have stat-data yet */
+ reiser4_inode_set_flag(object, REISER4_NO_SD);
+#if 0
+ /* this is now called after all inode plugins are initialized:
+ do_create_vfs_child after adjust_to_parent */
+ /* setup inode and file-operations for this inode */
+ setup_inode_ops(object, data);
+#endif
+ reiser4_seal_init(&reiser4_inode_data(object)->sd_seal, NULL, NULL);
+ mask = (1 << UNIX_STAT) | (1 << LIGHT_WEIGHT_STAT);
+ if (!reiser4_is_set(object->i_sb, REISER4_32_BIT_TIMES))
+ mask |= (1 << LARGE_TIMES_STAT);
+
+ reiser4_inode_data(object)->extmask = mask;
+ return 0;
+}
+
+/* this is common implementation of adjust_to_parent method of file plugin for
+ regular files
+ */
+int adjust_to_parent_common(struct inode *object /* new object */ ,
+ struct inode *parent /* parent directory */ ,
+ struct inode *root/* root directory */)
+{
+ assert("nikita-2165", object != NULL);
+ if (parent == NULL)
+ parent = root;
+ assert("nikita-2069", parent != NULL);
+
+ /*
+ * inherit missing plugins from parent
+ */
+
+ grab_plugin_pset(object, parent, PSET_FILE);
+ grab_plugin_pset(object, parent, PSET_SD);
+ grab_plugin_pset(object, parent, PSET_FORMATTING);
+ grab_plugin_pset(object, parent, PSET_PERM);
+ return 0;
+}
+
+/* this is common implementation of adjust_to_parent method of file plugin for
+ typical directories
+ */
+int adjust_to_parent_common_dir(struct inode *object /* new object */ ,
+ struct inode *parent /* parent directory */ ,
+ struct inode *root/* root directory */)
+{
+ int result = 0;
+ pset_member memb;
+
+ assert("nikita-2166", object != NULL);
+ if (parent == NULL)
+ parent = root;
+ assert("nikita-2167", parent != NULL);
+
+ /*
+ * inherit missing plugins from parent
+ */
+ for (memb = 0; memb < PSET_LAST; ++memb) {
+ result = grab_plugin_pset(object, parent, memb);
+ if (result != 0)
+ break;
+ }
+ return result;
+}
+
+int adjust_to_parent_cryptcompress(struct inode *object /* new object */ ,
+ struct inode *parent /* parent directory */,
+ struct inode *root/* root directory */)
+{
+ int result;
+ result = adjust_to_parent_common(object, parent, root);
+ if (result)
+ return result;
+ assert("edward-1416", parent != NULL);
+
+ grab_plugin_pset(object, parent, PSET_CLUSTER);
+ grab_plugin_pset(object, parent, PSET_CIPHER);
+ grab_plugin_pset(object, parent, PSET_DIGEST);
+ grab_plugin_pset(object, parent, PSET_COMPRESSION);
+ grab_plugin_pset(object, parent, PSET_COMPRESSION_MODE);
+
+ return 0;
+}
+
+/*
+ * this is common implementation of ->create_object() of file plugins
+ */
+int reiser4_create_object_common(struct inode *object, struct inode *parent,
+ reiser4_object_create_data *data, oid_t *oid)
+{
+ assert("nikita-744", object != NULL);
+ assert("nikita-745", parent != NULL);
+ assert("nikita-747", data != NULL);
+ assert("nikita-748", reiser4_inode_get_flag(object, REISER4_NO_SD));
+
+ return write_sd_by_inode_common(object, oid);
+}
+
+/**
+ * Reserve disk space to update stat-data item
+ */
+int reserve_update_sd_common(struct inode *inode)
+{
+ reiser4_block_nr amount;
+
+ assert("vs-1249",
+ inode_file_plugin(inode)->estimate.update ==
+ estimate_update_common);
+
+ amount = inode_file_plugin(inode)->estimate.update(inode);
+
+ return reiser4_grab_space_force(amount, BA_CAN_COMMIT,
+ get_meta_subvol());
+}
+
+/**
+ * grab space which is needed to remove 2 items from the tree:
+ * stat data and safe-link
+ * @inode: object to be deleted
+ */
+static int reserve_delete_object(struct inode *inode)
+{
+ reiser4_subvol *subv = get_meta_subvol();
+
+ return reiser4_grab_space_force(2 *
+ estimate_one_item_removal(&subv->tree),
+ BA_RESERVED | BA_CAN_COMMIT, subv);
+}
+
+static int common_object_delete_no_reserve(struct inode *inode);
+
+/**
+ * reiser4_delete_object_common - delete_object of file_plugin
+ * @inode: inode to be deleted
+ *
+ * Common implementation of ->delete_object() of file_plugin.
+ * It applies to objects whose deletion consists of removing two items: stat
+ * data and safe-link.
+ */
+int reiser4_delete_object_common(struct inode *inode)
+{
+ int ret;
+
+ assert("nikita-1477", inode != NULL);
+ /*
+ * FIXME: if file body deletion failed (i/o error, for instance),
+ * inode->i_size can be != 0 here
+ */
+ assert("nikita-3420", inode->i_size == 0 || S_ISLNK(inode->i_mode));
+ assert("nikita-3421", inode->i_nlink == 0);
+
+ if (reiser4_inode_get_flag(inode, REISER4_NO_SD))
+ return 0;
+ ret = reserve_delete_object(inode);
+ if (ret)
+ return ret;
+ return common_object_delete_no_reserve(inode);
+}
+
+/**
+ * reiser4_delete_dir_common - delete_object of file_plugin
+ * @inode: inode to be deleted
+ *
+ * This is common implementation of delete_object method of file_plugin for
+ * typical directory. It calls done method of dir_plugin to remove "." and
+ * removes stat data and safe-link.
+ */
+int reiser4_delete_dir_common(struct inode *inode)
+{
+ int result;
+ dir_plugin *dplug;
+
+ assert("", (get_current_context() &&
+ get_current_context()->trans->atom == NULL));
+
+ dplug = inode_dir_plugin(inode);
+ assert("vs-1101", dplug && dplug->done);
+ /*
+ * kill cursors which might be attached to inode
+ */
+ reiser4_kill_cursors(inode);
+ result = reserve_delete_object(inode);
+ if (result)
+ return result;
+ result = dplug->done(inode);
+ if (result)
+ return result;
+ return common_object_delete_no_reserve(inode);
+}
+
+/* this is common implementation of add_link method of file plugin
+ */
+int reiser4_add_link_common(struct inode *object, struct inode *parent)
+{
+ /*
+ * increment ->i_nlink and update ->i_ctime
+ */
+
+ INODE_INC_NLINK(object);
+ object->i_ctime = current_time(object);
+ return 0;
+}
+
+/* this is common implementation of rem_link method of file plugin
+ */
+int reiser4_rem_link_common(struct inode *object, struct inode *parent)
+{
+ assert("nikita-2021", object != NULL);
+ assert("nikita-2163", object->i_nlink > 0);
+
+ /*
+ * decrement ->i_nlink and update ->i_ctime
+ */
+
+ INODE_DROP_NLINK(object);
+ object->i_ctime = current_time(object);
+ return 0;
+}
+
+/* this is common implementation of rem_link method of file plugin for typical
+ directory
+*/
+int rem_link_common_dir(struct inode *object, struct inode *parent UNUSED_ARG)
+{
+ assert("nikita-20211", object != NULL);
+ assert("nikita-21631", object->i_nlink > 0);
+
+ /*
+ * decrement ->i_nlink and update ->i_ctime
+ */
+	if (object->i_nlink == 2)
+		INODE_SET_NLINK(object, 0);
+	else
+		INODE_DROP_NLINK(object);
+ object->i_ctime = current_time(object);
+ return 0;
+}
+
+/* this is common implementation of owns_item method of file plugin
+ compare objectids of keys in inode and coord */
+int owns_item_common(const struct inode *inode, /* object to check
+ * against */
+ const coord_t *coord/* coord to check */)
+{
+ reiser4_key item_key;
+ reiser4_key file_key;
+
+ assert("nikita-760", inode != NULL);
+ assert("nikita-761", coord != NULL);
+
+ return coord_is_existing_item(coord) &&
+ (get_key_objectid(build_sd_key(inode, &file_key)) ==
+ get_key_objectid(item_key_by_coord(coord, &item_key)));
+}
+
+/* this is common implementation of owns_item method of file plugin
+ for typical directory
+*/
+int owns_item_common_dir(const struct inode *inode,/* object to check against */
+ const coord_t *coord/* coord of item to check */)
+{
+ reiser4_key item_key;
+
+ assert("nikita-1335", inode != NULL);
+ assert("nikita-1334", coord != NULL);
+
+ if (plugin_of_group(item_plugin_by_coord(coord), DIR_ENTRY_ITEM_TYPE))
+ return get_key_locality(item_key_by_coord(coord, &item_key)) ==
+ get_inode_oid(inode);
+ else
+ return owns_item_common(inode, coord);
+}
+
+/* this is common implementation of can_add_link method of file plugin
+ checks whether yet another hard links to this object can be added
+*/
+int can_add_link_common(const struct inode *object/* object to check */)
+{
+ assert("nikita-732", object != NULL);
+
+ /* inode->i_nlink is unsigned int, so just check for integer
+ overflow */
+ return object->i_nlink + 1 != 0;
+}
+
+/* this is common implementation of can_rem_link method of file plugin for
+ typical directory
+*/
+int can_rem_link_common_dir(const struct inode *inode)
+{
+	/* is_dir_empty() returns 0 if dir is empty */
+ return !is_dir_empty(inode);
+}
+
+/* this is common implementation of detach method of file plugin for typical
+ directory
+*/
+int reiser4_detach_common_dir(struct inode *child, struct inode *parent)
+{
+ dir_plugin *dplug;
+
+ dplug = inode_dir_plugin(child);
+ assert("nikita-2883", dplug != NULL);
+ assert("nikita-2884", dplug->detach != NULL);
+ return dplug->detach(child, parent);
+}
+
+/* this is common implementation of bind method of file plugin for typical
+ directory
+*/
+int reiser4_bind_common_dir(struct inode *child, struct inode *parent)
+{
+ dir_plugin *dplug;
+
+ dplug = inode_dir_plugin(child);
+ assert("nikita-2646", dplug != NULL);
+ return dplug->attach(child, parent);
+}
+
+static int process_truncate(struct inode *, __u64 size);
+
+/* this is common implementation of safelink method of file plugin
+ */
+int safelink_common(struct inode *object, reiser4_safe_link_t link, __u64 value)
+{
+ int result;
+
+ assert("vs-1705", get_current_context()->trans->atom == NULL);
+ if (link == SAFE_UNLINK)
+ /* nothing to do. iput() in the caller (process_safelink) will
+ * finish with file */
+ result = 0;
+ else if (link == SAFE_TRUNCATE)
+ result = process_truncate(object, value);
+ else {
+ warning("nikita-3438", "Unrecognized safe-link type: %i", link);
+ result = RETERR(-EIO);
+ }
+ return result;
+}
+
+/* this is common implementation of estimate.create method of file plugin
+ can be used when object creation involves insertion of one item (usually stat
+ data) into tree
+*/
+reiser4_block_nr estimate_create_common(const struct inode *object)
+{
+ return estimate_one_insert_item(meta_subvol_tree());
+}
+
+/* this is common implementation of estimate.create method of file plugin for
+ typical directory
+ can be used when directory creation involves insertion of two items (usually
+ stat data and item containing "." and "..") into tree
+*/
+reiser4_block_nr estimate_create_common_dir(const struct inode *object)
+{
+ return 2 * estimate_one_insert_item(meta_subvol_tree());
+}
+
+/* this is common implementation of estimate.update method of file plugin
+ can be used when stat data update does not do more than inserting a unit
+ into a stat data item which is probably true for most cases
+*/
+reiser4_block_nr estimate_update_common(const struct inode *inode)
+{
+ return estimate_one_insert_into_item(meta_subvol_tree());
+}
+
+/* this is common implementation of estimate.unlink method of file plugin
+ */
+reiser4_block_nr
+estimate_unlink_common(const struct inode *object UNUSED_ARG,
+ const struct inode *parent UNUSED_ARG)
+{
+ return 0;
+}
+
+/* this is common implementation of estimate.unlink method of file plugin for
+ typical directory
+*/
+reiser4_block_nr
+estimate_unlink_common_dir(const struct inode *object,
+ const struct inode *parent)
+{
+ dir_plugin *dplug;
+
+ dplug = inode_dir_plugin(object);
+ assert("nikita-2888", dplug != NULL);
+ assert("nikita-2887", dplug->estimate.unlink != NULL);
+ return dplug->estimate.unlink(object, parent);
+}
+
+char *wire_write_common(struct inode *inode, char *start)
+{
+ return build_inode_onwire(inode, start);
+}
+
+char *wire_read_common(char *addr, reiser4_object_on_wire * obj)
+{
+ if (!obj)
+ return locate_obj_key_id_onwire(addr);
+ return extract_obj_key_id_from_onwire(addr, &obj->u.std.key_id);
+}
+
+struct dentry *wire_get_common(struct super_block *sb,
+ reiser4_object_on_wire * obj)
+{
+ struct inode *inode;
+ struct dentry *dentry;
+ reiser4_key key;
+
+ extract_key_from_id(&obj->u.std.key_id, &key);
+ inode = reiser4_iget(sb, &key, FIND_EXACT, 1);
+ if (!IS_ERR(inode)) {
+ reiser4_iget_complete(inode);
+ dentry = d_obtain_alias(inode);
+ if (!IS_ERR(dentry))
+ dentry->d_op = &get_super_private(sb)->ops.dentry;
+ } else if (PTR_ERR(inode) == -ENOENT)
+ /*
+ * inode wasn't found at the key encoded in the file
+ * handle. Hence, file handle is stale.
+ */
+ dentry = ERR_PTR(RETERR(-ESTALE));
+ else
+ dentry = (void *)inode;
+ return dentry;
+}
+
+int wire_size_common(struct inode *inode)
+{
+ return inode_onwire_size(inode);
+}
+
+void wire_done_common(reiser4_object_on_wire * obj)
+{
+ /* nothing to do */
+}
+
+/* helper function to print errors */
+static void key_warning(const reiser4_key * key /* key to print */ ,
+ const struct inode *inode,
+ int code/* error code to print */)
+{
+ assert("nikita-716", key != NULL);
+
+ if (code != -ENOMEM) {
+ warning("nikita-717", "Error for inode %llu (%i)",
+ (unsigned long long)get_key_objectid(key), code);
+ reiser4_print_key("for key", key);
+ }
+}
+
+/* NIKITA-FIXME-HANS: perhaps this function belongs in another file? */
+#if REISER4_DEBUG
+static void
+check_inode_seal(const struct inode *inode,
+ const coord_t *coord, const reiser4_key * key)
+{
+ reiser4_key unit_key;
+
+ unit_key_by_coord(coord, &unit_key);
+ assert("nikita-2752",
+ WITH_DATA_RET(coord->node, 1, keyeq(key, &unit_key)));
+ assert("nikita-2753", get_inode_oid(inode) == get_key_objectid(key));
+}
+
+static void check_sd_coord(coord_t *coord, const reiser4_key *key)
+{
+ coord_clear_iplug(coord);
+ if (zload(coord->node))
+ return;
+ if (!coord_is_existing_unit(coord) ||
+ !item_plugin_by_coord(coord) ||
+ (znode_get_level(coord->node) != LEAF_LEVEL) ||
+ !item_is_statdata(coord)) {
+ warning("nikita-1901", "Conspicuous seal");
+ reiser4_print_key("key", key);
+ print_coord("coord", coord, 1);
+ impossible("nikita-2877", "no way");
+ }
+ zrelse(coord->node);
+}
+#else
+#define check_inode_seal(inode, coord, key) noop
+#define check_sd_coord(coord, key) noop
+#endif
+
+/**
+ * insert new stat-data into tree. Called with inode state
+ * locked. Return inode state locked.
+ * @inode - inode to create stat-data for;
+ * @oid - pre-allocated object id.
+ */
+static int insert_new_sd(struct inode *inode, oid_t oid)
+{
+ int result;
+ reiser4_key key;
+ coord_t coord;
+ reiser4_item_data data;
+ char *area;
+ reiser4_inode *ref;
+ lock_handle lh;
+
+ assert("nikita-723", inode != NULL);
+ assert("nikita-3406", reiser4_inode_get_flag(inode, REISER4_NO_SD));
+
+ ref = reiser4_inode_data(inode);
+ spin_lock_inode(inode);
+
+ if (ref->plugin_mask != 0)
+ /* inode has non-standard plugins */
+ inode_set_extension(inode, PLUGIN_STAT);
+ /*
+ * prepare specification of new item to be inserted
+ */
+
+ data.iplug = inode_sd_plugin(inode);
+ data.length = data.iplug->s.sd.save_len(inode);
+ spin_unlock_inode(inode);
+
+ data.data = NULL;
+ data.user = 0;
+ /*
+ * could be optimized for case where there is only one node
+ * format in use in the filesystem, probably there are lots
+ * of such places we could optimize for only one node layout.
+ * -Hans
+ */
+ if (data.length > meta_subvol_tree()->nplug->max_item_size()) {
+ /*
+ * This is silly check, but we don't know actual node
+ * where insertion will go into
+ */
+ return RETERR(-ENAMETOOLONG);
+ }
+ /*
+ * oid = oid_allocate(inode->i_sb);
+ * NIKITA-FIXME-HANS: what is your opinion on whether this error
+ * check should be encapsulated into oid_allocate?
+ * if (oid == ABSOLUTE_MAX_OID)
+ * return RETERR(-EOVERFLOW);
+ *
+ * oid had been allocated before grabbing space for the
+ * new stat-data as we need to know id of the subvolume
+ * where this stat-data will be written to. - Edward.
+ */
+ set_inode_oid(inode, oid);
+
+ coord_init_zero(&coord);
+ init_lh(&lh);
+
+ result = insert_by_key(meta_subvol_tree(),
+ build_sd_key(inode, &key), &data, &coord, &lh,
+ /* stat data lives on a leaf level */
+ LEAF_LEVEL, CBK_UNIQUE);
+
+ /* we don't want to re-check that somebody didn't insert
+ stat-data while we were doing io, because if it did,
+ insert_by_key() returned error. */
+ /* but what _is_ possible is that plugin for inode's stat-data,
+ list of non-standard plugins or their state would change
+ during io, so that stat-data wouldn't fit into sd. To avoid
+ this race we keep inode_state lock. This lock has to be
+ taken each time you access inode in a way that would cause
+ changes in sd size: changing plugins etc.
+ */
+
+ if (result == IBK_INSERT_OK) {
+ coord_clear_iplug(&coord);
+ result = zload(coord.node);
+ if (result == 0) {
+ /* have we really inserted stat data? */
+ assert("nikita-725", item_is_statdata(&coord));
+
+ /* inode was just created. It is inserted into hash
+ table, but no directory entry was yet inserted into
+ parent. So, inode is inaccessible through
+ ->lookup(). All places that directly grab inode
+ from hash-table (like old knfsd), should check
+ IMMUTABLE flag that is set by common_create_child.
+ */
+ assert("nikita-3240", data.iplug != NULL);
+ assert("nikita-3241", data.iplug->s.sd.save != NULL);
+ area = item_body_by_coord(&coord);
+ result = data.iplug->s.sd.save(inode, &area);
+ znode_make_dirty(coord.node);
+ if (result == 0) {
+ /* object has stat-data now */
+ reiser4_inode_clr_flag(inode, REISER4_NO_SD);
+ reiser4_inode_set_flag(inode,
+ REISER4_SDLEN_KNOWN);
+ /* initialise stat-data seal */
+ reiser4_seal_init(&ref->sd_seal, &coord, &key);
+ ref->sd_coord = coord;
+ check_inode_seal(inode, &coord, &key);
+ } else if (result != -ENOMEM)
+ /*
+ * convert any other error code to -EIO to
+ * avoid confusing user level with unexpected
+ * errors.
+ */
+ result = RETERR(-EIO);
+ zrelse(coord.node);
+ }
+ }
+ done_lh(&lh);
+
+ if (result != 0)
+ key_warning(&key, inode, result);
+ else
+ oid_count_allocated();
+
+ return result;
+}
+
+/**
+ * Find stat-data in a tree by key.
+ *
+ * Sometimes we are not able to construct a precise key to look for
+ * stat-data (specifically, its ordering component is unknown).
+ * In this case we set the maximal possible ordering value and perform
+ * the lookup with the FIND_MAX_NOT_MORE_THAN lookup bias.
+ *
+ * @inode: inode to look stat-data for
+ * @key: key of stat-data
+ * @bias: lookup bias -
+ *    FIND_EXACT, if the key is precise,
+ *    FIND_MAX_NOT_MORE_THAN, if we don't know the ordering component
+ *    of the key
+ * @coord: resulting coord
+ * @lh: resulting lock handle
+ */
+int lookup_sd(struct inode *inode, znode_lock_mode lock_mode,
+ coord_t *coord, lock_handle *lh, const reiser4_key *key,
+ lookup_bias bias, int silent)
+{
+ int result;
+ __u32 flags;
+
+ assert("nikita-1692", inode != NULL);
+ assert("nikita-1693", coord != NULL);
+ assert("nikita-1694", key != NULL);
+
+ /* look for the object's stat data in a tree.
+ This returns in "node" pointer to a locked znode and in "pos"
+ position of an item found in node. Both are only valid if
+ coord_found is returned. */
+ flags = (lock_mode == ZNODE_WRITE_LOCK) ? CBK_FOR_INSERT : 0;
+ flags |= CBK_UNIQUE;
+ /*
+ * traverse tree to find stat data. We cannot use vroot here, because
+ * it only covers _body_ of the file, and stat data don't belong
+ * there.
+ */
+ result = coord_by_key(meta_subvol_tree(),
+ key,
+ coord,
+ lh,
+ lock_mode,
+ bias, LEAF_LEVEL, LEAF_LEVEL, flags, NULL);
+ if (unlikely(IS_CBKERR(result))) {
+ key_warning(key, inode, result);
+ return result;
+ }
+ if (result == CBK_COORD_FOUND) {
+ check_sd_coord(coord, key);
+ return 0;
+ }
+ /* not found */
+ if (bias == FIND_MAX_NOT_MORE_THAN) {
+ /*
+		 * In this mode we don't expect that the stat-data
+		 * we are looking for necessarily exists.
+ */
+ if (coord->between != AFTER_ITEM) {
+ warning("edward-2320",
+ "Unexpected between state (%d)",
+ coord->between);
+ key_warning(key, inode, result);
+ return -EIO;
+ }
+ coord->between = AT_UNIT;
+ coord->unit_pos = 0;
+ return 0;
+ }
+ /* not found by exact key */
+ if (!silent)
+ key_warning(key, inode, result);
+ return result;
+}
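+
+/* Usage sketch (illustrative only, not called anywhere in this file): how a
+   caller might use the FIND_MAX_NOT_MORE_THAN mode described above when the
+   ordering component of the stat-data key is unknown.  It assumes the usual
+   key field setter set_key_ordering(); the maximal ordering value and the
+   read-lock mode are choices made for the example, and error handling is
+   omitted:
+
+	reiser4_key key;
+	coord_t coord;
+	lock_handle lh;
+	int ret;
+
+	build_sd_key(inode, &key);
+	set_key_ordering(&key, (__u64)~0ull);	// ordering unknown: use max
+	coord_init_zero(&coord);
+	init_lh(&lh);
+	ret = lookup_sd(inode, ZNODE_READ_LOCK, &coord, &lh, &key,
+			FIND_MAX_NOT_MORE_THAN, 1);
+	// on success (ret == 0) @coord points at the stat-data that was found
+	done_lh(&lh);
+*/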
+
+static int locate_inode_sd(struct inode *inode,
+ reiser4_key *key, coord_t *coord, lock_handle *lh)
+{
+ reiser4_inode *state;
+ seal_t seal;
+ int result;
+
+ assert("nikita-3483", inode != NULL);
+
+ state = reiser4_inode_data(inode);
+ spin_lock_inode(inode);
+ *coord = state->sd_coord;
+ coord_clear_iplug(coord);
+ seal = state->sd_seal;
+ spin_unlock_inode(inode);
+
+ build_sd_key(inode, key);
+ /* first, try to use seal */
+ if (reiser4_seal_is_set(&seal)) {
+ result = reiser4_seal_validate(&seal,
+ meta_subvol_tree(),
+ coord,
+ key,
+ lh, ZNODE_WRITE_LOCK,
+ ZNODE_LOCK_LOPRI);
+ if (result == 0) {
+ check_sd_coord(coord, key);
+ return 0;
+ }
+ }
+ /* hint is invalid,
+ * so traverse tree
+ */
+ coord_init_zero(coord);
+ return lookup_sd(inode, ZNODE_WRITE_LOCK, coord, lh, key,
+ FIND_EXACT, 0);
+}
+
+#if REISER4_DEBUG
+static int all_but_offset_key_eq(const reiser4_key * k1, const reiser4_key * k2)
+{
+ return (get_key_locality(k1) == get_key_locality(k2) &&
+ get_key_type(k1) == get_key_type(k2) &&
+ get_key_band(k1) == get_key_band(k2) &&
+ get_key_ordering(k1) == get_key_ordering(k2) &&
+ get_key_objectid(k1) == get_key_objectid(k2));
+}
+
+#include "../tree_walk.h"
+
+/* make some checks before and after stat-data resize operation */
+static int check_sd_resize(struct inode *inode, coord_t *coord,
+ int length, int progress/* 1 means after resize */)
+{
+ int ret = 0;
+ lock_handle left_lock;
+ coord_t left_coord;
+ reiser4_key left_key;
+ reiser4_key key;
+
+ if (inode_file_plugin(inode) !=
+ file_plugin_by_id(CRYPTCOMPRESS_FILE_PLUGIN_ID))
+ return 0;
+ if (!length)
+ return 0;
+ if (coord->item_pos != 0)
+ return 0;
+
+ init_lh(&left_lock);
+ ret = reiser4_get_left_neighbor(&left_lock,
+ coord->node,
+ ZNODE_WRITE_LOCK,
+ GN_CAN_USE_UPPER_LEVELS);
+ if (ret == -E_REPEAT || ret == -E_NO_NEIGHBOR ||
+ ret == -ENOENT || ret == -EINVAL
+ || ret == -E_DEADLOCK) {
+ ret = 0;
+ goto exit;
+ }
+ ret = zload(left_lock.node);
+ if (ret)
+ goto exit;
+ coord_init_last_unit(&left_coord, left_lock.node);
+ item_key_by_coord(&left_coord, &left_key);
+ item_key_by_coord(coord, &key);
+
+ if (all_but_offset_key_eq(&key, &left_key))
+		/* corruption occurred */
+ ret = 1;
+ zrelse(left_lock.node);
+ exit:
+ done_lh(&left_lock);
+ return ret;
+}
+#endif
+
+/* update stat-data at @coord */
+static int
+update_sd_at(struct inode *inode, coord_t *coord, reiser4_key * key,
+ lock_handle * lh)
+{
+ int result;
+ reiser4_item_data data;
+ char *area;
+ reiser4_inode *state;
+ znode *loaded;
+
+ state = reiser4_inode_data(inode);
+
+ coord_clear_iplug(coord);
+ result = zload(coord->node);
+ if (result != 0)
+ return result;
+ loaded = coord->node;
+
+ spin_lock_inode(inode);
+ assert("nikita-728", inode_sd_plugin(inode) != NULL);
+ data.iplug = inode_sd_plugin(inode);
+
+ /* if inode has non-standard plugins, add appropriate stat data
+ * extension */
+ if (state->extmask & (1 << PLUGIN_STAT)) {
+ if (state->plugin_mask == 0)
+ inode_clr_extension(inode, PLUGIN_STAT);
+ } else if (state->plugin_mask != 0)
+ inode_set_extension(inode, PLUGIN_STAT);
+
+ if (state->extmask & (1 << HEIR_STAT)) {
+ if (state->heir_mask == 0)
+ inode_clr_extension(inode, HEIR_STAT);
+ } else if (state->heir_mask != 0)
+ inode_set_extension(inode, HEIR_STAT);
+
+ /* data.length is how much space to add to (or remove
+ from if negative) sd */
+ if (!reiser4_inode_get_flag(inode, REISER4_SDLEN_KNOWN)) {
+ /* recalculate stat-data length */
+ data.length =
+ data.iplug->s.sd.save_len(inode) -
+ item_length_by_coord(coord);
+ reiser4_inode_set_flag(inode, REISER4_SDLEN_KNOWN);
+ } else
+ data.length = 0;
+ spin_unlock_inode(inode);
+
+ /* if on-disk stat data is of different length than required
+ for this inode, resize it */
+
+ if (data.length != 0) {
+ data.data = NULL;
+ data.user = 0;
+
+ assert("edward-1441",
+ !check_sd_resize(inode, coord,
+ data.length, 0/* before resize */));
+
+ /* insertion code requires that insertion point (coord) was
+ * between units. */
+ coord->between = AFTER_UNIT;
+ result = reiser4_resize_item(coord, &data, key, lh,
+ COPI_DONT_SHIFT_LEFT);
+ if (result != 0) {
+ key_warning(key, inode, result);
+ zrelse(loaded);
+ return result;
+ }
+ if (loaded != coord->node) {
+ /* reiser4_resize_item moved coord to another node.
+ Zload it */
+ zrelse(loaded);
+ coord_clear_iplug(coord);
+ result = zload(coord->node);
+ if (result != 0)
+ return result;
+ loaded = coord->node;
+ }
+ assert("edward-1442",
+ !check_sd_resize(inode, coord,
+ data.length, 1/* after resize */));
+ }
+ area = item_body_by_coord(coord);
+ spin_lock_inode(inode);
+ result = data.iplug->s.sd.save(inode, &area);
+ znode_make_dirty(coord->node);
+
+ /* re-initialise stat-data seal */
+
+ /*
+ * coord.between was possibly skewed from AT_UNIT when stat-data size
+ * was changed and new extensions were pasted into item.
+ */
+ coord->between = AT_UNIT;
+ reiser4_seal_init(&state->sd_seal, coord, key);
+ state->sd_coord = *coord;
+ spin_unlock_inode(inode);
+ check_inode_seal(inode, coord, key);
+ zrelse(loaded);
+ return result;
+}
+
+/* Update existing stat-data in a tree. Called with inode state locked. Return
+ inode state locked. */
+static int update_sd(struct inode *inode/* inode to update sd for */)
+{
+ int result;
+ reiser4_key key;
+ coord_t coord;
+ lock_handle lh;
+
+ assert("nikita-726", inode != NULL);
+
+ /* no stat-data, nothing to update?! */
+ assert("nikita-3482", !reiser4_inode_get_flag(inode, REISER4_NO_SD));
+
+ init_lh(&lh);
+
+ result = locate_inode_sd(inode, &key, &coord, &lh);
+ if (result == 0)
+ result = update_sd_at(inode, &coord, &key, &lh);
+ done_lh(&lh);
+
+ return result;
+}
+
+/**
+ * Helper for reiser4_delete_object_common and reiser4_delete_dir_common.
+ * Remove object's body, stat data and safe link from the tree.
+ * Space for that must be reserved by the caller beforehand.
+ * @inode: object to be deleted
+ */
+static int common_object_delete_no_reserve(struct inode *inode)
+{
+ int result;
+
+ assert("nikita-1477", inode != NULL);
+
+ if (!reiser4_inode_get_flag(inode, REISER4_NO_SD)) {
+ reiser4_key sd_key;
+
+ build_sd_key(inode, &sd_key);
+ result = reiser4_cut_tree(meta_subvol_tree(),
+ &sd_key, &sd_key, NULL, 0);
+ if (result == 0) {
+ reiser4_inode_set_flag(inode, REISER4_NO_SD);
+ result = oid_release(inode->i_sb, get_inode_oid(inode));
+ if (result == 0) {
+ oid_count_released();
+
+ result = safe_link_del(get_meta_subvol(),
+ get_inode_oid(inode),
+ SAFE_UNLINK);
+ }
+ }
+ } else
+ result = 0;
+ return result;
+}
+
+/* helper for safelink_common */
+static int process_truncate(struct inode *inode, __u64 size)
+{
+ int result;
+ struct iattr attr;
+ file_plugin *fplug;
+ reiser4_context *ctx;
+ struct dentry dentry;
+
+ assert("vs-21", is_in_reiser4_context());
+ ctx = reiser4_init_context(inode->i_sb);
+ assert("vs-22", !IS_ERR(ctx));
+
+ attr.ia_size = size;
+ attr.ia_valid = ATTR_SIZE | ATTR_CTIME;
+ fplug = inode_file_plugin(inode);
+
+ inode_lock(inode);
+ assert("vs-1704", get_current_context()->trans->atom == NULL);
+ dentry.d_inode = inode;
+ result = inode->i_op->setattr(&dentry, &attr);
+ inode_unlock(inode);
+
+ context_set_commit_async(ctx);
+ reiser4_exit_context(ctx);
+
+ return result;
+}
+
+/*
+ Local variables:
+ c-indentation-style: "K&R"
+ mode-name: "LC"
+ c-basic-offset: 8
+ tab-width: 8
+ fill-column: 80
+ scroll-step: 1
+ End:
+*/
diff -urN --no-dereference linux-5.10.2.orig/fs/reiser4/plugin/hash.c linux-5.10.2/fs/reiser4/plugin/hash.c
--- linux-5.10.2.orig/fs/reiser4/plugin/hash.c 1970-01-01 01:00:00.000000000 +0100
+++ linux-5.10.2/fs/reiser4/plugin/hash.c 2020-12-23 16:07:46.125813231 +0100
@@ -0,0 +1,347 @@
+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
+ * reiser4/README */
+
+/* Hash functions */
+
+#include "../debug.h"
+#include "plugin_header.h"
+#include "plugin.h"
+#include "../super.h"
+#include "../inode.h"
+
+#include <linux/types.h>
+
+/* old rupasov (yura) hash */
+static __u64 hash_rupasov(const unsigned char *name /* name to hash */ ,
+ int len/* @name's length */)
+{
+ int i;
+ int j;
+ int pow;
+ __u64 a;
+ __u64 c;
+
+ assert("nikita-672", name != NULL);
+ assert("nikita-673", len >= 0);
+
+ for (pow = 1, i = 1; i < len; ++i)
+ pow = pow * 10;
+
+ if (len == 1)
+ a = name[0] - 48;
+ else
+ a = (name[0] - 48) * pow;
+
+ for (i = 1; i < len; ++i) {
+ c = name[i] - 48;
+ for (pow = 1, j = i; j < len - 1; ++j)
+ pow = pow * 10;
+ a = a + c * pow;
+ }
+ for (; i < 40; ++i) {
+ c = '0' - 48;
+ for (pow = 1, j = i; j < len - 1; ++j)
+ pow = pow * 10;
+ a = a + c * pow;
+ }
+
+ for (; i < 256; ++i) {
+ c = i;
+ for (pow = 1, j = i; j < len - 1; ++j)
+ pow = pow * 10;
+ a = a + c * pow;
+ }
+
+ a = a << 7;
+ return a;
+}
+
+/* r5 hash */
+static __u64 hash_r5(const unsigned char *name /* name to hash */ ,
+ int len UNUSED_ARG/* @name's length */)
+{
+ __u64 a = 0;
+
+ assert("nikita-674", name != NULL);
+ assert("nikita-675", len >= 0);
+
+ while (*name) {
+ a += *name << 4;
+ a += *name >> 4;
+ a *= 11;
+ name++;
+ }
+ return a;
+}
+
+/* Keyed 32-bit hash function using TEA in a Davies-Meyer construction
+   H0 = Key
+   Hi = E_Mi(Hi-1) + Hi-1
+
+ (see Applied Cryptography, 2nd edition, p448).
+
+ Jeremy Fitzhardinge <jeremy@zip.com.au> 1998
+
+ Jeremy has agreed to the contents of reiserfs/README. -Hans
+
+ This code was blindly upgraded to __u64 by s/__u32/__u64/g.
+*/
+static __u64 hash_tea(const unsigned char *name /* name to hash */ ,
+ int len/* @name's length */)
+{
+ __u64 k[] = { 0x9464a485u, 0x542e1a94u, 0x3e846bffu, 0xb75bcfc3u };
+
+ __u64 h0 = k[0], h1 = k[1];
+ __u64 a, b, c, d;
+ __u64 pad;
+ int i;
+
+ assert("nikita-676", name != NULL);
+ assert("nikita-677", len >= 0);
+
+#define DELTA 0x9E3779B9u
+#define FULLROUNDS 10 /* 32 is overkill, 16 is strong crypto */
+#define PARTROUNDS 6 /* 6 gets complete mixing */
+
+/* a, b, c, d - data; h0, h1 - accumulated hash */
+#define TEACORE(rounds) \
+ do { \
+ __u64 sum = 0; \
+ int n = rounds; \
+ __u64 b0, b1; \
+ \
+ b0 = h0; \
+ b1 = h1; \
+ \
+ do { \
+ sum += DELTA; \
+ b0 += ((b1 << 4)+a) ^ (b1+sum) ^ ((b1 >> 5)+b); \
+ b1 += ((b0 << 4)+c) ^ (b0+sum) ^ ((b0 >> 5)+d); \
+ } while (--n); \
+ \
+ h0 += b0; \
+ h1 += b1; \
+ } while (0)
+
+ pad = (__u64) len | ((__u64) len << 8);
+ pad |= pad << 16;
+
+ while (len >= 16) {
+ a = (__u64) name[0] | (__u64) name[1] << 8 | (__u64) name[2] <<
+ 16 | (__u64) name[3] << 24;
+ b = (__u64) name[4] | (__u64) name[5] << 8 | (__u64) name[6] <<
+ 16 | (__u64) name[7] << 24;
+ c = (__u64) name[8] | (__u64) name[9] << 8 | (__u64) name[10] <<
+ 16 | (__u64) name[11] << 24;
+ d = (__u64) name[12] | (__u64) name[13] << 8 | (__u64) name[14]
+ << 16 | (__u64) name[15] << 24;
+
+ TEACORE(PARTROUNDS);
+
+ len -= 16;
+ name += 16;
+ }
+
+ if (len >= 12) {
+ assert("", len < 16);
+
+ a = (__u64) name[0] | (__u64) name[1] << 8 | (__u64) name[2] <<
+ 16 | (__u64) name[3] << 24;
+ b = (__u64) name[4] | (__u64) name[5] << 8 | (__u64) name[6] <<
+ 16 | (__u64) name[7] << 24;
+ c = (__u64) name[8] | (__u64) name[9] << 8 | (__u64) name[10] <<
+ 16 | (__u64) name[11] << 24;
+
+ d = pad;
+ for (i = 12; i < len; i++) {
+ d <<= 8;
+ d |= name[i];
+ }
+ } else if (len >= 8) {
+ assert("", len < 12);
+
+ a = (__u64) name[0] | (__u64) name[1] << 8 | (__u64) name[2] <<
+ 16 | (__u64) name[3] << 24;
+ b = (__u64) name[4] | (__u64) name[5] << 8 | (__u64) name[6] <<
+ 16 | (__u64) name[7] << 24;
+
+ c = d = pad;
+ for (i = 8; i < len; i++) {
+ c <<= 8;
+ c |= name[i];
+ }
+ } else if (len >= 4) {
+ assert("", len < 8);
+
+ a = (__u64) name[0] | (__u64) name[1] << 8 | (__u64) name[2] <<
+ 16 | (__u64) name[3] << 24;
+
+ b = c = d = pad;
+ for (i = 4; i < len; i++) {
+ b <<= 8;
+ b |= name[i];
+ }
+ } else {
+ assert("", len < 4);
+
+ a = b = c = d = pad;
+ for (i = 0; i < len; i++) {
+ a <<= 8;
+ a |= name[i];
+ }
+ }
+
+ TEACORE(FULLROUNDS);
+
+/* return 0;*/
+ return h0 ^ h1;
+
+}
+
+/* classical 64 bit Fowler/Noll/Vo-1 (FNV-1) hash.
+
+ See http://www.isthe.com/chongo/tech/comp/fnv/ for details.
+
+ Excerpts:
+
+ FNV hashes are designed to be fast while maintaining a low collision
+ rate.
+
+ [This version also seems to preserve lexicographical order locally.]
+
+ FNV hash algorithms and source code have been released into the public
+ domain.
+
+*/
+static __u64 hash_fnv1(const unsigned char *name /* name to hash */ ,
+ int len UNUSED_ARG/* @name's length */)
+{
+ unsigned long long a = 0xcbf29ce484222325ull;
+ const unsigned long long fnv_64_prime = 0x100000001b3ull;
+
+ assert("nikita-678", name != NULL);
+ assert("nikita-679", len >= 0);
+
+ /* FNV-1 hash each octet in the buffer */
+ for (; *name; ++name) {
+ /* multiply by the 32 bit FNV magic prime mod 2^64 */
+ a *= fnv_64_prime;
+ /* xor the bottom with the current octet */
+ a ^= (unsigned long long)(*name);
+ }
+ /* return our new hash value */
+ return a;
+}
+
+/* degenerate hash function used to simplify testing of non-unique key
+ handling */
+static __u64 hash_deg(const unsigned char *name UNUSED_ARG /* name to hash */ ,
+ int len UNUSED_ARG/* @name's length */)
+{
+ return 0xc0c0c0c010101010ull;
+}
+
+static int change_hash(struct inode *inode,
+ reiser4_plugin * plugin,
+ pset_member memb)
+{
+ int result;
+
+ assert("nikita-3503", inode != NULL);
+ assert("nikita-3504", plugin != NULL);
+
+ assert("nikita-3505", is_reiser4_inode(inode));
+ assert("nikita-3507", plugin->h.type_id == REISER4_HASH_PLUGIN_TYPE);
+
+ if (!plugin_of_group(inode_file_plugin(inode), REISER4_DIRECTORY_FILE))
+ return RETERR(-EINVAL);
+
+ result = 0;
+ if (inode_hash_plugin(inode) == NULL ||
+ inode_hash_plugin(inode)->h.id != plugin->h.id) {
+ if (is_dir_empty(inode) == 0)
+ result = aset_set_unsafe(&reiser4_inode_data(inode)->pset,
+ PSET_HASH, plugin);
+ else
+ result = RETERR(-ENOTEMPTY);
+
+ }
+ return result;
+}
+
+static reiser4_plugin_ops hash_plugin_ops = {
+ .init = NULL,
+ .load = NULL,
+ .save_len = NULL,
+ .save = NULL,
+ .change = change_hash
+};
+
+/* hash plugins */
+hash_plugin hash_plugins[LAST_HASH_ID] = {
+ [RUPASOV_HASH_ID] = {
+ .h = {
+ .type_id = REISER4_HASH_PLUGIN_TYPE,
+ .id = RUPASOV_HASH_ID,
+ .pops = &hash_plugin_ops,
+ .label = "rupasov",
+ .desc = "Original Yura's hash",
+ .linkage = {NULL, NULL}
+ },
+ .hash = hash_rupasov
+ },
+ [R5_HASH_ID] = {
+ .h = {
+ .type_id = REISER4_HASH_PLUGIN_TYPE,
+ .id = R5_HASH_ID,
+ .pops = &hash_plugin_ops,
+ .label = "r5",
+ .desc = "r5 hash",
+ .linkage = {NULL, NULL}
+ },
+ .hash = hash_r5
+ },
+ [TEA_HASH_ID] = {
+ .h = {
+ .type_id = REISER4_HASH_PLUGIN_TYPE,
+ .id = TEA_HASH_ID,
+ .pops = &hash_plugin_ops,
+ .label = "tea",
+ .desc = "tea hash",
+ .linkage = {NULL, NULL}
+ },
+ .hash = hash_tea
+ },
+ [FNV1_HASH_ID] = {
+ .h = {
+ .type_id = REISER4_HASH_PLUGIN_TYPE,
+ .id = FNV1_HASH_ID,
+ .pops = &hash_plugin_ops,
+ .label = "fnv1",
+ .desc = "fnv1 hash",
+ .linkage = {NULL, NULL}
+ },
+ .hash = hash_fnv1
+ },
+ [DEGENERATE_HASH_ID] = {
+ .h = {
+ .type_id = REISER4_HASH_PLUGIN_TYPE,
+ .id = DEGENERATE_HASH_ID,
+ .pops = &hash_plugin_ops,
+ .label = "degenerate hash",
+ .desc = "Degenerate hash: only for testing",
+ .linkage = {NULL, NULL}
+ },
+ .hash = hash_deg
+ }
+};
+
+/* Make Linus happy.
+ Local variables:
+ c-indentation-style: "K&R"
+ mode-name: "LC"
+ c-basic-offset: 8
+ tab-width: 8
+ fill-column: 120
+ End:
+*/
diff -urN --no-dereference linux-5.10.2.orig/fs/reiser4/plugin/inode_ops.c linux-5.10.2/fs/reiser4/plugin/inode_ops.c
--- linux-5.10.2.orig/fs/reiser4/plugin/inode_ops.c 1970-01-01 01:00:00.000000000 +0100
+++ linux-5.10.2/fs/reiser4/plugin/inode_ops.c 2020-12-23 16:07:46.126813246 +0100
@@ -0,0 +1,909 @@
+/*
+ * Copyright 2005 by Hans Reiser, licensing governed by reiser4/README
+ */
+
+/*
+ * this file contains typical implementations for most methods of struct
+ * inode_operations
+ */
+
+#include "../inode.h"
+#include "../safe_link.h"
+
+#include <linux/namei.h>
+
+static int create_vfs_object(struct inode *parent, struct dentry *dentry,
+ reiser4_object_create_data *data);
+
+/**
+ * reiser4_create_common - create of inode operations
+ * @parent: inode of parent directory
+ * @dentry: dentry of new object to create
+ * @mode: the permissions to use
+ * @exclusive:
+ *
+ * This is common implementation of vfs's create method of struct
+ * inode_operations.
+ * Creates regular file using file plugin from parent directory plugin set.
+ */
+int reiser4_create_common(struct inode *parent, struct dentry *dentry,
+ umode_t mode, bool exclusive)
+{
+ reiser4_object_create_data data;
+ file_plugin *fplug;
+
+ memset(&data, 0, sizeof data);
+ data.mode = S_IFREG | mode;
+ fplug = child_create_plugin(parent) ? : inode_create_plugin(parent);
+ if (!plugin_of_group(fplug, REISER4_REGULAR_FILE)) {
+ warning("vpf-1900", "'%s' is not a regular file plugin.",
+ fplug->h.label);
+ return RETERR(-EIO);
+ }
+ data.id = fplug->h.id;
+ return create_vfs_object(parent, dentry, &data);
+}
+
+int reiser4_lookup_name(struct inode *dir, struct dentry *, reiser4_key *);
+void check_light_weight(struct inode *inode, struct inode *parent);
+
+/**
+ * reiser4_lookup_common - lookup of inode operations
+ * @parent: inode of directory to lookup into
+ * @dentry: name to look for
+ * @flags:
+ *
+ * This is common implementation of vfs's lookup method of struct
+ * inode_operations.
+ */
+struct dentry *reiser4_lookup_common(struct inode *parent,
+ struct dentry *dentry,
+ unsigned int flags)
+{
+ reiser4_context *ctx;
+ int result;
+ struct dentry *new;
+ struct inode *inode;
+ reiser4_dir_entry_desc entry;
+
+ ctx = reiser4_init_context(parent->i_sb);
+ if (IS_ERR(ctx))
+ return (struct dentry *)ctx;
+
+ /* set up operations on dentry. */
+ dentry->d_op = &get_super_private(parent->i_sb)->ops.dentry;
+
+ result = reiser4_lookup_name(parent, dentry, &entry.key);
+ if (result) {
+ context_set_commit_async(ctx);
+ reiser4_exit_context(ctx);
+ if (result == -ENOENT) {
+ /* object not found */
+ if (!IS_DEADDIR(parent))
+ d_add(dentry, NULL);
+ return NULL;
+ }
+ return ERR_PTR(result);
+ }
+
+ inode = reiser4_iget(parent->i_sb, &entry.key, FIND_EXACT, 0);
+ if (IS_ERR(inode)) {
+ context_set_commit_async(ctx);
+ reiser4_exit_context(ctx);
+ return ERR_PTR(PTR_ERR(inode));
+ }
+
+ /* success */
+ check_light_weight(inode, parent);
+ new = d_splice_alias(inode, dentry);
+ reiser4_iget_complete(inode);
+
+ /* prevent balance_dirty_pages() from being called: we don't want to
+ * do this under directory i_mutex. */
+ context_set_commit_async(ctx);
+ reiser4_exit_context(ctx);
+ return new;
+}
+
+static reiser4_block_nr common_estimate_link(struct inode *parent,
+ struct inode *object);
+int reiser4_update_dir(struct inode *);
+
+static inline void reiser4_check_immutable(struct inode *inode)
+{
+ do {
+ if (!reiser4_inode_get_flag(inode, REISER4_IMMUTABLE))
+ break;
+ yield();
+ } while (1);
+}
+
+/**
+ * reiser4_link_common - link of inode operations
+ * @existing: dentry of object which is to get new name
+ * @parent: directory where new name is to be created
+ * @newname: new name
+ *
+ * This is common implementation of vfs's link method of struct
+ * inode_operations.
+ */
+int reiser4_link_common(struct dentry *existing, struct inode *parent,
+ struct dentry *newname)
+{
+ reiser4_context *ctx;
+ int result;
+ struct inode *object;
+ dir_plugin *parent_dplug;
+ reiser4_dir_entry_desc entry;
+ reiser4_object_create_data data;
+ reiser4_block_nr reserve;
+
+ ctx = reiser4_init_context(parent->i_sb);
+ if (IS_ERR(ctx))
+ return PTR_ERR(ctx);
+
+ assert("nikita-1431", existing != NULL);
+ assert("nikita-1432", parent != NULL);
+ assert("nikita-1433", newname != NULL);
+
+ object = existing->d_inode;
+ assert("nikita-1434", object != NULL);
+
+ /* check for race with create_object() */
+ reiser4_check_immutable(object);
+
+ parent_dplug = inode_dir_plugin(parent);
+
+ memset(&entry, 0, sizeof entry);
+ entry.obj = object;
+
+ data.mode = object->i_mode;
+ data.id = inode_file_plugin(object)->h.id;
+
+ reserve = common_estimate_link(parent, existing->d_inode);
+ if ((__s64) reserve < 0) {
+ context_set_commit_async(ctx);
+ reiser4_exit_context(ctx);
+ return reserve;
+ }
+ if (reiser4_grab_space(reserve, BA_CAN_COMMIT, get_meta_subvol())) {
+ context_set_commit_async(ctx);
+ reiser4_exit_context(ctx);
+ return RETERR(-ENOSPC);
+ }
+ /*
+ * Subtle race handling: sys_link() doesn't take i_mutex on @parent. It
+ * means that link(2) can race against unlink(2) or rename(2), and
+ * inode is dead (->i_nlink == 0) when reiser4_link() is entered.
+ *
+ * For such inode we have to undo special processing done in
+ * reiser4_unlink() viz. creation of safe-link.
+ */
+ if (unlikely(object->i_nlink == 0)) {
+ result = safe_link_del(get_meta_subvol(),
+ get_inode_oid(object), SAFE_UNLINK);
+ if (result != 0) {
+ context_set_commit_async(ctx);
+ reiser4_exit_context(ctx);
+ return result;
+ }
+ }
+
+ /* increment nlink of @existing and update its stat data */
+ result = reiser4_add_nlink(object, parent, 1);
+ if (result == 0) {
+ /* add entry to the parent */
+ result =
+ parent_dplug->add_entry(parent, newname, &data, &entry);
+ if (result != 0) {
+ /* failed to add entry to the parent, decrement nlink
+ of @existing */
+ reiser4_del_nlink(object, parent, 1);
+ /*
+ * now, if that failed, we have a file with too big
+ * nlink---space leak, much better than directory
+ * entry pointing to nowhere
+ */
+ }
+ }
+ if (result == 0) {
+ atomic_inc(&object->i_count);
+ /*
+ * Upon successful completion, link() shall mark for update
+ * the st_ctime field of the file. Also, the st_ctime and
+ * st_mtime fields of the directory that contains the new
+ * entry shall be marked for update. --SUS
+ */
+ result = reiser4_update_dir(parent);
+ }
+ if (result == 0)
+ d_instantiate(newname, existing->d_inode);
+
+ context_set_commit_async(ctx);
+ reiser4_exit_context(ctx);
+ return result;
+}
+
+static int unlink_check_and_grab(struct inode *parent, struct dentry *victim);
+
+/**
+ * reiser4_unlink_common - unlink of inode operations
+ * @parent: inode of directory to remove name from
+ * @victim: name to be removed
+ *
+ * This is common implementation of vfs's unlink method of struct
+ * inode_operations.
+ */
+int reiser4_unlink_common(struct inode *parent, struct dentry *victim)
+{
+ reiser4_context *ctx;
+ int result;
+ struct inode *object;
+ file_plugin *fplug;
+
+ ctx = reiser4_init_context(parent->i_sb);
+ if (IS_ERR(ctx))
+ return PTR_ERR(ctx);
+
+ object = victim->d_inode;
+ fplug = inode_file_plugin(object);
+ assert("nikita-2882", fplug->detach != NULL);
+
+ result = unlink_check_and_grab(parent, victim);
+ if (result != 0) {
+ context_set_commit_async(ctx);
+ reiser4_exit_context(ctx);
+ return result;
+ }
+
+ result = fplug->detach(object, parent);
+ if (result == 0) {
+ dir_plugin *parent_dplug;
+ reiser4_dir_entry_desc entry;
+
+ parent_dplug = inode_dir_plugin(parent);
+ memset(&entry, 0, sizeof entry);
+
+ /* first, delete directory entry */
+ result = parent_dplug->rem_entry(parent, victim, &entry);
+ if (result == 0) {
+ /*
+ * if name was removed successfully, we _have_ to
+ * return 0 from this function, because upper level
+ * caller (vfs_{rmdir,unlink}) expect this.
+ *
+ * now that directory entry is removed, update
+ * stat-data
+ */
+ reiser4_del_nlink(object, parent, 1);
+ /*
+ * Upon successful completion, unlink() shall mark for
+ * update the st_ctime and st_mtime fields of the
+ * parent directory. Also, if the file's link count is
+ * not 0, the st_ctime field of the file shall be
+ * marked for update. --SUS
+ */
+ reiser4_update_dir(parent);
+ /* add safe-link for this file */
+ if (object->i_nlink == 0)
+ safe_link_add(object, SAFE_UNLINK);
+ }
+ }
+
+ if (unlikely(result != 0)) {
+ if (result != -ENOMEM)
+ warning("nikita-3398", "Cannot unlink %llu (%i)",
+ (unsigned long long)get_inode_oid(object),
+ result);
+ /* if operation failed commit pending inode modifications to
+ * the stat-data */
+ reiser4_update_sd(object);
+ reiser4_update_sd(parent);
+ }
+
+ reiser4_release_reserved(object->i_sb);
+
+ /* @object's i_ctime was updated by ->rem_link() method(). */
+
+ /* @victim can be already removed from the disk by this time. Inode is
+ then marked so that iput() wouldn't try to remove stat data. But
+ inode itself is still there.
+ */
+
+ /*
+ * we cannot release directory semaphore here, because name has
+ * already been deleted, but dentry (@victim) still exists. Prevent
+ * balance_dirty_pages() from being called on exiting this context: we
+ * don't want to do this under directory i_mutex.
+ */
+ context_set_commit_async(ctx);
+ reiser4_exit_context(ctx);
+ return result;
+}
+
+/**
+ * reiser4_symlink_common - symlink of inode operations
+ * @parent: inode of parent directory
+ * @dentry: dentry of object to be created
+ * @linkname: string symlink is to contain
+ *
+ * This is common implementation of vfs's symlink method of struct
+ * inode_operations.
+ * Creates object using file plugin SYMLINK_FILE_PLUGIN_ID.
+ */
+int reiser4_symlink_common(struct inode *parent, struct dentry *dentry,
+ const char *linkname)
+{
+ reiser4_object_create_data data;
+
+ memset(&data, 0, sizeof data);
+ data.name = linkname;
+ data.id = SYMLINK_FILE_PLUGIN_ID;
+ data.mode = S_IFLNK | S_IRWXUGO;
+ return create_vfs_object(parent, dentry, &data);
+}
+
+/**
+ * reiser4_mkdir_common - mkdir of inode operations
+ * @parent: inode of parent directory
+ * @dentry: dentry of object to be created
+ * @mode: the permissions to use
+ *
+ * This is common implementation of vfs's mkdir method of struct
+ * inode_operations.
+ * Creates object using file plugin DIRECTORY_FILE_PLUGIN_ID.
+ */
+int reiser4_mkdir_common(struct inode *parent, struct dentry *dentry, umode_t mode)
+{
+ reiser4_object_create_data data;
+
+ memset(&data, 0, sizeof data);
+ data.mode = S_IFDIR | mode;
+ data.id = DIRECTORY_FILE_PLUGIN_ID;
+ return create_vfs_object(parent, dentry, &data);
+}
+
+/**
+ * reiser4_mknod_common - mknod of inode operations
+ * @parent: inode of parent directory
+ * @dentry: dentry of object to be created
+ * @mode: the permissions to use and file type
+ * @rdev: minor and major of new device file
+ *
+ * This is common implementation of vfs's mknod method of struct
+ * inode_operations.
+ * Creates object using file plugin SPECIAL_FILE_PLUGIN_ID.
+ */
+int reiser4_mknod_common(struct inode *parent, struct dentry *dentry,
+ umode_t mode, dev_t rdev)
+{
+ reiser4_object_create_data data;
+
+ memset(&data, 0, sizeof data);
+ data.mode = mode;
+ data.rdev = rdev;
+ data.id = SPECIAL_FILE_PLUGIN_ID;
+ return create_vfs_object(parent, dentry, &data);
+}
+
+/*
+ * implementation of vfs's rename method of struct inode_operations for typical
+ * directory is in inode_ops_rename.c
+ */
+
+/**
+ * reiser4_get_link_common: ->get_link() of inode_operations
+ * @dentry: dentry of symlink
+ *
+ * Assumes that inode's i_private points to the content of symbolic link.
+ */
+const char *reiser4_get_link_common(struct dentry *dentry,
+ struct inode *inode,
+ struct delayed_call *done)
+{
+ if (!dentry)
+ return ERR_PTR(-ECHILD);
+
+ assert("vs-851", S_ISLNK(dentry->d_inode->i_mode));
+
+ if (!dentry->d_inode->i_private ||
+ !reiser4_inode_get_flag(dentry->d_inode, REISER4_GENERIC_PTR_USED))
+ return ERR_PTR(RETERR(-EINVAL));
+
+ return dentry->d_inode->i_private;
+}
+
+/**
+ * reiser4_permission_common - permission of inode operations
+ * @inode: inode to check permissions for
+ * @mask: mode bits to check permissions for
+ * @flags:
+ *
+ * Uses generic function to check for rwx permissions.
+ */
+int reiser4_permission_common(struct inode *inode, int mask)
+{
+ // generic_permission() says that it's rcu-aware...
+#if 0
+ if (mask & MAY_NOT_BLOCK)
+ return -ECHILD;
+#endif
+ return generic_permission(inode, mask);
+}
+
+static int setattr_reserve(struct inode *);
+
+/* this is common implementation of vfs's setattr method of struct
+ inode_operations
+*/
+int reiser4_setattr_common(struct dentry *dentry, struct iattr *attr)
+{
+ reiser4_context *ctx;
+ struct inode *inode;
+ int result;
+
+ inode = dentry->d_inode;
+ result = setattr_prepare(dentry, attr);
+ if (result)
+ return result;
+
+ ctx = reiser4_init_context(inode->i_sb);
+ if (IS_ERR(ctx))
+ return PTR_ERR(ctx);
+
+ assert("nikita-3119", !(attr->ia_valid & ATTR_SIZE));
+
+ /*
+ * grab disk space and call standard
+ * setattr_copy();
+ * mark_inode_dirty().
+ */
+ result = setattr_reserve(inode);
+ if (!result) {
+ setattr_copy(inode, attr);
+ mark_inode_dirty(inode);
+ result = reiser4_update_sd(inode);
+ }
+ context_set_commit_async(ctx);
+ reiser4_exit_context(ctx);
+ return result;
+}
+
+/* this is common implementation of vfs's getattr method of struct
+ inode_operations
+*/
+int reiser4_getattr_common(const struct path *path, struct kstat *stat,
+ u32 request_mask, unsigned int flags)
+{
+ struct inode *obj;
+
+ assert("nikita-2298", path != NULL);
+ assert("nikita-2299", stat != NULL);
+
+ obj = d_inode(path->dentry);
+
+ stat->dev = obj->i_sb->s_dev;
+ stat->ino = oid_to_uino(get_inode_oid(obj));
+ stat->mode = obj->i_mode;
+	/* don't confuse userland with a huge nlink. This is not entirely
+	 * correct, because nlink_t is not necessarily a signed 16 bit type. */
+ stat->nlink = min(obj->i_nlink, (typeof(obj->i_nlink)) 0x7fff);
+ stat->uid = obj->i_uid;
+ stat->gid = obj->i_gid;
+ stat->rdev = obj->i_rdev;
+ stat->atime = obj->i_atime;
+ stat->mtime = obj->i_mtime;
+ stat->ctime = obj->i_ctime;
+ stat->size = obj->i_size;
+ stat->blocks =
+ (inode_get_bytes(obj) + VFS_BLKSIZE - 1) >> VFS_BLKSIZE_BITS;
+ /* "preferred" blocksize for efficient file system I/O */
+ stat->blksize = get_super_private(obj->i_sb)->optimal_io_size;
+
+ return 0;
+}
+
+/* Estimate the maximum number of nodes which might be allocated or changed on
+   typical new object creation. Typical creation consists of calling the create
+   method of the file plugin, adding a directory entry to the parent and
+   updating the parent directory's stat data.
+*/
+static reiser4_block_nr estimate_create_vfs_object(struct inode *parent,
+ /* parent object */
+ struct inode *object
+ /* object */)
+{
+ assert("vpf-309", parent != NULL);
+ assert("vpf-307", object != NULL);
+
+ return
+ /* object creation estimation */
+ inode_file_plugin(object)->estimate.create(object) +
+ /* stat data of parent directory estimation */
+ inode_file_plugin(parent)->estimate.update(parent) +
+ /* adding entry estimation */
+ inode_dir_plugin(parent)->estimate.add_entry(parent) +
+ /* to undo in the case of failure */
+ inode_dir_plugin(parent)->estimate.rem_entry(parent);
+}
+
+/**
+ * Create child in a directory.
+ *
+ * . get object's plugin
+ * . get fresh inode
+ * . initialize inode
+ * . add object's stat-data
+ * . initialize object's directory
+ * . add entry to the parent
+ * . instantiate dentry
+ *
+ * @data - parameters of new object
+ */
+static int do_create_vfs_child(reiser4_object_create_data *data,
+ struct inode **retobj)
+{
+ int result;
+
+	struct dentry *dentry;	/* new name */
+	struct inode *parent;	/* parent object */
+ oid_t oid; /* new object id */
+
+ dir_plugin *par_dir; /* directory plugin on the parent */
+ dir_plugin *obj_dir; /* directory plugin on the new object */
+ file_plugin *obj_plug; /* object plugin on the new object */
+ struct inode *object; /* new object */
+
+ reiser4_dir_entry_desc entry; /* new directory entry */
+
+ assert("nikita-1420", data != NULL);
+ parent = data->parent;
+ dentry = data->dentry;
+
+ assert("nikita-1418", parent != NULL);
+ assert("nikita-1419", dentry != NULL);
+
+ /* check, that name is acceptable for parent */
+ par_dir = inode_dir_plugin(parent);
+ if (par_dir->is_name_acceptable &&
+ !par_dir->is_name_acceptable(parent,
+ dentry->d_name.name,
+ (int)dentry->d_name.len))
+ return RETERR(-ENAMETOOLONG);
+
+ result = 0;
+ obj_plug = file_plugin_by_id((int)data->id);
+ if (obj_plug == NULL) {
+ warning("nikita-430", "Cannot find plugin %i", data->id);
+ return RETERR(-ENOENT);
+ }
+ /*
+ * allocate object id for the new object
+ */
+ oid = oid_allocate(parent->i_sb);
+ if (oid == ABSOLUTE_MAX_OID)
+ return RETERR(-EOVERFLOW);
+
+ object = new_inode(parent->i_sb);
+ if (object == NULL)
+ return RETERR(-ENOMEM);
+ /*
+ * new_inode() initializes i_ino to "arbitrary" value. Reset it to 0,
+ * to simplify error handling: if some error occurs before i_ino is
+ * initialized with oid, i_ino should already be set to some
+ * distinguished value
+ */
+ object->i_ino = 0;
+
+ /* So that on error iput will be called. */
+ *retobj = object;
+
+ memset(&entry, 0, sizeof entry);
+ entry.obj = object;
+
+ set_plugin(&reiser4_inode_data(object)->pset, PSET_FILE,
+ file_plugin_to_plugin(obj_plug));
+
+ result = obj_plug->set_plug_in_inode(object, parent, data);
+ if (result) {
+ warning("nikita-431", "Cannot install plugin %i on %llx",
+ data->id, (unsigned long long)get_inode_oid(object));
+ return result;
+ }
+ /*
+ * reget plugin after installation
+ */
+ obj_plug = inode_file_plugin(object);
+
+ if (obj_plug->create_object == NULL)
+ return RETERR(-EPERM);
+ /*
+	 * if any of the hash, tail, sd or permission plugins for the newly
+	 * created object are not set yet, set them here, inheriting them from
+	 * the parent directory
+ */
+ assert("nikita-2070", obj_plug->adjust_to_parent != NULL);
+ result = obj_plug->adjust_to_parent(object,
+ parent,
+ object->i_sb->s_root->d_inode);
+ if (result == 0)
+ result = finish_pset(object);
+ if (result != 0) {
+ warning("nikita-432", "Cannot inherit from %llx to %llx",
+ (unsigned long long)get_inode_oid(parent),
+ (unsigned long long)get_inode_oid(object));
+ return result;
+ }
+ /*
+ * setup inode and file-operations for this inode
+ */
+ setup_inode_ops(object, data);
+ /*
+ * call file plugin's method to initialize plugin specific part of
+ * inode
+ */
+ if (obj_plug->init_inode_data)
+ obj_plug->init_inode_data(object, data, NULL, 1 /*create */);
+ /*
+ * obtain directory plugin (if any) for new object
+ */
+ obj_dir = inode_dir_plugin(object);
+ if (obj_dir != NULL && obj_dir->init == NULL)
+ return RETERR(-EPERM);
+ reiser4_inode_data(object)->locality_id = get_inode_oid(parent);
+
+ if (reiser4_grab_space(estimate_create_vfs_object(parent, object),
+ BA_CAN_COMMIT, get_meta_subvol()))
+ return RETERR(-ENOSPC);
+ /*
+	   mark inode `immutable'. We disable changes to the file being
+	   created until a valid directory entry for it is inserted. Otherwise,
+	   if the file were expanded and insertion of the directory entry
+	   failed, we would have to remove the file, but we only allotted
+	   enough space in the transaction to remove an _empty_ file. 3.x code
+	   used to remove stat data in a different transaction, thus possibly
+	   leaking disk space on crash. This all only matters if it's possible
+	   to access a file without a name, for example, by inode number
+ */
+ reiser4_inode_set_flag(object, REISER4_IMMUTABLE);
+
+ /* create empty object, this includes allocation of new objectid. For
+ directories this implies creation of dot and dotdot */
+ assert("nikita-2265", reiser4_inode_get_flag(object, REISER4_NO_SD));
+
+ /* mark inode as `loaded'. From this point onward
+ reiser4_delete_inode() will try to remove its stat-data. */
+ reiser4_inode_set_flag(object, REISER4_LOADED);
+
+ result = obj_plug->create_object(object, parent, data, &oid);
+ if (result != 0) {
+ reiser4_inode_clr_flag(object, REISER4_IMMUTABLE);
+ if (result != -ENAMETOOLONG && result != -ENOMEM)
+ warning("nikita-2219",
+ "Failed to create sd for %llu",
+ (unsigned long long)get_inode_oid(object));
+ return result;
+ }
+
+ if (obj_dir != NULL)
+ result = obj_dir->init(object, parent, data);
+ if (result == 0) {
+ assert("nikita-434", !reiser4_inode_get_flag(object,
+ REISER4_NO_SD));
+ /* insert inode into VFS hash table */
+ insert_inode_hash(object);
+ /* create entry */
+ result = par_dir->add_entry(parent, dentry, data, &entry);
+ if (result == 0) {
+ /* If O_CREAT is set and the file did not previously
+ exist, upon successful completion, open() shall
+ mark for update the st_atime, st_ctime, and
+ st_mtime fields of the file and the st_ctime and
+ st_mtime fields of the parent directory. --SUS
+ */
+ object->i_ctime = current_time(object);
+ reiser4_update_dir(parent);
+ }
+ if (result != 0)
+ /* cleanup failure to add entry */
+ obj_plug->detach(object, parent);
+ } else if (result != -ENOMEM)
+ warning("nikita-2219", "Failed to initialize dir for %llu: %i",
+ (unsigned long long)get_inode_oid(object), result);
+
+ /*
+ * update stat-data, committing all pending modifications to the inode
+ * fields.
+ */
+ reiser4_update_sd(object);
+ if (result != 0) {
+		/* if everything was ok (result == 0), parent stat-data was
+		 * already updated above (reiser4_update_dir()) */
+ reiser4_update_sd(parent);
+ /* failure to create entry, remove object */
+ obj_plug->delete_object(object);
+ }
+
+ /* file has name now, clear immutable flag */
+ reiser4_inode_clr_flag(object, REISER4_IMMUTABLE);
+
+	/* on error, iput() will call ->delete_inode(). We should keep track
+	   of the existence of stat-data for this inode and avoid attempting
+	   to remove it in reiser4_delete_inode(). This is accomplished via
+	   the REISER4_NO_SD bit in inode.u.reiser4_i.plugin.flags
+ */
+ return result;
+}
+
+/* this is helper for common implementations of reiser4_mkdir, reiser4_create,
+ reiser4_mknod and reiser4_symlink
+*/
+static int
+create_vfs_object(struct inode *parent,
+ struct dentry *dentry, reiser4_object_create_data * data)
+{
+ reiser4_context *ctx;
+ int result;
+ struct inode *child;
+
+ ctx = reiser4_init_context(parent->i_sb);
+ if (IS_ERR(ctx))
+ return PTR_ERR(ctx);
+ context_set_commit_async(ctx);
+
+ data->parent = parent;
+ data->dentry = dentry;
+ child = NULL;
+ result = do_create_vfs_child(data, &child);
+ if (unlikely(result != 0)) {
+ if (child != NULL) {
+ /* for unlinked inode accounting in iput() */
+ clear_nlink(child);
+ reiser4_make_bad_inode(child);
+ iput(child);
+ }
+ } else
+ d_instantiate(dentry, child);
+
+ reiser4_exit_context(ctx);
+ return result;
+}
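+
+/*
+ * A minimal sketch (not the actual entry points defined elsewhere in this
+ * patch) of how a VFS ->create() style operation might drive
+ * create_vfs_object(): fill a reiser4_object_create_data with the desired
+ * file plugin id and mode, then let the helper handle estimation, space
+ * grabbing and directory entry insertion. The field and plugin names below
+ * are assumptions for illustration only.
+ *
+ *	static int example_create(struct inode *parent, struct dentry *d,
+ *				  umode_t mode)
+ *	{
+ *		reiser4_object_create_data data;
+ *
+ *		memset(&data, 0, sizeof data);
+ *		data.mode = S_IFREG | mode;
+ *		data.id = UNIX_FILE_PLUGIN_ID;
+ *		return create_vfs_object(parent, d, &data);
+ *	}
+ */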
+
+/**
+ * helper for link_common. Estimate disk space necessary to add a link
+ * from @parent to @object
+ */
+static reiser4_block_nr common_estimate_link(struct inode *parent /* parent
+ * directory
+ */,
+ struct inode *object /* object to
+ * which new
+ * link is
+ * being
+ * created */)
+{
+ reiser4_block_nr res = 0;
+ file_plugin *fplug;
+ dir_plugin *dplug;
+
+ assert("vpf-317", object != NULL);
+ assert("vpf-318", parent != NULL);
+
+ fplug = inode_file_plugin(object);
+ dplug = inode_dir_plugin(parent);
+ /* VS-FIXME-HANS: why do we do fplug->estimate.update(object) twice
+ * instead of multiplying by 2? */
+ /* reiser4_add_nlink(object) */
+ res += fplug->estimate.update(object);
+ /* add_entry(parent) */
+ res += dplug->estimate.add_entry(parent);
+ /* reiser4_del_nlink(object) */
+ res += fplug->estimate.update(object);
+ /* update_dir(parent) */
+ res += inode_file_plugin(parent)->estimate.update(parent);
+ /* safe-link */
+ res += estimate_one_item_removal(meta_subvol_tree());
+
+ return res;
+}
+
+/* Estimate disk space necessary to remove a link between @parent and
+ @object.
+*/
+static reiser4_block_nr estimate_unlink(struct inode *parent /* parent
+ * directory */,
+					struct inode *object	/* object whose
+							 * link is being
+							 * removed
+							 */)
+{
+ reiser4_block_nr res = 0;
+ file_plugin *fplug;
+ dir_plugin *dplug;
+
+ assert("vpf-317", object != NULL);
+ assert("vpf-318", parent != NULL);
+
+ fplug = inode_file_plugin(object);
+ dplug = inode_dir_plugin(parent);
+
+ /* rem_entry(parent) */
+ res += dplug->estimate.rem_entry(parent);
+ /* reiser4_del_nlink(object) */
+ res += fplug->estimate.update(object);
+ /* update_dir(parent) */
+ res += inode_file_plugin(parent)->estimate.update(parent);
+ /* fplug->unlink */
+ res += fplug->estimate.unlink(object, parent);
+ /* safe-link */
+ res += estimate_one_insert_item(meta_subvol_tree());
+
+ return res;
+}
+
+/**
+ * helper for reiser4_unlink_common. Estimate and grab space for unlink
+ */
+static int unlink_check_and_grab(struct inode *parent, struct dentry *victim)
+{
+ file_plugin *fplug;
+ struct inode *child;
+ int result;
+
+ result = 0;
+ child = victim->d_inode;
+ fplug = inode_file_plugin(child);
+
+ /* check for race with create_object() */
+ reiser4_check_immutable(child);
+
+ /* object being deleted should have stat data */
+ assert("vs-949", !reiser4_inode_get_flag(child, REISER4_NO_SD));
+
+ /* ask object plugin */
+ if (fplug->can_rem_link != NULL && !fplug->can_rem_link(child))
+ return RETERR(-ENOTEMPTY);
+
+ result = (int)estimate_unlink(parent, child);
+ if (result < 0)
+ return result;
+
+ return reiser4_grab_reserved(child->i_sb, result,
+ BA_CAN_COMMIT, get_meta_subvol());
+}
+
+/**
+ * Helper for reiser4_setattr_common;
+ * Reserve space for stat-data update
+ */
+static int setattr_reserve(struct inode *inode)
+{
+ reiser4_subvol *subv = get_meta_subvol();
+
+ assert("edward-1793", subv != NULL);
+ assert("vs-1096", is_grab_enabled(get_current_context()));
+
+ return reiser4_grab_space(estimate_one_insert_into_item(&subv->tree),
+ BA_CAN_COMMIT, subv);
+}
+
+/* helper function. Standards require that for many file-system operations
+   on success the ctime and mtime of the parent directory are to be updated. */
+int reiser4_update_dir(struct inode *dir)
+{
+ assert("nikita-2525", dir != NULL);
+
+ dir->i_ctime = dir->i_mtime = current_time(dir);
+ return reiser4_update_sd(dir);
+}
+
+/*
+ Local variables:
+ c-indentation-style: "K&R"
+ mode-name: "LC"
+ c-basic-offset: 8
+ tab-width: 8
+ fill-column: 80
+ scroll-step: 1
+ End:
+*/
diff -urN --no-dereference linux-5.10.2.orig/fs/reiser4/plugin/inode_ops_rename.c linux-5.10.2/fs/reiser4/plugin/inode_ops_rename.c
--- linux-5.10.2.orig/fs/reiser4/plugin/inode_ops_rename.c 1970-01-01 01:00:00.000000000 +0100
+++ linux-5.10.2/fs/reiser4/plugin/inode_ops_rename.c 2020-12-23 16:07:46.126813246 +0100
@@ -0,0 +1,957 @@
+/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by
+ * reiser4/README */
+
+#include "../inode.h"
+#include "../safe_link.h"
+
+static const char *possible_leak = "Possible disk space leak.";
+
+/* re-bind existing name at @from_coord in @from_dir to point to @to_inode.
+
+ Helper function called from hashed_rename() */
+static int replace_name(struct inode *to_inode, /* inode where @from_coord is
+ * to be re-targeted at */
+ struct inode *from_dir, /* directory where @from_coord
+ * lives */
+ struct inode *from_inode, /* inode @from_coord
+							 * originally points to */
+ coord_t *from_coord, /* where directory entry is in
+ * the tree */
+ lock_handle * from_lh/* lock handle on @from_coord */)
+{
+ item_plugin *from_item;
+ int result;
+ znode *node;
+
+ coord_clear_iplug(from_coord);
+ node = from_coord->node;
+ result = zload(node);
+ if (result != 0)
+ return result;
+ from_item = item_plugin_by_coord(from_coord);
+ if (plugin_of_group(item_plugin_by_coord(from_coord),
+ DIR_ENTRY_ITEM_TYPE)) {
+ reiser4_key to_key;
+
+ build_sd_key(to_inode, &to_key);
+
+ /* everything is found and prepared to change directory entry
+ at @from_coord to point to @to_inode.
+
+ @to_inode is just about to get new name, so bump its link
+ counter.
+
+ */
+ result = reiser4_add_nlink(to_inode, from_dir, 0);
+ if (result != 0) {
+ /* Don't issue warning: this may be plain -EMLINK */
+ zrelse(node);
+ return result;
+ }
+
+ result =
+ from_item->s.dir.update_key(from_coord, &to_key, from_lh);
+ if (result != 0) {
+ reiser4_del_nlink(to_inode, from_dir, 0);
+ zrelse(node);
+ return result;
+ }
+
+ /* @from_inode just lost its name, he-he.
+
+		   If @from_inode was a directory, it contained a dotdot entry
+		   pointing to @from_dir. @from_dir's i_nlink will be decreased
+		   when iput() is called on @from_inode.
+
+		   If the file-system is not ADG (hard-links are supported on
+		   directories), iput(from_inode) will not remove @from_inode,
+		   and thus the above is incorrect, but hard-links on
+		   directories are problematic in many other respects.
+ */
+ result = reiser4_del_nlink(from_inode, from_dir, 0);
+ if (result != 0) {
+ warning("nikita-2330",
+ "Cannot remove link from source: %i. %s",
+ result, possible_leak);
+ }
+ /* Has to return success, because entry is already
+ * modified. */
+ result = 0;
+
+		/* NOTE-NIKITA consider calling a plugin method instead of
+		   accessing inode fields directly. */
+ from_dir->i_mtime = current_time(from_dir);
+ } else {
+ warning("nikita-2326", "Unexpected item type");
+ result = RETERR(-EIO);
+ }
+ zrelse(node);
+ return result;
+}
+
+/* add new entry pointing to @inode into @dir at @coord, locked by @lh
+
+ Helper function used by hashed_rename(). */
+static int add_name(struct inode *inode, /* inode where @coord is to be
+ * re-targeted at */
+ struct inode *dir, /* directory where @coord lives */
+ struct dentry *name, /* new name */
+ coord_t *coord, /* where directory entry is in the tree
+ */
+ lock_handle * lh, /* lock handle on @coord */
+ int is_dir/* true, if @inode is directory */)
+{
+ int result;
+ reiser4_dir_entry_desc entry;
+
+ assert("nikita-2333", lh->node == coord->node);
+ assert("nikita-2334", is_dir == S_ISDIR(inode->i_mode));
+
+ memset(&entry, 0, sizeof entry);
+ entry.obj = inode;
+ /* build key of directory entry description */
+ inode_dir_plugin(dir)->build_entry_key(dir, &name->d_name, &entry.key);
+
+	/* ext2 does this in a different order: it first inserts the new
+	   entry, then increases the directory nlink. We don't want to do
+	   this, because reiser4_add_nlink() calls the ->add_link() plugin
+	   method, which can fail for whatever reason, leaving us with
+	   cleanup problems.
+ */
+ /* @inode is getting new name */
+ reiser4_add_nlink(inode, dir, 0);
+ /* create @new_name in @new_dir pointing to
+ @old_inode */
+ result = WITH_COORD(coord,
+ inode_dir_item_plugin(dir)->s.dir.add_entry(dir,
+ coord,
+ lh,
+ name,
+ &entry));
+ if (result != 0) {
+ int result2;
+ result2 = reiser4_del_nlink(inode, dir, 0);
+ if (result2 != 0) {
+ warning("nikita-2327",
+ "Cannot drop link on %lli %i. %s",
+ (unsigned long long)get_inode_oid(inode),
+ result2, possible_leak);
+ }
+ } else
+ INODE_INC_FIELD(dir, i_size);
+ return result;
+}
+
+static reiser4_block_nr estimate_rename(struct inode *old_dir, /* directory
+ * where @old is
+ * located */
+ struct dentry *old_name,/* old name */
+ struct inode *new_dir, /* directory
+ * where @new is
+ * located */
+ struct dentry *new_name /* new name */)
+{
+ reiser4_block_nr res1, res2;
+ dir_plugin * p_parent_old, *p_parent_new;
+ file_plugin * p_child_old, *p_child_new;
+
+ assert("vpf-311", old_dir != NULL);
+ assert("vpf-312", new_dir != NULL);
+ assert("vpf-313", old_name != NULL);
+ assert("vpf-314", new_name != NULL);
+
+ p_parent_old = inode_dir_plugin(old_dir);
+ p_parent_new = inode_dir_plugin(new_dir);
+ p_child_old = inode_file_plugin(old_name->d_inode);
+ if (new_name->d_inode)
+ p_child_new = inode_file_plugin(new_name->d_inode);
+ else
+ p_child_new = NULL;
+
+ /* find_entry - can insert one leaf. */
+ res1 = res2 = 1;
+
+ /* replace_name */
+ {
+ /* reiser4_add_nlink(p_child_old) and
+ * reiser4_del_nlink(p_child_old) */
+ res1 += 2 * p_child_old->estimate.update(old_name->d_inode);
+ /* update key */
+ res1 += 1;
+ /* reiser4_del_nlink(p_child_new) */
+ if (p_child_new)
+ res1 += p_child_new->estimate.update(new_name->d_inode);
+ }
+
+ /* else add_name */
+ {
+ /* reiser4_add_nlink(p_parent_new) and
+ * reiser4_del_nlink(p_parent_new) */
+ res2 +=
+ 2 * inode_file_plugin(new_dir)->estimate.update(new_dir);
+ /* reiser4_add_nlink(p_parent_old) */
+ res2 += p_child_old->estimate.update(old_name->d_inode);
+ /* add_entry(p_parent_new) */
+ res2 += p_parent_new->estimate.add_entry(new_dir);
+ /* reiser4_del_nlink(p_parent_old) */
+ res2 += p_child_old->estimate.update(old_name->d_inode);
+ }
+
+ res1 = res1 < res2 ? res2 : res1;
+
+ /* reiser4_write_sd(p_parent_new) */
+ res1 += inode_file_plugin(new_dir)->estimate.update(new_dir);
+
+ /* reiser4_write_sd(p_child_new) */
+ if (p_child_new)
+ res1 += p_child_new->estimate.update(new_name->d_inode);
+
+ /* hashed_rem_entry(p_parent_old) */
+ res1 += p_parent_old->estimate.rem_entry(old_dir);
+
+ /* reiser4_del_nlink(p_child_old) */
+ res1 += p_child_old->estimate.update(old_name->d_inode);
+
+ /* replace_name */
+ {
+ /* reiser4_add_nlink(p_parent_dir_new) */
+ res1 += inode_file_plugin(new_dir)->estimate.update(new_dir);
+ /* update_key */
+ res1 += 1;
+ /* reiser4_del_nlink(p_parent_new) */
+ res1 += inode_file_plugin(new_dir)->estimate.update(new_dir);
+ /* reiser4_del_nlink(p_parent_old) */
+ res1 += inode_file_plugin(old_dir)->estimate.update(old_dir);
+ }
+
+ /* reiser4_write_sd(p_parent_old) */
+ res1 += inode_file_plugin(old_dir)->estimate.update(old_dir);
+
+ /* reiser4_write_sd(p_child_old) */
+ res1 += p_child_old->estimate.update(old_name->d_inode);
+
+ return res1;
+}
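+
+/*
+ * The reservation takes the worse of the two mutually exclusive paths for
+ * the new name (replace_name() vs add_name()) and then adds the costs paid
+ * in either case: stat-data updates, removal of the old entry, the nlink
+ * drop on the renamed object and the possible dotdot re-targeting when a
+ * directory is moved. A worked example, assuming every estimate.update()
+ * is 1 block, add_entry() 2 and rem_entry() 2, with an existing target:
+ * the replace path needs 1+2+1+1 = 5, the add path 1+2+1+2+1 = 7, and the
+ * common tail adds 1+1+2+1+(1+1+1+1)+1+1 = 11, giving a grab of 18 blocks.
+ */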
+
+static int hashed_rename_estimate_and_grab(struct inode *old_dir, /* directory
+ * where @old
+ * is located
+ */
+ struct dentry *old_name,/* old name
+ */
+ struct inode *new_dir, /* directory
+ * where @new
+ * is located
+ */
+ struct dentry *new_name /* new name
+ */)
+{
+ reiser4_block_nr reserve;
+
+ reserve = estimate_rename(old_dir, old_name, new_dir, new_name);
+
+ if (reiser4_grab_space(reserve, BA_CAN_COMMIT, get_meta_subvol()))
+ return RETERR(-ENOSPC);
+ return 0;
+}
+
+/* check whether @old_inode and @new_inode can be moved within file system
+ * tree. This singles out attempts to rename pseudo-files, for example. */
+static int can_rename(struct inode *old_dir, struct inode *old_inode,
+ struct inode *new_dir, struct inode *new_inode)
+{
+ file_plugin *fplug;
+ dir_plugin *dplug;
+
+ assert("nikita-3370", old_inode != NULL);
+
+ dplug = inode_dir_plugin(new_dir);
+ fplug = inode_file_plugin(old_inode);
+
+ if (dplug == NULL)
+ return RETERR(-ENOTDIR);
+ else if (new_dir->i_op->create == NULL)
+ return RETERR(-EPERM);
+ else if (!fplug->can_add_link(old_inode))
+ return RETERR(-EMLINK);
+ else if (new_inode != NULL) {
+ fplug = inode_file_plugin(new_inode);
+ if (fplug->can_rem_link != NULL &&
+ !fplug->can_rem_link(new_inode))
+ return RETERR(-EBUSY);
+ }
+ return 0;
+}
+
+int reiser4_find_entry(struct inode *, struct dentry *, lock_handle * ,
+ znode_lock_mode, reiser4_dir_entry_desc *);
+int reiser4_update_dir(struct inode *);
+
+/* this is the common implementation of vfs's rename2 method of struct
+   inode_operations.
+   See comments in the body.
+
+   It is arguable that this function can be made generic, so that it
+   will be applicable to any kind of directory plugin that deals with
+   directories composed of directory entries. The only obstacle here
+   is that we don't have any data-type to represent a directory
+   entry. This should be reconsidered when more than one directory
+   plugin is implemented.
+*/
+int reiser4_rename2_common(struct inode *old_dir /* directory where @old
+ * is located */ ,
+ struct dentry *old_name /* old name */ ,
+ struct inode *new_dir /* directory where @new
+ * is located */ ,
+ struct dentry *new_name /* new name */ ,
+ unsigned flags /* specific flags */)
+{
+ /* From `The Open Group Base Specifications Issue 6'
+
+ If either the old or new argument names a symbolic link, rename()
+ shall operate on the symbolic link itself, and shall not resolve
+ the last component of the argument. If the old argument and the new
+ argument resolve to the same existing file, rename() shall return
+ successfully and perform no other action.
+
+ [this is done by VFS: vfs_rename()]
+
+ If the old argument points to the pathname of a file that is not a
+ directory, the new argument shall not point to the pathname of a
+ directory.
+
+ [checked by VFS: vfs_rename->may_delete()]
+
+ If the link named by the new argument exists, it shall
+ be removed and old renamed to new. In this case, a link named new
+ shall remain visible to other processes throughout the renaming
+ operation and refer either to the file referred to by new or old
+ before the operation began.
+
+ [we should assure this]
+
+ Write access permission is required for
+ both the directory containing old and the directory containing new.
+
+ [checked by VFS: vfs_rename->may_delete(), may_create()]
+
+ If the old argument points to the pathname of a directory, the new
+ argument shall not point to the pathname of a file that is not a
+ directory.
+
+ [checked by VFS: vfs_rename->may_delete()]
+
+ If the directory named by the new argument exists, it
+ shall be removed and old renamed to new. In this case, a link named
+ new shall exist throughout the renaming operation and shall refer
+ either to the directory referred to by new or old before the
+ operation began.
+
+ [we should assure this]
+
+ If new names an existing directory, it shall be
+ required to be an empty directory.
+
+ [we should check this]
+
+ If the old argument points to a pathname of a symbolic link, the
+ symbolic link shall be renamed. If the new argument points to a
+ pathname of a symbolic link, the symbolic link shall be removed.
+
+ The new pathname shall not contain a path prefix that names
+ old. Write access permission is required for the directory
+ containing old and the directory containing new. If the old
+ argument points to the pathname of a directory, write access
+ permission may be required for the directory named by old, and, if
+ it exists, the directory named by new.
+
+ [checked by VFS: vfs_rename(), vfs_rename_dir()]
+
+ If the link named by the new argument exists and the file's link
+ count becomes 0 when it is removed and no process has the file
+ open, the space occupied by the file shall be freed and the file
+ shall no longer be accessible. If one or more processes have the
+ file open when the last link is removed, the link shall be removed
+ before rename() returns, but the removal of the file contents shall
+ be postponed until all references to the file are closed.
+
+ [iput() handles this, but we can do this manually, a la
+ reiser4_unlink()]
+
+ Upon successful completion, rename() shall mark for update the
+ st_ctime and st_mtime fields of the parent directory of each file.
+
+ [N/A]
+
+ */
+
+ /* From Documentation/filesystems/vfs.txt:
+
+ rename2: this has an additional flags argument compared to rename.
+	   If no flags are supported by the filesystem then this method
+ need not be implemented. If some flags are supported then the
+ filesystem must return -EINVAL for any unsupported or unknown
+ flags. Currently the following flags are implemented:
+ (1) RENAME_NOREPLACE: this flag indicates that if the target
+ of the rename exists the rename should fail with -EEXIST
+ instead of replacing the target. The VFS already checks for
+ existence, so for local filesystems the RENAME_NOREPLACE
+ implementation is equivalent to plain rename.
+ (2) RENAME_EXCHANGE: exchange source and target. Both must
+ exist; this is checked by the VFS. Unlike plain rename,
+ source and target may be of different type.
+ */
+
+ static const unsigned supported_flags = RENAME_NOREPLACE;
+
+ reiser4_context *ctx;
+ int result;
+ int is_dir; /* is @old_name directory */
+
+ struct inode *old_inode;
+ struct inode *new_inode;
+ coord_t *new_coord;
+
+ struct reiser4_dentry_fsdata *new_fsdata;
+ dir_plugin *dplug;
+ file_plugin *fplug;
+
+ reiser4_dir_entry_desc *old_entry, *new_entry, *dotdot_entry;
+ lock_handle * new_lh, *dotdot_lh;
+ struct dentry *dotdot_name;
+ struct reiser4_dentry_fsdata *dataonstack;
+
+ ctx = reiser4_init_context(old_dir->i_sb);
+ if (IS_ERR(ctx))
+ return PTR_ERR(ctx);
+
+ /*
+ * Check rename2() flags.
+ *
+ * "If some flags are supported then the filesystem must return
+ * -EINVAL for any unsupported or unknown flags."
+ *
+ * We support:
+ * - RENAME_NOREPLACE (no-op)
+ */
+	if ((flags & supported_flags) != flags) {
+		context_set_commit_async(ctx);
+		reiser4_exit_context(ctx);
+		return RETERR(-EINVAL);
+	}
+
+ old_entry = kzalloc(3 * sizeof(*old_entry) + 2 * sizeof(*new_lh) +
+ sizeof(*dotdot_name) + sizeof(*dataonstack),
+ reiser4_ctx_gfp_mask_get());
+ if (!old_entry) {
+ context_set_commit_async(ctx);
+ reiser4_exit_context(ctx);
+ return RETERR(-ENOMEM);
+ }
+
+ new_entry = old_entry + 1;
+ dotdot_entry = old_entry + 2;
+ new_lh = (lock_handle *)(old_entry + 3);
+ dotdot_lh = new_lh + 1;
+ dotdot_name = (struct dentry *)(new_lh + 2);
+ dataonstack = (struct reiser4_dentry_fsdata *)(dotdot_name + 1);
+
+ assert("nikita-2318", old_dir != NULL);
+ assert("nikita-2319", new_dir != NULL);
+ assert("nikita-2320", old_name != NULL);
+ assert("nikita-2321", new_name != NULL);
+
+ old_inode = old_name->d_inode;
+ new_inode = new_name->d_inode;
+
+ dplug = inode_dir_plugin(old_dir);
+ fplug = NULL;
+
+ new_fsdata = reiser4_get_dentry_fsdata(new_name);
+ if (IS_ERR(new_fsdata)) {
+ kfree(old_entry);
+ context_set_commit_async(ctx);
+ reiser4_exit_context(ctx);
+ return PTR_ERR(new_fsdata);
+ }
+
+ new_coord = &new_fsdata->dec.entry_coord;
+ coord_clear_iplug(new_coord);
+
+ is_dir = S_ISDIR(old_inode->i_mode);
+
+ assert("nikita-3461", old_inode->i_nlink >= 1 + !!is_dir);
+
+	/* if the target is an existing directory and it's not empty, return
+	   an error.
+
+	   This check is done up front, because is_dir_empty() requires a
+	   tree traversal and has to be done before locks are taken.
+ */
+ if (is_dir && new_inode != NULL && is_dir_empty(new_inode) != 0) {
+ kfree(old_entry);
+ context_set_commit_async(ctx);
+ reiser4_exit_context(ctx);
+ return RETERR(-ENOTEMPTY);
+ }
+
+ result = can_rename(old_dir, old_inode, new_dir, new_inode);
+ if (result != 0) {
+ kfree(old_entry);
+ context_set_commit_async(ctx);
+ reiser4_exit_context(ctx);
+ return result;
+ }
+
+ result = hashed_rename_estimate_and_grab(old_dir, old_name,
+ new_dir, new_name);
+ if (result != 0) {
+ kfree(old_entry);
+ context_set_commit_async(ctx);
+ reiser4_exit_context(ctx);
+ return result;
+ }
+
+ init_lh(new_lh);
+
+ /* find entry for @new_name */
+ result = reiser4_find_entry(new_dir, new_name, new_lh, ZNODE_WRITE_LOCK,
+ new_entry);
+
+ if (IS_CBKERR(result)) {
+ done_lh(new_lh);
+ kfree(old_entry);
+ context_set_commit_async(ctx);
+ reiser4_exit_context(ctx);
+ return result;
+ }
+
+ reiser4_seal_done(&new_fsdata->dec.entry_seal);
+
+ /* add or replace name for @old_inode as @new_name */
+ if (new_inode != NULL) {
+ /* target (@new_name) exists. */
+ /* Not clear what to do with objects that are
+ both directories and files at the same time. */
+ if (result == CBK_COORD_FOUND) {
+ result = replace_name(old_inode,
+ new_dir,
+ new_inode, new_coord, new_lh);
+ if (result == 0)
+ fplug = inode_file_plugin(new_inode);
+ } else if (result == CBK_COORD_NOTFOUND) {
+			/* VFS told us that @new_name is bound to an existing
+			   inode, but we failed to find its directory entry. */
+ warning("nikita-2324", "Target not found");
+ result = RETERR(-ENOENT);
+ }
+ } else {
+		/* target (@new_name) doesn't exist. */
+ if (result == CBK_COORD_NOTFOUND)
+ result = add_name(old_inode,
+ new_dir,
+ new_name, new_coord, new_lh, is_dir);
+ else if (result == CBK_COORD_FOUND) {
+			/* VFS told us that @new_name is a "negative" dentry,
+			   but we found a directory entry. */
+ warning("nikita-2331", "Target found unexpectedly");
+ result = RETERR(-EIO);
+ }
+ }
+
+ assert("nikita-3462", ergo(result == 0,
+ old_inode->i_nlink >= 2 + !!is_dir));
+
+ /* We are done with all modifications to the @new_dir, release lock on
+ node. */
+ done_lh(new_lh);
+
+ if (fplug != NULL) {
+ /* detach @new_inode from name-space */
+ result = fplug->detach(new_inode, new_dir);
+ if (result != 0)
+ warning("nikita-2330", "Cannot detach %lli: %i. %s",
+ (unsigned long long)get_inode_oid(new_inode),
+ result, possible_leak);
+ }
+
+ if (new_inode != NULL)
+ reiser4_update_sd(new_inode);
+
+ if (result == 0) {
+ old_entry->obj = old_inode;
+
+ dplug->build_entry_key(old_dir,
+ &old_name->d_name, &old_entry->key);
+
+		/* At this stage the new name has been introduced for
+		   @old_inode. The i_nlink counters of @old_inode, @new_dir
+		   and @new_inode have been updated.
+
+		   We want to remove @old_name now. If @old_inode isn't a
+		   directory this is simple.
+ */
+ result = dplug->rem_entry(old_dir, old_name, old_entry);
+ if (result != 0 && result != -ENOMEM) {
+ warning("nikita-2335",
+ "Cannot remove old name: %i", result);
+ } else {
+ result = reiser4_del_nlink(old_inode, old_dir, 0);
+ if (result != 0 && result != -ENOMEM) {
+ warning("nikita-2337",
+ "Cannot drop link on old: %i", result);
+ }
+ }
+
+ if (result == 0 && is_dir) {
+			/* @old_inode is a directory. We also have to update
+			   the dotdot entry. */
+ coord_t *dotdot_coord;
+
+ memset(dataonstack, 0, sizeof(*dataonstack));
+ memset(dotdot_entry, 0, sizeof(*dotdot_entry));
+ dotdot_entry->obj = old_dir;
+ memset(dotdot_name, 0, sizeof(*dotdot_name));
+ dotdot_name->d_name.name = "..";
+ dotdot_name->d_name.len = 2;
+ /*
+ * allocate ->d_fsdata on the stack to avoid using
+ * reiser4_get_dentry_fsdata(). Locking is not needed,
+ * because dentry is private to the current thread.
+ */
+ dotdot_name->d_fsdata = dataonstack;
+ init_lh(dotdot_lh);
+
+ dotdot_coord = &dataonstack->dec.entry_coord;
+ coord_clear_iplug(dotdot_coord);
+
+ result = reiser4_find_entry(old_inode, dotdot_name,
+ dotdot_lh, ZNODE_WRITE_LOCK,
+ dotdot_entry);
+ if (result == 0) {
+ /* replace_name() decreases i_nlink on
+ * @old_dir */
+ result = replace_name(new_dir,
+ old_inode,
+ old_dir,
+ dotdot_coord, dotdot_lh);
+ } else
+ result = RETERR(-EIO);
+ done_lh(dotdot_lh);
+ }
+ }
+ reiser4_update_dir(new_dir);
+ reiser4_update_dir(old_dir);
+ reiser4_update_sd(old_inode);
+ if (result == 0) {
+ file_plugin *fplug;
+
+ if (new_inode != NULL) {
+			/* add safe-link for the target file (in case we
+			 * removed the last reference to the poor fellow) */
+ fplug = inode_file_plugin(new_inode);
+ if (new_inode->i_nlink == 0)
+ result = safe_link_add(new_inode, SAFE_UNLINK);
+ }
+ }
+ kfree(old_entry);
+ context_set_commit_async(ctx);
+ reiser4_exit_context(ctx);
+ return result;
+}
+
+#if 0
+int reiser4_rename_common(struct inode *old_dir /* directory where @old
+ * is located */ ,
+ struct dentry *old_name /* old name */ ,
+ struct inode *new_dir /* directory where @new
+ * is located */ ,
+ struct dentry *new_name/* new name */)
+{
+ /* From `The Open Group Base Specifications Issue 6'
+
+ If either the old or new argument names a symbolic link, rename()
+ shall operate on the symbolic link itself, and shall not resolve
+ the last component of the argument. If the old argument and the new
+ argument resolve to the same existing file, rename() shall return
+ successfully and perform no other action.
+
+ [this is done by VFS: vfs_rename()]
+
+ If the old argument points to the pathname of a file that is not a
+ directory, the new argument shall not point to the pathname of a
+ directory.
+
+ [checked by VFS: vfs_rename->may_delete()]
+
+ If the link named by the new argument exists, it shall
+ be removed and old renamed to new. In this case, a link named new
+ shall remain visible to other processes throughout the renaming
+ operation and refer either to the file referred to by new or old
+ before the operation began.
+
+ [we should assure this]
+
+ Write access permission is required for
+ both the directory containing old and the directory containing new.
+
+ [checked by VFS: vfs_rename->may_delete(), may_create()]
+
+ If the old argument points to the pathname of a directory, the new
+ argument shall not point to the pathname of a file that is not a
+ directory.
+
+ [checked by VFS: vfs_rename->may_delete()]
+
+ If the directory named by the new argument exists, it
+ shall be removed and old renamed to new. In this case, a link named
+ new shall exist throughout the renaming operation and shall refer
+ either to the directory referred to by new or old before the
+ operation began.
+
+ [we should assure this]
+
+ If new names an existing directory, it shall be
+ required to be an empty directory.
+
+ [we should check this]
+
+ If the old argument points to a pathname of a symbolic link, the
+ symbolic link shall be renamed. If the new argument points to a
+ pathname of a symbolic link, the symbolic link shall be removed.
+
+ The new pathname shall not contain a path prefix that names
+ old. Write access permission is required for the directory
+ containing old and the directory containing new. If the old
+ argument points to the pathname of a directory, write access
+ permission may be required for the directory named by old, and, if
+ it exists, the directory named by new.
+
+ [checked by VFS: vfs_rename(), vfs_rename_dir()]
+
+ If the link named by the new argument exists and the file's link
+ count becomes 0 when it is removed and no process has the file
+ open, the space occupied by the file shall be freed and the file
+ shall no longer be accessible. If one or more processes have the
+ file open when the last link is removed, the link shall be removed
+ before rename() returns, but the removal of the file contents shall
+ be postponed until all references to the file are closed.
+
+ [iput() handles this, but we can do this manually, a la
+ reiser4_unlink()]
+
+ Upon successful completion, rename() shall mark for update the
+ st_ctime and st_mtime fields of the parent directory of each file.
+
+ [N/A]
+
+ */
+ reiser4_context *ctx;
+ int result;
+ int is_dir; /* is @old_name directory */
+ struct inode *old_inode;
+ struct inode *new_inode;
+ reiser4_dir_entry_desc old_entry;
+ reiser4_dir_entry_desc new_entry;
+ coord_t *new_coord;
+ struct reiser4_dentry_fsdata *new_fsdata;
+ lock_handle new_lh;
+ dir_plugin *dplug;
+ file_plugin *fplug;
+
+ ctx = reiser4_init_context(old_dir->i_sb);
+ if (IS_ERR(ctx))
+ return PTR_ERR(ctx);
+
+ assert("nikita-2318", old_dir != NULL);
+ assert("nikita-2319", new_dir != NULL);
+ assert("nikita-2320", old_name != NULL);
+ assert("nikita-2321", new_name != NULL);
+
+ old_inode = old_name->d_inode;
+ new_inode = new_name->d_inode;
+
+ dplug = inode_dir_plugin(old_dir);
+ fplug = NULL;
+
+ new_fsdata = reiser4_get_dentry_fsdata(new_name);
+ if (IS_ERR(new_fsdata)) {
+ result = PTR_ERR(new_fsdata);
+ goto exit;
+ }
+
+ new_coord = &new_fsdata->dec.entry_coord;
+ coord_clear_iplug(new_coord);
+
+ is_dir = S_ISDIR(old_inode->i_mode);
+
+ assert("nikita-3461", old_inode->i_nlink >= 1 + !!is_dir);
+
+	/* if the target is an existing directory and it's not empty, return
+	   an error.
+
+	   This check is done up front, because is_dir_empty() requires a
+	   tree traversal and has to be done before locks are taken.
+ */
+ if (is_dir && new_inode != NULL && is_dir_empty(new_inode) != 0)
+ return RETERR(-ENOTEMPTY);
+
+ result = can_rename(old_dir, old_inode, new_dir, new_inode);
+ if (result != 0)
+ goto exit;
+
+ result = hashed_rename_estimate_and_grab(old_dir, old_name,
+ new_dir, new_name);
+ if (result != 0)
+ goto exit;
+
+ init_lh(&new_lh);
+
+ /* find entry for @new_name */
+ result = reiser4_find_entry(new_dir, new_name, &new_lh,
+ ZNODE_WRITE_LOCK, &new_entry);
+
+ if (IS_CBKERR(result)) {
+ done_lh(&new_lh);
+ goto exit;
+ }
+
+ reiser4_seal_done(&new_fsdata->dec.entry_seal);
+
+ /* add or replace name for @old_inode as @new_name */
+ if (new_inode != NULL) {
+ /* target (@new_name) exists. */
+ /* Not clear what to do with objects that are
+ both directories and files at the same time. */
+ if (result == CBK_COORD_FOUND) {
+ result = replace_name(old_inode,
+ new_dir,
+ new_inode, new_coord, &new_lh);
+ if (result == 0)
+ fplug = inode_file_plugin(new_inode);
+ } else if (result == CBK_COORD_NOTFOUND) {
+			/* VFS told us that @new_name is bound to an existing
+			   inode, but we failed to find its directory entry. */
+ warning("nikita-2324", "Target not found");
+ result = RETERR(-ENOENT);
+ }
+ } else {
+		/* target (@new_name) doesn't exist. */
+ if (result == CBK_COORD_NOTFOUND)
+ result = add_name(old_inode,
+ new_dir,
+ new_name, new_coord, &new_lh, is_dir);
+ else if (result == CBK_COORD_FOUND) {
+			/* VFS told us that @new_name is a "negative" dentry,
+			   but we found a directory entry. */
+ warning("nikita-2331", "Target found unexpectedly");
+ result = RETERR(-EIO);
+ }
+ }
+
+ assert("nikita-3462", ergo(result == 0,
+ old_inode->i_nlink >= 2 + !!is_dir));
+
+ /* We are done with all modifications to the @new_dir, release lock on
+ node. */
+ done_lh(&new_lh);
+
+ if (fplug != NULL) {
+ /* detach @new_inode from name-space */
+ result = fplug->detach(new_inode, new_dir);
+ if (result != 0)
+ warning("nikita-2330", "Cannot detach %lli: %i. %s",
+ (unsigned long long)get_inode_oid(new_inode),
+ result, possible_leak);
+ }
+
+ if (new_inode != NULL)
+ reiser4_update_sd(new_inode);
+
+ if (result == 0) {
+ memset(&old_entry, 0, sizeof old_entry);
+ old_entry.obj = old_inode;
+
+ dplug->build_entry_key(old_dir,
+ &old_name->d_name, &old_entry.key);
+
+		/* At this stage the new name has been introduced for
+		   @old_inode. The i_nlink counters of @old_inode, @new_dir
+		   and @new_inode have been updated.
+
+		   We want to remove @old_name now. If @old_inode isn't a
+		   directory this is simple.
+ */
+ result = dplug->rem_entry(old_dir, old_name, &old_entry);
+ /*result = rem_entry_hashed(old_dir, old_name, &old_entry); */
+ if (result != 0 && result != -ENOMEM) {
+ warning("nikita-2335",
+ "Cannot remove old name: %i", result);
+ } else {
+ result = reiser4_del_nlink(old_inode, old_dir, 0);
+ if (result != 0 && result != -ENOMEM) {
+ warning("nikita-2337",
+ "Cannot drop link on old: %i", result);
+ }
+ }
+
+ if (result == 0 && is_dir) {
+			/* @old_inode is a directory. We also have to update
+			   the dotdot entry. */
+ coord_t *dotdot_coord;
+ lock_handle dotdot_lh;
+ struct dentry dotdot_name;
+ reiser4_dir_entry_desc dotdot_entry;
+ struct reiser4_dentry_fsdata dataonstack;
+ struct reiser4_dentry_fsdata *fsdata;
+
+ memset(&dataonstack, 0, sizeof dataonstack);
+ memset(&dotdot_entry, 0, sizeof dotdot_entry);
+ dotdot_entry.obj = old_dir;
+ memset(&dotdot_name, 0, sizeof dotdot_name);
+ dotdot_name.d_name.name = "..";
+ dotdot_name.d_name.len = 2;
+ /*
+ * allocate ->d_fsdata on the stack to avoid using
+ * reiser4_get_dentry_fsdata(). Locking is not needed,
+ * because dentry is private to the current thread.
+ */
+ dotdot_name.d_fsdata = &dataonstack;
+ init_lh(&dotdot_lh);
+
+ fsdata = &dataonstack;
+ dotdot_coord = &fsdata->dec.entry_coord;
+ coord_clear_iplug(dotdot_coord);
+
+ result = reiser4_find_entry(old_inode,
+ &dotdot_name,
+ &dotdot_lh,
+ ZNODE_WRITE_LOCK,
+ &dotdot_entry);
+ if (result == 0) {
+ /* replace_name() decreases i_nlink on
+ * @old_dir */
+ result = replace_name(new_dir,
+ old_inode,
+ old_dir,
+ dotdot_coord, &dotdot_lh);
+ } else
+ result = RETERR(-EIO);
+ done_lh(&dotdot_lh);
+ }
+ }
+ reiser4_update_dir(new_dir);
+ reiser4_update_dir(old_dir);
+ reiser4_update_sd(old_inode);
+ if (result == 0) {
+ file_plugin *fplug;
+
+ if (new_inode != NULL) {
+			/* add safe-link for the target file (in case we
+			 * removed the last reference to the poor fellow) */
+ fplug = inode_file_plugin(new_inode);
+ if (new_inode->i_nlink == 0)
+ result = safe_link_add(new_inode, SAFE_UNLINK);
+ }
+ }
+exit:
+ context_set_commit_async(ctx);
+ reiser4_exit_context(ctx);
+ return result;
+}
+#endif
diff -urN --no-dereference linux-5.10.2.orig/fs/reiser4/plugin/item/acl.h linux-5.10.2/fs/reiser4/plugin/item/acl.h
--- linux-5.10.2.orig/fs/reiser4/plugin/item/acl.h 1970-01-01 01:00:00.000000000 +0100
+++ linux-5.10.2/fs/reiser4/plugin/item/acl.h 2020-12-23 16:07:46.126813246 +0100
@@ -0,0 +1,66 @@
+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
+
+/* Directory entry. */
+
+#if !defined( __FS_REISER4_PLUGIN_DIRECTORY_ENTRY_H__ )
+#define __FS_REISER4_PLUGIN_DIRECTORY_ENTRY_H__
+
+#include "../../forward.h"
+#include "../../dformat.h"
+#include "../../kassign.h"
+#include "../../key.h"
+
+#include <linux/fs.h>
+#include <linux/dcache.h> /* for struct dentry */
+
+typedef struct directory_entry_format {
+	/* key of object stat-data. It's not necessary to store the whole
+	   key here, because it's always a key of stat-data, so the minor
+	   packing locality and the offset could be omitted. But that would
+	   rely on a particular key allocation scheme for stat-data, so, for
+	   extensibility's sake, the whole key can be stored here.
+
+	   We store the key as an array of bytes, because we don't want
+	   8-byte alignment of directory entries.
+ */
+ obj_key_id id;
+ /* file name. Null terminated string. */
+ d8 name[0];
+} directory_entry_format;
+
+void print_de(const char *prefix, coord_t * coord);
+int extract_key_de(const coord_t * coord, reiser4_key * key);
+int update_key_de(const coord_t * coord, const reiser4_key * key,
+ lock_handle * lh);
+char *extract_name_de(const coord_t * coord, char *buf);
+unsigned extract_file_type_de(const coord_t * coord);
+int add_entry_de(struct inode *dir, coord_t * coord,
+ lock_handle * lh, const struct dentry *name,
+ reiser4_dir_entry_desc * entry);
+int rem_entry_de(struct inode *dir, const struct qstr *name, coord_t * coord,
+ lock_handle * lh, reiser4_dir_entry_desc * entry);
+int max_name_len_de(const struct inode *dir);
+
+int de_rem_and_shrink(struct inode *dir, coord_t * coord, int length);
+
+char *extract_dent_name(const coord_t * coord,
+ directory_entry_format * dent, char *buf);
+
+#if REISER4_LARGE_KEY
+#define DE_NAME_BUF_LEN (24)
+#else
+#define DE_NAME_BUF_LEN (16)
+#endif
+
+/* __FS_REISER4_PLUGIN_DIRECTORY_ENTRY_H__ */
+#endif
+
+/* Make Linus happy.
+ Local variables:
+ c-indentation-style: "K&R"
+ mode-name: "LC"
+ c-basic-offset: 8
+ tab-width: 8
+ fill-column: 120
+ End:
+*/
diff -urN --no-dereference linux-5.10.2.orig/fs/reiser4/plugin/item/blackbox.c linux-5.10.2/fs/reiser4/plugin/item/blackbox.c
--- linux-5.10.2.orig/fs/reiser4/plugin/item/blackbox.c 1970-01-01 01:00:00.000000000 +0100
+++ linux-5.10.2/fs/reiser4/plugin/item/blackbox.c 2020-12-23 16:07:46.126813246 +0100
@@ -0,0 +1,142 @@
+/* Copyright 2003 by Hans Reiser, licensing governed by
+ * reiser4/README */
+
+/* Black box item implementation */
+
+#include "../../forward.h"
+#include "../../debug.h"
+#include "../../dformat.h"
+#include "../../kassign.h"
+#include "../../coord.h"
+#include "../../tree.h"
+#include "../../lock.h"
+
+#include "blackbox.h"
+#include "item.h"
+#include "../plugin.h"
+
+int
+store_black_box(reiser4_tree * tree,
+ const reiser4_key * key, void *data, int length)
+{
+ int result;
+ reiser4_item_data idata;
+ coord_t coord;
+ lock_handle lh;
+
+ memset(&idata, 0, sizeof idata);
+
+ idata.data = data;
+ idata.user = 0;
+ idata.length = length;
+ idata.iplug = item_plugin_by_id(BLACK_BOX_ID);
+
+ init_lh(&lh);
+ result = insert_by_key(tree, key,
+ &idata, &coord, &lh, LEAF_LEVEL, CBK_UNIQUE);
+
+ assert("nikita-3413",
+ ergo(result == 0,
+ WITH_COORD(&coord,
+ item_length_by_coord(&coord) == length)));
+
+ done_lh(&lh);
+ return result;
+}
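+
+/*
+ * A hedged usage sketch: a black box is a small fixed-size blob addressed
+ * purely by key (safe-links are the main user in this patch). Storing one
+ * and reading it back might look like the following; the key construction
+ * helper below is an assumption for illustration only.
+ *
+ *	reiser4_key key;
+ *	d64 payload;
+ *
+ *	build_some_key(&key);
+ *	put_unaligned(cpu_to_le64(42), &payload);
+ *	if (store_black_box(tree, &key, &payload, sizeof payload) == 0) {
+ *		load_black_box(tree, &key, &payload, sizeof payload, 1);
+ *		kill_black_box(tree, &key);
+ *	}
+ */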
+
+int
+load_black_box(reiser4_tree * tree,
+ reiser4_key * key, void *data, int length, int exact)
+{
+ int result;
+ coord_t coord;
+ lock_handle lh;
+
+ init_lh(&lh);
+ result = coord_by_key(tree, key,
+ &coord, &lh, ZNODE_READ_LOCK,
+ exact ? FIND_EXACT : FIND_MAX_NOT_MORE_THAN,
+ LEAF_LEVEL, LEAF_LEVEL, CBK_UNIQUE, NULL);
+
+ if (result == 0) {
+ int ilen;
+
+ result = zload(coord.node);
+ if (result == 0) {
+ ilen = item_length_by_coord(&coord);
+ if (ilen <= length) {
+ memcpy(data, item_body_by_coord(&coord), ilen);
+ unit_key_by_coord(&coord, key);
+ } else if (exact) {
+ /*
+ * item is larger than buffer provided by the
+ * user. Only issue a warning if @exact is
+ * set. If @exact is false, we are iterating
+ * over all safe-links and here we are reaching
+ * the end of the iteration.
+ */
+ warning("nikita-3415",
+ "Wrong black box length: %i > %i",
+ ilen, length);
+ result = RETERR(-EIO);
+ }
+ zrelse(coord.node);
+ }
+ }
+
+ done_lh(&lh);
+ return result;
+
+}
+
+int
+update_black_box(reiser4_tree * tree,
+ const reiser4_key * key, void *data, int length)
+{
+ int result;
+ coord_t coord;
+ lock_handle lh;
+
+ init_lh(&lh);
+ result = coord_by_key(tree, key,
+ &coord, &lh, ZNODE_READ_LOCK,
+ FIND_EXACT,
+ LEAF_LEVEL, LEAF_LEVEL, CBK_UNIQUE, NULL);
+ if (result == 0) {
+ int ilen;
+
+ result = zload(coord.node);
+ if (result == 0) {
+ ilen = item_length_by_coord(&coord);
+ if (length <= ilen) {
+ memcpy(item_body_by_coord(&coord), data,
+ length);
+ } else {
+ warning("nikita-3437",
+ "Wrong black box length: %i < %i",
+ ilen, length);
+ result = RETERR(-EIO);
+ }
+ zrelse(coord.node);
+ }
+ }
+
+ done_lh(&lh);
+ return result;
+
+}
+
+int kill_black_box(reiser4_tree * tree, const reiser4_key * key)
+{
+ return reiser4_cut_tree(tree, key, key, NULL, 1);
+}
+
+/* Make Linus happy.
+ Local variables:
+ c-indentation-style: "K&R"
+ mode-name: "LC"
+ c-basic-offset: 8
+ tab-width: 8
+ fill-column: 120
+ End:
+*/
diff -urN --no-dereference linux-5.10.2.orig/fs/reiser4/plugin/item/blackbox.h linux-5.10.2/fs/reiser4/plugin/item/blackbox.h
--- linux-5.10.2.orig/fs/reiser4/plugin/item/blackbox.h 1970-01-01 01:00:00.000000000 +0100
+++ linux-5.10.2/fs/reiser4/plugin/item/blackbox.h 2020-12-23 16:07:46.126813246 +0100
@@ -0,0 +1,33 @@
+/* Copyright 2003 by Hans Reiser, licensing governed by
+ * reiser4/README */
+
+/* "Black box" item to contain fixed-width user supplied data */
+
+#if !defined( __FS_REISER4_BLACK_BOX_H__ )
+#define __FS_REISER4_BLACK_BOX_H__
+
+#include "../../forward.h"
+#include "../../dformat.h"
+#include "../../kassign.h"
+#include "../../key.h"
+
+extern int store_black_box(reiser4_tree * tree,
+ const reiser4_key * key, void *data, int length);
+extern int load_black_box(reiser4_tree * tree,
+ reiser4_key * key, void *data, int length, int exact);
+extern int kill_black_box(reiser4_tree * tree, const reiser4_key * key);
+extern int update_black_box(reiser4_tree * tree,
+ const reiser4_key * key, void *data, int length);
+
+/* __FS_REISER4_BLACK_BOX_H__ */
+#endif
+
+/* Make Linus happy.
+ Local variables:
+ c-indentation-style: "K&R"
+ mode-name: "LC"
+ c-basic-offset: 8
+ tab-width: 8
+ fill-column: 120
+ End:
+*/
diff -urN --no-dereference linux-5.10.2.orig/fs/reiser4/plugin/item/brick_symbol.c linux-5.10.2/fs/reiser4/plugin/item/brick_symbol.c
--- linux-5.10.2.orig/fs/reiser4/plugin/item/brick_symbol.c 1970-01-01 01:00:00.000000000 +0100
+++ linux-5.10.2/fs/reiser4/plugin/item/brick_symbol.c 2020-12-23 16:07:46.126813246 +0100
@@ -0,0 +1,175 @@
+/*
+ Copyright (c) 2019-2020 Eduard O. Shishkin
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+#include "../../forward.h"
+#include "../../debug.h"
+#include "../../dformat.h"
+#include "../../kassign.h"
+#include "../../coord.h"
+#include "../../tree.h"
+#include "../../lock.h"
+#include "../../super.h"
+
+#include "brick_symbol.h"
+#include "item.h"
+#include "../plugin.h"
+
+int store_brick_symbol(const reiser4_key *key, void *data, int length)
+{
+ int ret;
+ reiser4_item_data idata;
+ coord_t coord;
+ lock_handle lh;
+
+ memset(&idata, 0, sizeof idata);
+
+ idata.data = data;
+ idata.user = 0;
+ idata.length = length;
+ idata.iplug = item_plugin_by_id(BRICK_SYMBOL_ID);
+
+ init_lh(&lh);
+ ret = insert_by_key(meta_subvol_tree(), key,
+ &idata, &coord, &lh, LEAF_LEVEL, CBK_UNIQUE);
+ assert("edward-2296",
+ ergo(ret == 0,
+ WITH_COORD(&coord,
+ item_length_by_coord(&coord) == length)));
+ done_lh(&lh);
+ return ret;
+}
+
+int load_brick_symbol(const reiser4_key *key, void *data,
+ int length, int exact)
+{
+ int ret;
+ coord_t coord;
+ lock_handle lh;
+
+ init_lh(&lh);
+ ret = coord_by_key(meta_subvol_tree(), key,
+ &coord, &lh, ZNODE_READ_LOCK,
+ exact ? FIND_EXACT : FIND_MAX_NOT_MORE_THAN,
+ LEAF_LEVEL, LEAF_LEVEL, CBK_UNIQUE, NULL);
+ if (ret == 0) {
+ ret = zload(coord.node);
+ if (ret == 0) {
+ int ilen = item_length_by_coord(&coord);
+ if (ilen == length)
+ memcpy(data, item_body_by_coord(&coord), ilen);
+ else {
+ warning("edward-2297",
+ "Wrong brick symbol length: %i != %i",
+ ilen, length);
+ ret = RETERR(-EIO);
+ }
+			zrelse(coord.node);
+		}
+	}
+ done_lh(&lh);
+ return ret;
+}
+
+int kill_brick_symbol(const reiser4_key *key)
+{
+ return reiser4_cut_tree(meta_subvol_tree(), key, key, NULL, 1);
+}
+
+typedef struct brick_symbol {
+ d64 id; /* internal brick ID, AKA index in the array of slots */
+} brick_symbol_t;
+
+
+static oid_t brick_symbol_locality(void)
+{
+ return get_key_objectid(get_meta_subvol()->df_plug->
+ root_dir_key(NULL)) + 2;
+}
+
+/**
+ * convert the first 8 bytes of the brick's UUID to a 64-bit number
+ */
+static u64 brick_symbol_fulloid(reiser4_subvol *subv)
+{
+ return le64_to_cpu(get_unaligned((u64 *)subv->uuid));
+}
+
+/**
+ * convert the second 8 bytes of the brick's UUID to a 64-bit number
+ */
+static u64 brick_symbol_offset(reiser4_subvol *subv)
+{
+ return le64_to_cpu(get_unaligned((u64 *)&subv->uuid[8]));
+}
+
+/*
+ Construct a key for brick symbol item. Key has the following format:
+
+| 60 | 4 | 64 | 64 | 64 |
++---------------+---+------------------+-------------------+-------------------+
+| locality | 0 | 0 | 1-st part of uuid | 2-nd part of uuid |
++---------------+---+------------------+-------------------+-------------------+
+| | | | |
+| 8 bytes | 8 bytes | 8 bytes | 8 bytes |
+
+ This is the large keys format. In the small keys format the second 8-byte
+ chunk is omitted. Locality is a constant returned by brick_symbol_locality().
+ UUID is the external ID of the brick for which we construct the key.
+*/
+
+static reiser4_key *build_brick_symbol_key(reiser4_key *key,
+ reiser4_subvol *subv)
+{
+ reiser4_key_init(key);
+ set_key_locality(key, brick_symbol_locality());
+ set_key_fulloid(key, brick_symbol_fulloid(subv));
+ set_key_offset(key, brick_symbol_offset(subv));
+ return key;
+}
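+
+/*
+ * A worked example of the mapping above, assuming uuid[] holds bytes
+ * b0..b15: brick_symbol_fulloid() interprets b0..b7 as a little-endian
+ * 64-bit number and places it in the objectid part of the key, while
+ * brick_symbol_offset() does the same with b8..b15 for the offset part,
+ * so any two bricks that differ in at least one uuid byte get distinct
+ * keys under the common brick_symbol_locality().
+ */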
+
+int brick_symbol_add(reiser4_subvol *subv)
+{
+ reiser4_key key;
+ brick_symbol_t bs;
+
+ put_unaligned(cpu_to_le64(subv->id), &bs.id);
+ build_brick_symbol_key(&key, subv);
+
+ return store_brick_symbol(&key, &bs, sizeof bs);
+}
+
+int brick_symbol_del(reiser4_subvol *subv)
+{
+ reiser4_key key;
+
+ return kill_brick_symbol(build_brick_symbol_key(&key, subv));
+}
+
+int brick_identify(reiser4_subvol *subv)
+{
+ int ret;
+ reiser4_key key;
+ brick_symbol_t bs;
+
+ ret = load_brick_symbol(build_brick_symbol_key(&key, subv),
+ &bs, sizeof bs, 1 /* exact */);
+ if (ret)
+ return 0;
+ return bs.id == subv->id;
+}
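+
+/*
+ * A minimal sketch of the intended use, assuming a caller that checks
+ * bricks at volume activation time: brick_symbol_add() is issued once when
+ * a brick is registered, and brick_identify() later verifies that the id
+ * stored under the brick's UUID still matches the slot it was plugged into.
+ * The warning label below is a placeholder, not an existing one.
+ *
+ *	if (!brick_identify(subv))
+ *		warning("edward-xxxx",
+ *			"brick %llu failed identification",
+ *			(unsigned long long)subv->id);
+ */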
+
+/*
+ Local variables:
+ c-indentation-style: "K&R"
+ mode-name: "LC"
+ c-basic-offset: 8
+ tab-width: 8
+ fill-column: 80
+ End:
+*/
diff -urN --no-dereference linux-5.10.2.orig/fs/reiser4/plugin/item/brick_symbol.h linux-5.10.2/fs/reiser4/plugin/item/brick_symbol.h
--- linux-5.10.2.orig/fs/reiser4/plugin/item/brick_symbol.h 1970-01-01 01:00:00.000000000 +0100
+++ linux-5.10.2/fs/reiser4/plugin/item/brick_symbol.h 2020-12-23 16:07:46.126813246 +0100
@@ -0,0 +1,30 @@
+/* "Brick symbol" contains internal and external brick IDs */
+
+#if !defined( __FS_REISER4_BRICK_SYMBOL_H__ )
+#define __FS_REISER4_BRICK_SYMBOL_H__
+
+#include "../../forward.h"
+#include "../../dformat.h"
+#include "../../kassign.h"
+#include "../../key.h"
+
+extern int store_brick_symbol(const reiser4_key *key, void *data, int len);
+extern int load_brick_symbol(const reiser4_key *key, void *data,
+ int len, int exact);
+extern int kill_brick_symbol(const reiser4_key *key);
+extern int brick_symbol_add(reiser4_subvol *subv);
+extern int brick_symbol_del(reiser4_subvol *subv);
+extern int brick_identify(reiser4_subvol *subv);
+
+/* __FS_REISER4_BRICK_SYMBOL_H__ */
+#endif
+
+/* Make Linus happy.
+ Local variables:
+ c-indentation-style: "K&R"
+ mode-name: "LC"
+ c-basic-offset: 8
+ tab-width: 8
+ fill-column: 120
+ End:
+*/
diff -urN --no-dereference linux-5.10.2.orig/fs/reiser4/plugin/item/cde.c linux-5.10.2/fs/reiser4/plugin/item/cde.c
--- linux-5.10.2.orig/fs/reiser4/plugin/item/cde.c 1970-01-01 01:00:00.000000000 +0100
+++ linux-5.10.2/fs/reiser4/plugin/item/cde.c 2020-12-23 16:07:46.126813246 +0100
@@ -0,0 +1,1005 @@
+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
+
+/* Directory entry implementation */
+
+/* DESCRIPTION:
+
+ This is "compound" directory item plugin implementation. This directory
+ item type is compound (as opposed to the "simple directory item" in
+ fs/reiser4/plugin/item/sde.[ch]), because it consists of several directory
+ entries.
+
+ The reason behind this decision is disk space efficiency: all directory
+ entries inside the same directory have identical fragment in their
+ keys. This, of course, depends on key assignment policy. In our default key
+ assignment policy, all directory entries have the same locality which is
+ equal to the object id of their directory.
+
+ Composing directory item out of several directory entries for the same
+ directory allows us to store said key fragment only once. That is, this is
+ some ad hoc form of key compression (stem compression) that is implemented
+ here, because general key compression is not supposed to be implemented in
+ v4.0.
+
+   Another decision that was made regarding all directory item plugins is
+   that they will store entry keys unaligned. This is for the sake of disk
+   space efficiency again.
+
+   It should be noted that storing keys unaligned increases CPU consumption,
+   at least on some architectures.
+
+ Internal on-disk structure of the compound directory item is the following:
+
+ HEADER cde_item_format. Here number of entries is stored.
+ ENTRY_HEADER_0 cde_unit_header. Here part of entry key and
+ ENTRY_HEADER_1 offset of entry body are stored.
+ ENTRY_HEADER_2 (basically two last parts of key)
+ ...
+ ENTRY_HEADER_N
+ ENTRY_BODY_0 directory_entry_format. Here part of stat data key and
+ ENTRY_BODY_1 NUL-terminated name are stored.
+   ENTRY_BODY_2				(part of the stat-data key in the
+				sense that, since all SDs have
+ zero offset, this offset is not
+ stored on disk).
+ ...
+ ENTRY_BODY_N
+
+   When it comes to balancing, each directory entry in a compound directory
+   item is a unit, that is, something that can be cut from one item and pasted
+   into another item of the same type. Handling of unit cut and paste is the
+   major reason for the complexity of the code below.
+
+*/
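+
+/*
+ * A worked example of the layout described above, assuming a two-entry
+ * item: the item header stores num_of_entries = 2 and is followed by two
+ * cde_unit_header-s whose offset fields point at the two
+ * directory_entry_format bodies packed after them, e.g.
+ *
+ *	cde_item_format      num_of_entries = 2
+ *	cde_unit_header 0    hash of "bar", offset of body 0
+ *	cde_unit_header 1    hash of "foo", offset of body 1
+ *	body 0               stat-data key id, "bar\0"
+ *	body 1               stat-data key id, "foo\0"
+ *
+ * The exact byte offsets depend on the key format (REISER4_LARGE_KEY) and
+ * on the name lengths, so they are not shown here.
+ */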
+
+#include "../../forward.h"
+#include "../../debug.h"
+#include "../../dformat.h"
+#include "../../kassign.h"
+#include "../../key.h"
+#include "../../coord.h"
+#include "sde.h"
+#include "cde.h"
+#include "item.h"
+#include "../node/node.h"
+#include "../plugin.h"
+#include "../../znode.h"
+#include "../../carry.h"
+#include "../../tree.h"
+#include "../../inode.h"
+
+#include <linux/fs.h> /* for struct inode */
+#include <linux/dcache.h> /* for struct dentry */
+
+#if 0
+#define CHECKME(coord) \
+({ \
+ const char *message; \
+ coord_t dup; \
+ \
+ coord_dup_nocheck(&dup, (coord)); \
+ dup.unit_pos = 0; \
+ assert("nikita-2871", cde_check(&dup, &message) == 0); \
+})
+#else
+#define CHECKME(coord) noop
+#endif
+
+static_assert(REISER4_SEQ_SEARCH_BREAK > 2);
+
+/* return body of compound directory item at @coord */
+static inline cde_item_format *formatted_at(const coord_t * coord)
+{
+ assert("nikita-1282", coord != NULL);
+ return item_body_by_coord(coord);
+}
+
+/* return entry header at @coord */
+static inline cde_unit_header *header_at(const coord_t *
+ coord /* coord of item */ ,
+ int idx /* index of unit */ )
+{
+ assert("nikita-1283", coord != NULL);
+ return &formatted_at(coord)->entry[idx];
+}
+
+/* return number of units in compound directory item at @coord */
+static int units(const coord_t * coord /* coord of item */ )
+{
+ return le16_to_cpu(get_unaligned(&formatted_at(coord)->num_of_entries));
+}
+
+/* return offset of the body of @idx-th entry in @coord */
+static unsigned int offset_of(const coord_t * coord /* coord of item */ ,
+ int idx /* index of unit */ )
+{
+ if (idx < units(coord))
+ return le16_to_cpu(get_unaligned(&header_at(coord, idx)->offset));
+ else if (idx == units(coord))
+ return item_length_by_coord(coord);
+ else
+ impossible("nikita-1308", "Wrong idx");
+ return 0;
+}
+
+/* set offset of the body of @idx-th entry in @coord */
+static void set_offset(const coord_t * coord /* coord of item */ ,
+ int idx /* index of unit */ ,
+ unsigned int offset /* new offset */ )
+{
+ put_unaligned(cpu_to_le16((__u16) offset), &header_at(coord, idx)->offset);
+}
+
+static void adj_offset(const coord_t * coord /* coord of item */ ,
+ int idx /* index of unit */ ,
+ int delta /* offset change */ )
+{
+ d16 *doffset;
+ __u16 offset;
+
+ doffset = &header_at(coord, idx)->offset;
+ offset = le16_to_cpu(get_unaligned(doffset));
+ offset += delta;
+ put_unaligned(cpu_to_le16((__u16) offset), doffset);
+}
+
+/* return pointer to @offset-th byte from the beginning of @coord */
+static char *address(const coord_t * coord /* coord of item */ ,
+ int offset)
+{
+ return ((char *)item_body_by_coord(coord)) + offset;
+}
+
+/* return pointer to the body of @idx-th entry in @coord */
+static directory_entry_format *entry_at(const coord_t * coord /* coord of
+ * item */ ,
+ int idx /* index of unit */ )
+{
+ return (directory_entry_format *) address(coord,
+ (int)offset_of(coord, idx));
+}
+
+/* return number of unit referenced by @coord */
+static int idx_of(const coord_t * coord /* coord of item */ )
+{
+ assert("nikita-1285", coord != NULL);
+ return coord->unit_pos;
+}
+
+/* find position where entry with @entry_key would be inserted into @coord */
+static int find(const coord_t * coord /* coord of item */ ,
+ const reiser4_key * entry_key /* key to look for */ ,
+ cmp_t * last /* result of last comparison */ )
+{
+ int entries;
+
+ int left;
+ int right;
+
+ cde_unit_header *header;
+
+ assert("nikita-1295", coord != NULL);
+ assert("nikita-1296", entry_key != NULL);
+ assert("nikita-1297", last != NULL);
+
+ entries = units(coord);
+ left = 0;
+ right = entries - 1;
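+	/*
+	 * Hybrid search: narrow [left, right] by binary search while the
+	 * interval is at least REISER4_SEQ_SEARCH_BREAK units wide, then
+	 * finish with the prefetching linear scan below.
+	 */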
+ while (right - left >= REISER4_SEQ_SEARCH_BREAK) {
+ int median;
+
+ median = (left + right) >> 1;
+
+ header = header_at(coord, median);
+ *last = de_id_key_cmp(&header->hash, entry_key);
+ switch (*last) {
+ case LESS_THAN:
+ left = median;
+ break;
+ case GREATER_THAN:
+ right = median;
+ break;
+ case EQUAL_TO:{
+ do {
+ median--;
+ header--;
+ } while (median >= 0 &&
+ de_id_key_cmp(&header->hash,
+ entry_key) == EQUAL_TO);
+ return median + 1;
+ }
+ }
+ }
+ header = header_at(coord, left);
+ for (; left < entries; ++left, ++header) {
+ prefetch(header + 1);
+ *last = de_id_key_cmp(&header->hash, entry_key);
+ if (*last != LESS_THAN)
+ break;
+ }
+ if (left < entries)
+ return left;
+ else
+ return RETERR(-ENOENT);
+
+}
+
+/* expand @coord so as to accommodate insertion of @no new entries starting
+ from @pos, with total body size @size. */
+static int expand_item(const coord_t * coord /* coord of item */ ,
+ int pos /* unit position */ , int no /* number of new
+ * units*/ ,
+ int size /* total size of new units' data */ ,
+ unsigned int data_size /* free space already reserved
+ * in the item for insertion */ )
+{
+ int entries;
+ cde_unit_header *header;
+ char *dent;
+ int i;
+
+ assert("nikita-1310", coord != NULL);
+ assert("nikita-1311", pos >= 0);
+ assert("nikita-1312", no > 0);
+ assert("nikita-1313", data_size >= no * sizeof(directory_entry_format));
+ assert("nikita-1343",
+ item_length_by_coord(coord) >=
+ (int)(size + data_size + no * sizeof *header));
+
+ entries = units(coord);
+
+ if (pos == entries)
+ dent = address(coord, size);
+ else
+ dent = (char *)entry_at(coord, pos);
+ /* place where new header will be in */
+ header = header_at(coord, pos);
+ /* free space for new entry headers */
+ memmove(header + no, header,
+ (unsigned)(address(coord, size) - (char *)header));
+ /* if adding to the end initialise first new header */
+ if (pos == entries) {
+ set_offset(coord, pos, (unsigned)size);
+ }
+
+ /* adjust entry pointer and size */
+ dent = dent + no * sizeof *header;
+ size += no * sizeof *header;
+ /* free space for new entries */
+ memmove(dent + data_size, dent,
+ (unsigned)(address(coord, size) - dent));
+
+ /* increase counter */
+ entries += no;
+ put_unaligned(cpu_to_le16((__u16) entries), &formatted_at(coord)->num_of_entries);
+
+ /* [ 0 ... pos ] entries were shifted by no * ( sizeof *header )
+ bytes. */
+ for (i = 0; i <= pos; ++i)
+ adj_offset(coord, i, no * sizeof *header);
+ /* [ pos + no ... +\infty ) entries were shifted by ( no *
+ sizeof *header + data_size ) bytes */
+ for (i = pos + no; i < entries; ++i)
+ adj_offset(coord, i, no * sizeof *header + data_size);
+ return 0;
+}
+
+/* insert new @entry into item */
+static int expand(const coord_t * coord /* coord of item */ ,
+ struct cde_entry * entry /* entry to insert */ ,
+ int len /* length of @entry data */ ,
+ int *pos /* position to insert */ ,
+ reiser4_dir_entry_desc * dir_entry /* parameters for new
+ * entry */ )
+{
+ cmp_t cmp_res;
+ int datasize;
+
+ *pos = find(coord, &dir_entry->key, &cmp_res);
+ if (*pos < 0)
+ *pos = units(coord);
+
+ datasize = sizeof(directory_entry_format);
+ if (is_longname(entry->name->name, entry->name->len))
+ datasize += entry->name->len + 1;
+
+ expand_item(coord, *pos, 1, item_length_by_coord(coord) - len,
+ datasize);
+ return 0;
+}
+
+/* paste body of @entry into item */
+static int paste_entry(const coord_t * coord /* coord of item */ ,
+ struct cde_entry * entry /* new entry */ ,
+ int pos /* position to insert */ ,
+ reiser4_dir_entry_desc * dir_entry /* parameters for
+ * new entry */ )
+{
+ cde_unit_header *header;
+ directory_entry_format *dent;
+ const char *name;
+ int len;
+
+ header = header_at(coord, pos);
+ dent = entry_at(coord, pos);
+
+ build_de_id_by_key(&dir_entry->key, &header->hash);
+ build_inode_key_id(entry->obj, &dent->id);
+ /* AUDIT unsafe strcpy() operation! It should be replaced with
+ much less CPU hungry
+ memcpy( ( char * ) dent -> name, entry -> name -> name , entry -> name -> len );
+
+ A more important issue is that there should be a way to determine
+ the amount of space available in dent -> name, so that we can check
+ that we are not going to overwrite more than we are supposed to */
+ name = entry->name->name;
+ len = entry->name->len;
+ if (is_longname(name, len)) {
+ strcpy((unsigned char *)dent->name, name);
+ put_unaligned(0, &dent->name[len]);
+ }
+ return 0;
+}
+
+/* estimate how much space is necessary in item to insert/paste set of entries
+ described in @data. */
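+/* For instance (informal restatement of the computation below): pasting a
+ single entry into an existing item costs sizeof(cde_unit_header) +
+ sizeof(directory_entry_format) bytes, plus name->len + 1 bytes when the
+ name is long (is_longname()); creating a new item additionally costs
+ sizeof(cde_item_format). */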
+int estimate_cde(const coord_t * coord /* coord of item */ ,
+ const reiser4_item_data * data /* parameters for new item */ )
+{
+ struct cde_entry_data *e;
+ int result;
+ int i;
+
+ e = (struct cde_entry_data *) data->data;
+
+ assert("nikita-1288", e != NULL);
+ assert("nikita-1289", e->num_of_entries >= 0);
+
+ if (coord == NULL)
+ /* insert */
+ result = sizeof(cde_item_format);
+ else
+ /* paste */
+ result = 0;
+
+ result += e->num_of_entries *
+ (sizeof(cde_unit_header) + sizeof(directory_entry_format));
+ for (i = 0; i < e->num_of_entries; ++i) {
+ const char *name;
+ int len;
+
+ name = e->entry[i].name->name;
+ len = e->entry[i].name->len;
+ assert("nikita-2054", strlen(name) == len);
+ if (is_longname(name, len))
+ result += len + 1;
+ }
+ ((reiser4_item_data *) data)->length = result;
+ return result;
+}
+
+/* ->nr_units() method for this item plugin. */
+pos_in_node_t nr_units_cde(const coord_t * coord /* coord of item */ )
+{
+ return units(coord);
+}
+
+/* ->unit_key() method for this item plugin. */
+reiser4_key *unit_key_cde(const coord_t * coord /* coord of item */ ,
+ reiser4_key * key /* resulting key */ )
+{
+ assert("nikita-1452", coord != NULL);
+ assert("nikita-1345", idx_of(coord) < units(coord));
+ assert("nikita-1346", key != NULL);
+
+ item_key_by_coord(coord, key);
+ extract_key_from_de_id(extract_dir_id_from_key(key),
+ &header_at(coord, idx_of(coord))->hash, key);
+ return key;
+}
+
+/* mergeable_cde(): implementation of ->mergeable() item method.
+
+ Two directory items are mergeable iff they are from the same
+ directory. That simple.
+
+*/
+int mergeable_cde(const coord_t * p1 /* coord of first item */ ,
+ const coord_t * p2 /* coord of second item */ )
+{
+ reiser4_key k1;
+ reiser4_key k2;
+
+ assert("nikita-1339", p1 != NULL);
+ assert("nikita-1340", p2 != NULL);
+
+ return
+ (item_plugin_by_coord(p1) == item_plugin_by_coord(p2)) &&
+ (extract_dir_id_from_key(item_key_by_coord(p1, &k1)) ==
+ extract_dir_id_from_key(item_key_by_coord(p2, &k2)));
+
+}
+
+/* ->max_key_inside() method for this item plugin. */
+reiser4_key *max_key_inside_cde(const coord_t * coord /* coord of item */ ,
+ reiser4_key * result /* resulting key */ )
+{
+ assert("nikita-1342", coord != NULL);
+
+ item_key_by_coord(coord, result);
+ set_key_ordering(result, get_key_ordering(reiser4_max_key()));
+ set_key_fulloid(result, get_key_fulloid(reiser4_max_key()));
+ set_key_offset(result, get_key_offset(reiser4_max_key()));
+ return result;
+}
+
+/* @data contains data which are to be put into tree */
+int can_contain_key_cde(const coord_t * coord /* coord of item */ ,
+ const reiser4_key * key /* key to check */ ,
+ const reiser4_item_data * data /* parameters of new
+ * item/unit being
+ * created */ )
+{
+ reiser4_key item_key;
+
+ /* FIXME-VS: do not rely on anything but iplug field of @data. Only
+ data->iplug is initialized */
+ assert("vs-457", data && data->iplug);
+/* assert( "vs-553", data -> user == 0 );*/
+ item_key_by_coord(coord, &item_key);
+
+ return (item_plugin_by_coord(coord) == data->iplug) &&
+ (extract_dir_id_from_key(&item_key) ==
+ extract_dir_id_from_key(key));
+}
+
+#if REISER4_DEBUG
+/* cde_check ->check() method for compressed directory items
+
+ used for debugging; every item plugin should provide here the most
+ complete consistency check of the item that its author can construct
+*/
+int reiser4_check_cde(const coord_t * coord /* coord of item to check */,
+ const char **error /* where to store error message */)
+{
+ int i;
+ int result;
+ char *item_start;
+ char *item_end;
+ reiser4_key key;
+
+ coord_t c;
+
+ assert("nikita-1357", coord != NULL);
+ assert("nikita-1358", error != NULL);
+
+ if (!ergo(coord->item_pos != 0,
+ is_dot_key(item_key_by_coord(coord, &key)))) {
+ *error = "CDE doesn't start with dot";
+ return -1;
+ }
+ item_start = item_body_by_coord(coord);
+ item_end = item_start + item_length_by_coord(coord);
+
+ coord_dup(&c, coord);
+ result = 0;
+ for (i = 0; i < units(coord); ++i) {
+ directory_entry_format *entry;
+
+ if ((char *)(header_at(coord, i) + 1) >
+ item_end - units(coord) * sizeof *entry) {
+ *error = "CDE header is out of bounds";
+ result = -1;
+ break;
+ }
+ entry = entry_at(coord, i);
+ if ((char *)entry < item_start + sizeof(cde_item_format)) {
+ *error = "CDE header is too low";
+ result = -1;
+ break;
+ }
+ if ((char *)(entry + 1) > item_end) {
+ *error = "CDE header is too high";
+ result = -1;
+ break;
+ }
+ }
+
+ return result;
+}
+#endif
+
+/* ->init() method for this item plugin. */
+int init_cde(coord_t * coord /* coord of item */ ,
+ coord_t * from UNUSED_ARG, reiser4_item_data * data /* structure used for insertion */
+ UNUSED_ARG)
+{
+ put_unaligned(cpu_to_le16(0), &formatted_at(coord)->num_of_entries);
+ return 0;
+}
+
+/* ->lookup() method for this item plugin. */
+lookup_result lookup_cde(const reiser4_key * key /* key to search for */ ,
+ lookup_bias bias /* search bias */ ,
+ coord_t * coord /* coord of item to lookup in */ )
+{
+ cmp_t last_comp;
+ int pos;
+
+ reiser4_key utmost_key;
+
+ assert("nikita-1293", coord != NULL);
+ assert("nikita-1294", key != NULL);
+
+ CHECKME(coord);
+
+ if (keygt(item_key_by_coord(coord, &utmost_key), key)) {
+ coord->unit_pos = 0;
+ coord->between = BEFORE_UNIT;
+ return CBK_COORD_NOTFOUND;
+ }
+ pos = find(coord, key, &last_comp);
+ if (pos >= 0) {
+ coord->unit_pos = (int)pos;
+ switch (last_comp) {
+ case EQUAL_TO:
+ coord->between = AT_UNIT;
+ return CBK_COORD_FOUND;
+ case GREATER_THAN:
+ coord->between = BEFORE_UNIT;
+ return RETERR(-ENOENT);
+ case LESS_THAN:
+ default:
+ impossible("nikita-1298", "Broken find");
+ return RETERR(-EIO);
+ }
+ } else {
+ coord->unit_pos = units(coord) - 1;
+ coord->between = AFTER_UNIT;
+ return (bias ==
+ FIND_MAX_NOT_MORE_THAN) ? CBK_COORD_FOUND :
+ CBK_COORD_NOTFOUND;
+ }
+}
+
+/* ->paste() method for this item plugin. */
+int paste_cde(coord_t * coord /* coord of item */ ,
+ reiser4_item_data * data /* parameters of new unit being
+ * inserted */ ,
+ carry_plugin_info * info UNUSED_ARG /* todo carry queue */ )
+{
+ struct cde_entry_data *e;
+ int result;
+ int i;
+
+ CHECKME(coord);
+ e = (struct cde_entry_data *) data->data;
+
+ result = 0;
+ for (i = 0; i < e->num_of_entries; ++i) {
+ int pos;
+ int phantom_size;
+
+ phantom_size = data->length;
+ if (units(coord) == 0)
+ phantom_size -= sizeof(cde_item_format);
+
+ result =
+ expand(coord, e->entry + i, phantom_size, &pos, data->arg);
+ if (result != 0)
+ break;
+ result = paste_entry(coord, e->entry + i, pos, data->arg);
+ if (result != 0)
+ break;
+ }
+ CHECKME(coord);
+ return result;
+}
+
+/* amount of space occupied by all entries up to and including @idx, both
+ headers and bodies. */
+static unsigned int part_size(const coord_t * coord /* coord of item */ ,
+ int idx /* index of unit */ )
+{
+ assert("nikita-1299", coord != NULL);
+ assert("nikita-1300", idx < (int)units(coord));
+
+ return sizeof(cde_item_format) +
+ (idx + 1) * sizeof(cde_unit_header) + offset_of(coord,
+ idx + 1) -
+ offset_of(coord, 0);
+}
+
+/* how many units of @source (but not more than @want) can be merged with
+ the item in the @target node. If pend == append, we try to append the last
+ item of @target with the first units of @source. If pend == prepend, we try
+ to "prepend" the first item in @target with the last units of @source. The
+ @target node has @free_space bytes of free space. The total size of those
+ units is returned via @size */
+int can_shift_cde(unsigned free_space /* free space in item */ ,
+ coord_t * coord /* coord of source item */ ,
+ znode * target /* target node */ ,
+ shift_direction pend /* shift direction */ ,
+ unsigned *size /* resulting number of shifted bytes */ ,
+ unsigned want /* maximal number of bytes to shift */ )
+{
+ int shift;
+
+ CHECKME(coord);
+ if (want == 0) {
+ *size = 0;
+ return 0;
+ }
+
+ /* pend == SHIFT_LEFT <==> shifting to the left */
+ if (pend == SHIFT_LEFT) {
+ for (shift = min((int)want - 1, units(coord)); shift >= 0;
+ --shift) {
+ *size = part_size(coord, shift);
+ if (target != NULL)
+ *size -= sizeof(cde_item_format);
+ if (*size <= free_space)
+ break;
+ }
+ shift = shift + 1;
+ } else {
+ int total_size;
+
+ assert("nikita-1301", pend == SHIFT_RIGHT);
+
+ total_size = item_length_by_coord(coord);
+ for (shift = units(coord) - want - 1; shift < units(coord) - 1;
+ ++shift) {
+ *size = total_size - part_size(coord, shift);
+ if (target == NULL)
+ *size += sizeof(cde_item_format);
+ if (*size <= free_space)
+ break;
+ }
+ shift = units(coord) - shift - 1;
+ }
+ if (shift == 0)
+ *size = 0;
+ CHECKME(coord);
+ return shift;
+}
+
+/* ->copy_units() method for this item plugin. */
+void copy_units_cde(coord_t * target /* coord of target item */ ,
+ coord_t * source /* coord of source item */ ,
+ unsigned from /* starting unit */ ,
+ unsigned count /* how many units to copy */ ,
+ shift_direction where_is_free_space /* shift direction */ ,
+ unsigned free_space /* free space in item */ )
+{
+ char *header_from;
+ char *header_to;
+
+ char *entry_from;
+ char *entry_to;
+
+ int pos_in_target;
+ int data_size;
+ int data_delta;
+ int i;
+
+ assert("nikita-1303", target != NULL);
+ assert("nikita-1304", source != NULL);
+ assert("nikita-1305", (int)from < units(source));
+ assert("nikita-1307", (int)(from + count) <= units(source));
+
+ if (where_is_free_space == SHIFT_LEFT) {
+ assert("nikita-1453", from == 0);
+ pos_in_target = units(target);
+ } else {
+ assert("nikita-1309", (int)(from + count) == units(source));
+ pos_in_target = 0;
+ memmove(item_body_by_coord(target),
+ (char *)item_body_by_coord(target) + free_space,
+ item_length_by_coord(target) - free_space);
+ }
+
+ CHECKME(target);
+ CHECKME(source);
+
+ /* expand @target */
+ data_size =
+ offset_of(source, (int)(from + count)) - offset_of(source,
+ (int)from);
+
+ if (units(target) == 0)
+ free_space -= sizeof(cde_item_format);
+
+ expand_item(target, pos_in_target, (int)count,
+ (int)(item_length_by_coord(target) - free_space),
+ (unsigned)data_size);
+
+ /* copy first @count units of @source into @target */
+ data_delta =
+ offset_of(target, pos_in_target) - offset_of(source, (int)from);
+
+ /* copy entries */
+ entry_from = (char *)entry_at(source, (int)from);
+ entry_to = (char *)entry_at(source, (int)(from + count));
+ memmove(entry_at(target, pos_in_target), entry_from,
+ (unsigned)(entry_to - entry_from));
+
+ /* copy headers */
+ header_from = (char *)header_at(source, (int)from);
+ header_to = (char *)header_at(source, (int)(from + count));
+ memmove(header_at(target, pos_in_target), header_from,
+ (unsigned)(header_to - header_from));
+
+ /* update offsets */
+ for (i = pos_in_target; i < (int)(pos_in_target + count); ++i)
+ adj_offset(target, i, data_delta);
+ CHECKME(target);
+ CHECKME(source);
+}
+
+/* ->cut_units() method for this item plugin. */
+int cut_units_cde(coord_t * coord /* coord of item */ ,
+ pos_in_node_t from /* start unit pos */ ,
+ pos_in_node_t to /* stop unit pos */ ,
+ struct carry_cut_data *cdata UNUSED_ARG,
+ reiser4_key * smallest_removed, reiser4_key * new_first)
+{
+ char *header_from;
+ char *header_to;
+
+ char *entry_from;
+ char *entry_to;
+
+ int size;
+ int entry_delta;
+ int header_delta;
+ int i;
+
+ unsigned count;
+
+ CHECKME(coord);
+
+ count = to - from + 1;
+
+ assert("nikita-1454", coord != NULL);
+ assert("nikita-1455", (int)(from + count) <= units(coord));
+
+ if (smallest_removed)
+ unit_key_by_coord(coord, smallest_removed);
+
+ if (new_first) {
+ coord_t next;
+
+ /* not everything is cut from item head */
+ assert("vs-1527", from == 0);
+ assert("vs-1528", to < units(coord) - 1);
+
+ coord_dup(&next, coord);
+ next.unit_pos++;
+ unit_key_by_coord(&next, new_first);
+ }
+
+ size = item_length_by_coord(coord);
+ if (count == (unsigned)units(coord)) {
+ return size;
+ }
+
+ header_from = (char *)header_at(coord, (int)from);
+ header_to = (char *)header_at(coord, (int)(from + count));
+
+ entry_from = (char *)entry_at(coord, (int)from);
+ entry_to = (char *)entry_at(coord, (int)(from + count));
+
+ /* move headers */
+ memmove(header_from, header_to,
+ (unsigned)(address(coord, size) - header_to));
+
+ header_delta = header_to - header_from;
+
+ entry_from -= header_delta;
+ entry_to -= header_delta;
+ size -= header_delta;
+
+ /* copy entries */
+ memmove(entry_from, entry_to,
+ (unsigned)(address(coord, size) - entry_to));
+
+ entry_delta = entry_to - entry_from;
+ size -= entry_delta;
+
+ /* update offsets */
+
+ for (i = 0; i < (int)from; ++i)
+ adj_offset(coord, i, -header_delta);
+
+ for (i = from; i < units(coord) - (int)count; ++i)
+ adj_offset(coord, i, -header_delta - entry_delta);
+
+ put_unaligned(cpu_to_le16((__u16) units(coord) - count),
+ &formatted_at(coord)->num_of_entries);
+
+ if (from == 0) {
+ /* entries were removed from the head - move the rest to the right */
+ memmove((char *)item_body_by_coord(coord) +
+ header_delta + entry_delta, item_body_by_coord(coord),
+ (unsigned)size);
+ if (REISER4_DEBUG)
+ memset(item_body_by_coord(coord), 0,
+ (unsigned)header_delta + entry_delta);
+ } else {
+ /* freed space is already at the end of item */
+ if (REISER4_DEBUG)
+ memset((char *)item_body_by_coord(coord) + size, 0,
+ (unsigned)header_delta + entry_delta);
+ }
+
+ return header_delta + entry_delta;
+}
+
+int kill_units_cde(coord_t * coord /* coord of item */ ,
+ pos_in_node_t from /* start unit pos */ ,
+ pos_in_node_t to /* stop unit pos */ ,
+ struct carry_kill_data *kdata UNUSED_ARG,
+ reiser4_key * smallest_removed, reiser4_key * new_first)
+{
+ return cut_units_cde(coord, from, to, NULL, smallest_removed, new_first);
+}
+
+/* ->s.dir.extract_key() method for this item plugin. */
+int extract_key_cde(const coord_t * coord /* coord of item */ ,
+ reiser4_key * key /* resulting key */ )
+{
+ directory_entry_format *dent;
+
+ assert("nikita-1155", coord != NULL);
+ assert("nikita-1156", key != NULL);
+
+ dent = entry_at(coord, idx_of(coord));
+ return extract_key_from_id(&dent->id, key);
+}
+
+int
+update_key_cde(const coord_t * coord, const reiser4_key * key,
+ lock_handle * lh UNUSED_ARG)
+{
+ directory_entry_format *dent;
+ obj_key_id obj_id;
+ int result;
+
+ assert("nikita-2344", coord != NULL);
+ assert("nikita-2345", key != NULL);
+
+ dent = entry_at(coord, idx_of(coord));
+ result = build_obj_key_id(key, &obj_id);
+ if (result == 0) {
+ dent->id = obj_id;
+ znode_make_dirty(coord->node);
+ }
+ return 0;
+}
+
+/* ->s.dir.extract_name() method for this item plugin. */
+char *extract_name_cde(const coord_t * coord /* coord of item */ , char *buf)
+{
+ directory_entry_format *dent;
+
+ assert("nikita-1157", coord != NULL);
+
+ dent = entry_at(coord, idx_of(coord));
+ return extract_dent_name(coord, dent, buf);
+}
+
+static int cde_bytes(int pasting, const reiser4_item_data * data)
+{
+ int result;
+
+ result = data->length;
+ if (!pasting)
+ result -= sizeof(cde_item_format);
+ return result;
+}
+
+/* ->s.dir.add_entry() method for this item plugin */
+int add_entry_cde(struct inode *dir /* directory object */ ,
+ coord_t * coord /* coord of item */ ,
+ lock_handle * lh /* lock handle for insertion */ ,
+ const struct dentry *name /* name to insert */ ,
+ reiser4_dir_entry_desc * dir_entry /* parameters of new
+ * directory entry */ )
+{
+ reiser4_item_data data;
+ struct cde_entry entry;
+ struct cde_entry_data edata;
+ int result;
+
+ assert("nikita-1656", coord->node == lh->node);
+ assert("nikita-1657", znode_is_write_locked(coord->node));
+
+ edata.num_of_entries = 1;
+ edata.entry = &entry;
+
+ entry.dir = dir;
+ entry.obj = dir_entry->obj;
+ entry.name = &name->d_name;
+
+ data.data = (char *)&edata;
+ data.user = 0; /* &edata is not user space */
+ data.iplug = item_plugin_by_id(COMPOUND_DIR_ID);
+ data.arg = dir_entry;
+ assert("nikita-1302", data.iplug != NULL);
+
+ result = is_dot_key(&dir_entry->key);
+ data.length = estimate_cde(result ? coord : NULL, &data);
+
+ inode_add_bytes(dir, cde_bytes(result, &data));
+
+ if (result)
+ result = insert_by_coord(coord, &data, &dir_entry->key, lh, 0);
+ else
+ result = reiser4_resize_item(coord, &data, &dir_entry->key,
+ lh, 0);
+ return result;
+}
+
+/* ->s.dir.rem_entry() */
+int rem_entry_cde(struct inode *dir /* directory of item */ ,
+ const struct qstr *name, coord_t * coord /* coord of item */ ,
+ lock_handle * lh UNUSED_ARG /* lock handle for
+ * removal */ ,
+ reiser4_dir_entry_desc * entry UNUSED_ARG /* parameters of
+ * directory entry
+ * being removed */ )
+{
+ coord_t shadow;
+ int result;
+ int length;
+ ON_DEBUG(char buf[DE_NAME_BUF_LEN]);
+
+ assert("nikita-2870", strlen(name->name) == name->len);
+ assert("nikita-2869",
+ !strcmp(name->name, extract_name_cde(coord, buf)));
+
+ length = sizeof(directory_entry_format) + sizeof(cde_unit_header);
+ if (is_longname(name->name, name->len))
+ length += name->len + 1;
+
+ if (inode_get_bytes(dir) < length) {
+ warning("nikita-2628", "Dir is broke: %llu: %llu",
+ (unsigned long long)get_inode_oid(dir),
+ inode_get_bytes(dir));
+
+ return RETERR(-EIO);
+ }
+
+ /* cut_node() is supposed to take pointers to _different_
+ coords, because it will modify them without respect to
+ possible aliasing. To work around this, create temporary copy
+ of @coord.
+ */
+ coord_dup(&shadow, coord);
+ result =
+ kill_node_content(coord, &shadow, NULL, NULL, NULL, NULL, NULL, 0);
+ if (result == 0) {
+ inode_sub_bytes(dir, length);
+ }
+ return result;
+}
+
+/* ->s.dir.max_name_len() method for this item plugin */
+int max_name_len_cde(const struct inode *dir /* directory */ )
+{
+ return meta_subvol_tree()->nplug->max_item_size() -
+ sizeof(directory_entry_format) - sizeof(cde_item_format) -
+ sizeof(cde_unit_header) - 2;
+}
+
+/* Make Linus happy.
+ Local variables:
+ c-indentation-style: "K&R"
+ mode-name: "LC"
+ c-basic-offset: 8
+ tab-width: 8
+ fill-column: 120
+ End:
+*/
diff -urN --no-dereference linux-5.10.2.orig/fs/reiser4/plugin/item/cde.h linux-5.10.2/fs/reiser4/plugin/item/cde.h
--- linux-5.10.2.orig/fs/reiser4/plugin/item/cde.h 1970-01-01 01:00:00.000000000 +0100
+++ linux-5.10.2/fs/reiser4/plugin/item/cde.h 2020-12-23 16:07:46.126813246 +0100
@@ -0,0 +1,87 @@
+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
+
+/* Compound directory item. See cde.c for description. */
+
+#if !defined( __FS_REISER4_PLUGIN_COMPRESSED_DE_H__ )
+#define __FS_REISER4_PLUGIN_COMPRESSED_DE_H__
+
+#include "../../forward.h"
+#include "../../kassign.h"
+#include "../../dformat.h"
+
+#include <linux/fs.h> /* for struct inode */
+#include <linux/dcache.h> /* for struct dentry, etc */
+
+typedef struct cde_unit_header {
+ de_id hash;
+ d16 offset;
+} cde_unit_header;
+
+typedef struct cde_item_format {
+ d16 num_of_entries;
+ cde_unit_header entry[0];
+} cde_item_format;
+
+struct cde_entry {
+ const struct inode *dir;
+ const struct inode *obj;
+ const struct qstr *name;
+};
+
+struct cde_entry_data {
+ int num_of_entries;
+ struct cde_entry *entry;
+};
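+
+/* Usage sketch (informal, mirroring add_entry_cde() in cde.c): a
+ single-entry insertion fills these structures roughly as
+
+	struct cde_entry entry = {
+		.dir  = dir,
+		.obj  = dir_entry->obj,
+		.name = &dentry->d_name,
+	};
+	struct cde_entry_data edata = {
+		.num_of_entries = 1,
+		.entry = &entry,
+	};
+
+ and passes &edata through reiser4_item_data.data to the item plugin. */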
+
+/* plugin->item.b.* */
+reiser4_key *max_key_inside_cde(const coord_t * coord, reiser4_key * result);
+int can_contain_key_cde(const coord_t * coord, const reiser4_key * key,
+ const reiser4_item_data *);
+int mergeable_cde(const coord_t * p1, const coord_t * p2);
+pos_in_node_t nr_units_cde(const coord_t * coord);
+reiser4_key *unit_key_cde(const coord_t * coord, reiser4_key * key);
+int estimate_cde(const coord_t * coord, const reiser4_item_data * data);
+void print_cde(const char *prefix, coord_t * coord);
+int init_cde(coord_t * coord, coord_t * from, reiser4_item_data * data);
+lookup_result lookup_cde(const reiser4_key * key, lookup_bias bias,
+ coord_t * coord);
+int paste_cde(coord_t * coord, reiser4_item_data * data,
+ carry_plugin_info * info UNUSED_ARG);
+int can_shift_cde(unsigned free_space, coord_t * coord, znode * target,
+ shift_direction pend, unsigned *size, unsigned want);
+void copy_units_cde(coord_t * target, coord_t * source, unsigned from,
+ unsigned count, shift_direction where_is_free_space,
+ unsigned free_space);
+int cut_units_cde(coord_t * coord, pos_in_node_t from, pos_in_node_t to,
+ struct carry_cut_data *, reiser4_key * smallest_removed,
+ reiser4_key * new_first);
+int kill_units_cde(coord_t * coord, pos_in_node_t from, pos_in_node_t to,
+ struct carry_kill_data *, reiser4_key * smallest_removed,
+ reiser4_key * new_first);
+void print_cde(const char *prefix, coord_t * coord);
+int reiser4_check_cde(const coord_t * coord, const char **error);
+
+/* plugin->u.item.s.dir.* */
+int extract_key_cde(const coord_t * coord, reiser4_key * key);
+int update_key_cde(const coord_t * coord, const reiser4_key * key,
+ lock_handle * lh);
+char *extract_name_cde(const coord_t * coord, char *buf);
+int add_entry_cde(struct inode *dir, coord_t * coord,
+ lock_handle * lh, const struct dentry *name,
+ reiser4_dir_entry_desc * entry);
+int rem_entry_cde(struct inode *dir, const struct qstr *name, coord_t * coord,
+ lock_handle * lh, reiser4_dir_entry_desc * entry);
+int max_name_len_cde(const struct inode *dir);
+
+/* __FS_REISER4_PLUGIN_COMPRESSED_DE_H__ */
+#endif
+
+/* Make Linus happy.
+ Local variables:
+ c-indentation-style: "K&R"
+ mode-name: "LC"
+ c-basic-offset: 8
+ tab-width: 8
+ fill-column: 120
+ End:
+*/
diff -urN --no-dereference linux-5.10.2.orig/fs/reiser4/plugin/item/ctail.c linux-5.10.2/fs/reiser4/plugin/item/ctail.c
--- linux-5.10.2.orig/fs/reiser4/plugin/item/ctail.c 1970-01-01 01:00:00.000000000 +0100
+++ linux-5.10.2/fs/reiser4/plugin/item/ctail.c 2020-12-23 16:07:46.127813261 +0100
@@ -0,0 +1,1747 @@
+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
+
+/* ctails (aka "clustered tails") are items for cryptcompress objects */
+
+/* DESCRIPTION:
+
+Each cryptcompress object is stored on disk as a set of clusters sliced
+into ctails.
+
+Internal on-disk structure:
+
+ HEADER (1) Here the disk cluster shift is stored
+ BODY
+*/
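+
+/* Reading of the format (informal; it restates the helpers defined below):
+ the header byte holds the logical cluster shift, so
+
+	disk_cluster_size = 1 << cluster_shift;
+	cluster_index     = key_offset >> cluster_shift;
+
+ and a key addresses the first item of a disk cluster iff
+ (key_offset & (disk_cluster_size - 1)) == 0, which is what
+ is_disk_cluster_key() checks for prepped ctails. */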
+
+#include "../../forward.h"
+#include "../../debug.h"
+#include "../../dformat.h"
+#include "../../kassign.h"
+#include "../../key.h"
+#include "../../coord.h"
+#include "item.h"
+#include "../node/node.h"
+#include "../plugin.h"
+#include "../object.h"
+#include "../../znode.h"
+#include "../../carry.h"
+#include "../../tree.h"
+#include "../../inode.h"
+#include "../../super.h"
+#include "../../context.h"
+#include "../../page_cache.h"
+#include "../cluster.h"
+#include "../../flush.h"
+#include "../../tree_walk.h"
+
+#include <linux/pagevec.h>
+#include <linux/swap.h>
+#include <linux/fs.h>
+
+/* return body of ctail item at @coord */
+static ctail_item_format *ctail_formatted_at(const coord_t * coord)
+{
+ assert("edward-60", coord != NULL);
+ return item_body_by_coord(coord);
+}
+
+static int cluster_shift_by_coord(const coord_t * coord)
+{
+ return get_unaligned(&ctail_formatted_at(coord)->cluster_shift);
+}
+
+static inline void dclust_set_extension_shift(hint_t * hint)
+{
+ assert("edward-1270",
+ item_id_by_coord(&hint->ext_coord.coord) == CTAIL_ID);
+ hint->ext_coord.extension.ctail.shift =
+ cluster_shift_by_coord(&hint->ext_coord.coord);
+}
+
+static loff_t off_by_coord(const coord_t * coord)
+{
+ reiser4_key key;
+ return get_key_offset(item_key_by_coord(coord, &key));
+}
+
+int coord_is_unprepped_ctail(const coord_t * coord)
+{
+ assert("edward-1233", coord != NULL);
+ assert("edward-1234", item_id_by_coord(coord) == CTAIL_ID);
+ assert("edward-1235",
+ ergo((int)cluster_shift_by_coord(coord) == (int)UCTAIL_SHIFT,
+ nr_units_ctail(coord) == (pos_in_node_t) UCTAIL_NR_UNITS));
+
+ return (int)cluster_shift_by_coord(coord) == (int)UCTAIL_SHIFT;
+}
+
+static cloff_t clust_by_coord(const coord_t * coord, struct inode *inode)
+{
+ int shift;
+
+ if (inode != NULL) {
+ shift = inode_cluster_shift(inode);
+ assert("edward-1236",
+ ergo(!coord_is_unprepped_ctail(coord),
+ shift == cluster_shift_by_coord(coord)));
+ } else {
+ assert("edward-1237", !coord_is_unprepped_ctail(coord));
+ shift = cluster_shift_by_coord(coord);
+ }
+ return off_by_coord(coord) >> shift;
+}
+
+static int disk_cluster_size(const coord_t * coord)
+{
+ assert("edward-1156",
+ item_plugin_by_coord(coord) == item_plugin_by_id(CTAIL_ID));
+ /* calculation of disk cluster size
+ is meaningless if ctail is unprepped */
+ assert("edward-1238", !coord_is_unprepped_ctail(coord));
+
+ return 1 << cluster_shift_by_coord(coord);
+}
+
+/* true if the key is of first disk cluster item */
+static int is_disk_cluster_key(const reiser4_key * key, const coord_t * coord)
+{
+ assert("edward-1239", item_id_by_coord(coord) == CTAIL_ID);
+
+ return coord_is_unprepped_ctail(coord) ||
+ ((get_key_offset(key) &
+ ((loff_t) disk_cluster_size(coord) - 1)) == 0);
+}
+
+static char *first_unit(coord_t * coord)
+{
+ /* FIXME: warning: pointer of type `void *' used in arithmetic */
+ return (char *)item_body_by_coord(coord) + sizeof(ctail_item_format);
+}
+
+/* plugin->u.item.b.max_key_inside :
+ tail_max_key_inside */
+
+/* plugin->u.item.b.can_contain_key */
+int can_contain_key_ctail(const coord_t * coord, const reiser4_key * key,
+ const reiser4_item_data * data)
+{
+ reiser4_key item_key;
+
+ if (item_plugin_by_coord(coord) != data->iplug)
+ return 0;
+
+ item_key_by_coord(coord, &item_key);
+ if (get_key_locality(key) != get_key_locality(&item_key) ||
+ get_key_objectid(key) != get_key_objectid(&item_key))
+ return 0;
+ if (get_key_offset(&item_key) + nr_units_ctail(coord) !=
+ get_key_offset(key))
+ return 0;
+ if (is_disk_cluster_key(key, coord))
+ /*
+ * can not merge at the beginning
+ * of a logical cluster in a file
+ */
+ return 0;
+ return 1;
+}
+
+/* plugin->u.item.b.mergeable */
+int mergeable_ctail(const coord_t * p1, const coord_t * p2)
+{
+ reiser4_key key1, key2;
+
+ assert("edward-62", item_id_by_coord(p1) == CTAIL_ID);
+ assert("edward-61", plugin_of_group(item_plugin_by_coord(p1),
+ FILE_BODY_ITEM_TYPE));
+
+ if (item_id_by_coord(p2) != CTAIL_ID) {
+ /* second item is of another type */
+ return 0;
+ }
+ item_key_by_coord(p1, &key1);
+ item_key_by_coord(p2, &key2);
+ if (get_key_locality(&key1) != get_key_locality(&key2) ||
+ get_key_objectid(&key1) != get_key_objectid(&key2) ||
+ get_key_type(&key1) != get_key_type(&key2)) {
+ /* items of different objects */
+ return 0;
+ }
+ if (get_key_offset(&key1) + nr_units_ctail(p1) != get_key_offset(&key2))
+ /* not adjacent items */
+ return 0;
+ if (is_disk_cluster_key(&key2, p2))
+ /*
+ * can not merge at the beginning
+ * of a logical cluster in a file
+ */
+ return 0;
+ return 1;
+}
+
+/* plugin->u.item.b.nr_units */
+pos_in_node_t nr_units_ctail(const coord_t * coord)
+{
+ return (item_length_by_coord(coord) -
+ sizeof(ctail_formatted_at(coord)->cluster_shift));
+}
+
+/* plugin->u.item.b.estimate:
+ estimate how much space is needed to insert/paste @data->length bytes
+ into ctail at @coord */
+int estimate_ctail(const coord_t * coord /* coord of item */ ,
+ const reiser4_item_data *
+ data /* parameters for new item */ )
+{
+ if (coord == NULL)
+ /* insert */
+ return (sizeof(ctail_item_format) + data->length);
+ else
+ /* paste */
+ return data->length;
+}
+
+/* ->init() method for this item plugin. */
+int init_ctail(coord_t * to /* coord of item */ ,
+ coord_t * from /* old_item */ ,
+ reiser4_item_data * data /* structure used for insertion */ )
+{
+ int cluster_shift; /* cpu value to convert */
+
+ if (data) {
+ assert("edward-463", data->length > sizeof(ctail_item_format));
+ cluster_shift = *((int *)(data->arg));
+ data->length -= sizeof(ctail_item_format);
+ } else {
+ assert("edward-464", from != NULL);
+ assert("edward-855", ctail_ok(from));
+ cluster_shift = (int)(cluster_shift_by_coord(from));
+ }
+ put_unaligned((d8)cluster_shift, &ctail_formatted_at(to)->cluster_shift);
+ assert("edward-856", ctail_ok(to));
+ return 0;
+}
+
+/* plugin->u.item.b.lookup:
+ NULL: We are looking for item keys only */
+
+#if REISER4_DEBUG
+int ctail_ok(const coord_t * coord)
+{
+ return coord_is_unprepped_ctail(coord) ||
+ cluster_shift_ok(cluster_shift_by_coord(coord));
+}
+
+/* plugin->u.item.b.check */
+int check_ctail(const coord_t * coord, const char **error)
+{
+ if (!ctail_ok(coord)) {
+ if (error)
+ *error = "bad cluster shift in ctail";
+ return 1;
+ }
+ return 0;
+}
+#endif
+
+/* plugin->u.item.b.paste */
+int
+paste_ctail(coord_t * coord, reiser4_item_data * data,
+ carry_plugin_info * info UNUSED_ARG)
+{
+ unsigned old_nr_units;
+
+ assert("edward-268", data->data != NULL);
+ /* copy only from kernel space */
+ assert("edward-66", data->user == 0);
+
+ old_nr_units =
+ item_length_by_coord(coord) - sizeof(ctail_item_format) -
+ data->length;
+
+ /* ctail items never get pasted in the middle */
+
+ if (coord->unit_pos == 0 && coord->between == AT_UNIT) {
+
+ /* paste at the beginning when create new item */
+ assert("edward-450",
+ item_length_by_coord(coord) ==
+ data->length + sizeof(ctail_item_format));
+ assert("edward-451", old_nr_units == 0);
+ } else if (coord->unit_pos == old_nr_units - 1
+ && coord->between == AFTER_UNIT) {
+
+ /* paste at the end */
+ coord->unit_pos++;
+ } else
+ impossible("edward-453", "bad paste position");
+
+ memcpy(first_unit(coord) + coord->unit_pos, data->data, data->length);
+
+ assert("edward-857", ctail_ok(coord));
+
+ return 0;
+}
+
+/* plugin->u.item.b.fast_paste */
+
+/*
+ * plugin->u.item.b.can_shift
+ *
+ * Return number of units that can be shifted;
+ * Store space (in bytes) occupied by those units in @size.
+ */
+int can_shift_ctail(unsigned free_space, coord_t *source,
+ znode * target, shift_direction direction UNUSED_ARG,
+ unsigned *size, unsigned want)
+{
+ /* make sure that we do not want to shift more than we have */
+ assert("edward-68", want > 0 && want <= nr_units_ctail(source));
+
+ *size = min(want, free_space);
+
+ if (!target) {
+ /*
+ * new item will be created
+ */
+ if (*size <= sizeof(ctail_item_format)) {
+ /*
+ * can not shift only ctail header
+ */
+ *size = 0;
+ return 0;
+ }
+ return *size - sizeof(ctail_item_format);
+ }
+ else
+ /*
+ * shifting to the mergeable item
+ */
+ return *size;
+}
+
+/*
+ * plugin->u.item.b.copy_units
+ * cooperates with ->can_shift()
+ */
+void copy_units_ctail(coord_t * target, coord_t * source,
+ unsigned from, unsigned count /* units */ ,
+ shift_direction where_is_free_space,
+ unsigned free_space /* bytes */ )
+{
+ /* make sure that item @target is expanded already */
+ assert("edward-69", (unsigned)item_length_by_coord(target) >= count);
+ assert("edward-70", free_space == count || free_space == count + 1);
+
+ assert("edward-858", ctail_ok(source));
+
+ if (where_is_free_space == SHIFT_LEFT) {
+ /*
+ * append item @target with @count first bytes
+ * of @source: this restriction came from ordinary tails
+ */
+ assert("edward-71", from == 0);
+ assert("edward-860", ctail_ok(target));
+
+ memcpy(first_unit(target) + nr_units_ctail(target) - count,
+ first_unit(source), count);
+ } else {
+ /*
+ * target item is moved to right already
+ */
+ reiser4_key key;
+
+ assert("edward-72", nr_units_ctail(source) == from + count);
+
+ if (free_space == count) {
+ init_ctail(target, source, NULL);
+ } else {
+ /*
+ * shifting to a mergeable item
+ */
+ assert("edward-862", ctail_ok(target));
+ }
+ memcpy(first_unit(target), first_unit(source) + from, count);
+
+ assert("edward-863", ctail_ok(target));
+ /*
+ * new units are inserted before first unit
+ * in an item, therefore, we have to update
+ * item key
+ */
+ item_key_by_coord(source, &key);
+ set_key_offset(&key, get_key_offset(&key) + from);
+
+ node_plugin_by_node(target->node)->update_item_key(target,
+ &key,
+ NULL /*info */);
+ }
+}
+
+/* plugin->u.item.b.create_hook */
+int create_hook_ctail(const coord_t * coord, void *arg)
+{
+ assert("edward-864", znode_is_loaded(coord->node));
+
+ znode_set_convertible(coord->node);
+ return 0;
+}
+
+/* plugin->u.item.b.kill_hook */
+int kill_hook_ctail(const coord_t * coord, pos_in_node_t from,
+ pos_in_node_t count, carry_kill_data * kdata)
+{
+ struct inode *inode;
+
+ assert("edward-1157", item_id_by_coord(coord) == CTAIL_ID);
+ assert("edward-291", znode_is_write_locked(coord->node));
+
+ inode = kdata->inode;
+ if (inode) {
+ reiser4_key key;
+ struct cryptcompress_info * info;
+ cloff_t index;
+
+ item_key_by_coord(coord, &key);
+ info = cryptcompress_inode_data(inode);
+ index = off_to_clust(get_key_offset(&key), inode);
+
+ if (from == 0) {
+ info->trunc_index = index;
+ if (is_disk_cluster_key(&key, coord)) {
+ /*
+ * first item of disk cluster is to be killed
+ */
+ truncate_complete_page_cluster(
+ inode, index, kdata->params.truncate);
+ inode_sub_bytes(inode,
+ inode_cluster_size(inode));
+ }
+ }
+ }
+ return 0;
+}
+
+/* for shift_hook_ctail(),
+ return true if the first disk cluster item has dirty child
+*/
+static int ctail_convertible(const coord_t * coord)
+{
+ int result;
+ reiser4_key key;
+ jnode *child = NULL;
+
+ assert("edward-477", coord != NULL);
+ assert("edward-478", item_id_by_coord(coord) == CTAIL_ID);
+
+ if (coord_is_unprepped_ctail(coord))
+ /* unprepped ctail should be converted */
+ return 1;
+
+ item_key_by_coord(coord, &key);
+ child = jlookup(get_key_objectid(&key), off_to_pg(off_by_coord(coord)));
+ if (!child)
+ return 0;
+ result = JF_ISSET(child, JNODE_DIRTY);
+ jput(child);
+ return result;
+}
+
+/* FIXME-EDWARD */
+/* plugin->u.item.b.shift_hook */
+int shift_hook_ctail(const coord_t * item /* coord of item */ ,
+ unsigned from UNUSED_ARG /* start unit */ ,
+ unsigned count UNUSED_ARG /* stop unit */ ,
+ znode * old_node /* old parent */ )
+{
+ assert("edward-479", item != NULL);
+ assert("edward-480", item->node != old_node);
+
+ if (!znode_convertible(old_node) || znode_convertible(item->node))
+ return 0;
+ if (ctail_convertible(item))
+ znode_set_convertible(item->node);
+ return 0;
+}
+
+static int
+cut_or_kill_ctail_units(coord_t * coord, pos_in_node_t from, pos_in_node_t to,
+ int cut, void *p, reiser4_key * smallest_removed,
+ reiser4_key * new_first)
+{
+ pos_in_node_t count; /* number of units to cut */
+ char *item;
+
+ count = to - from + 1;
+ item = item_body_by_coord(coord);
+
+ assert("edward-74", ergo(from != 0, to == coord_last_unit_pos(coord)));
+
+ if (smallest_removed) {
+ /* store smallest key removed */
+ item_key_by_coord(coord, smallest_removed);
+ set_key_offset(smallest_removed,
+ get_key_offset(smallest_removed) + from);
+ }
+
+ if (new_first) {
+ assert("vs-1531", from == 0);
+
+ item_key_by_coord(coord, new_first);
+ set_key_offset(new_first,
+ get_key_offset(new_first) + from + count);
+ }
+
+ if (!cut)
+ kill_hook_ctail(coord, from, 0, (struct carry_kill_data *)p);
+
+ if (from == 0) {
+ if (count != nr_units_ctail(coord)) {
+ /* part of item is removed, so move free space at the beginning
+ of the item and update item key */
+ reiser4_key key;
+ memcpy(item + to + 1, item, sizeof(ctail_item_format));
+ item_key_by_coord(coord, &key);
+ set_key_offset(&key, get_key_offset(&key) + count);
+ node_plugin_by_node(coord->node)->update_item_key(coord,
+ &key,
+ NULL);
+ } else {
+ /* cut_units should not be called to cut everything */
+ assert("vs-1532", ergo(cut, 0));
+ /* whole item is cut, so more than the amount of space occupied
+ by the units got freed */
+ count += sizeof(ctail_item_format);
+ }
+ }
+ return count;
+}
+
+/* plugin->u.item.b.cut_units */
+int
+cut_units_ctail(coord_t * item, pos_in_node_t from, pos_in_node_t to,
+ carry_cut_data * cdata, reiser4_key * smallest_removed,
+ reiser4_key * new_first)
+{
+ return cut_or_kill_ctail_units(item, from, to, 1, NULL,
+ smallest_removed, new_first);
+}
+
+/* plugin->u.item.b.kill_units */
+int
+kill_units_ctail(coord_t * item, pos_in_node_t from, pos_in_node_t to,
+ struct carry_kill_data *kdata, reiser4_key * smallest_removed,
+ reiser4_key * new_first)
+{
+ return cut_or_kill_ctail_units(item, from, to, 0, kdata,
+ smallest_removed, new_first);
+}
+
+/* plugin->u.item.s.file.read */
+int read_ctail(struct file *file UNUSED_ARG, flow_t * f, hint_t * hint)
+{
+ uf_coord_t *uf_coord;
+ coord_t *coord;
+
+ uf_coord = &hint->ext_coord;
+ coord = &uf_coord->coord;
+ assert("edward-127", f->user == 0);
+ assert("edward-129", coord && coord->node);
+ assert("edward-130", coord_is_existing_unit(coord));
+ assert("edward-132", znode_is_loaded(coord->node));
+
+ /* start read only from the beginning of ctail */
+ assert("edward-133", coord->unit_pos == 0);
+ /* read only whole ctails */
+ assert("edward-135", nr_units_ctail(coord) <= f->length);
+
+ assert("edward-136", reiser4_schedulable());
+ assert("edward-886", ctail_ok(coord));
+
+ if (f->data)
+ memcpy(f->data, (char *)first_unit(coord),
+ (size_t) nr_units_ctail(coord));
+
+ dclust_set_extension_shift(hint);
+ mark_page_accessed(znode_page(coord->node));
+ move_flow_forward(f, nr_units_ctail(coord));
+
+ return 0;
+}
+
+/**
+ * Prepare transform stream with plain text for page
+ * @page taking into account synchronization issues.
+ */
+static int ctail_read_disk_cluster(struct cluster_handle * clust,
+ struct inode * inode, struct page * page,
+ znode_lock_mode mode)
+{
+ int result;
+
+ assert("edward-1450", mode == ZNODE_READ_LOCK || ZNODE_WRITE_LOCK);
+ assert("edward-671", clust->hint != NULL);
+ assert("edward-140", clust->dstat == INVAL_DISK_CLUSTER);
+ assert("edward-672", cryptcompress_inode_ok(inode));
+ assert("edward-1527", PageLocked(page));
+
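+	/*
+	 * Drop the page lock across the tree lookup below; the page is
+	 * re-locked afterwards and re-validated against truncation and
+	 * PageUptodate before the transform stream is used.
+	 */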
+ unlock_page(page);
+
+ /* set input stream */
+ result = grab_tfm_stream(inode, &clust->tc, INPUT_STREAM);
+ if (result) {
+ lock_page(page);
+ return result;
+ }
+ result = find_disk_cluster(clust, inode, 1 /* read items */, mode);
+ lock_page(page);
+ if (result)
+ return result;
+ /*
+ * at this point we have locked position in the tree
+ */
+ assert("edward-1528", znode_is_any_locked(clust->hint->lh.node));
+
+ if (page->mapping != inode->i_mapping) {
+ /* page was truncated */
+ reiser4_unset_hint(clust->hint);
+ reset_cluster_params(clust);
+ return AOP_TRUNCATED_PAGE;
+ }
+ if (PageUptodate(page)) {
+ /* disk cluster can be obsolete, don't use it! */
+ reiser4_unset_hint(clust->hint);
+ reset_cluster_params(clust);
+ return 0;
+ }
+ if (clust->dstat == FAKE_DISK_CLUSTER ||
+ clust->dstat == UNPR_DISK_CLUSTER ||
+ clust->dstat == TRNC_DISK_CLUSTER) {
+ /*
+ * this information about disk cluster will be valid
+ * as long as we keep the position in the tree locked
+ */
+ tfm_cluster_set_uptodate(&clust->tc);
+ return 0;
+ }
+ /* now prepare output stream.. */
+ result = grab_coa(&clust->tc, inode_compression_plugin(inode));
+ if (result)
+ return result;
+ /* ..and fill this with plain text */
+ result = reiser4_inflate_cluster(clust, inode);
+ if (result)
+ return result;
+ /*
+ * The stream is ready! It won't be obsolete as
+ * long as we keep last disk cluster item locked.
+ */
+ tfm_cluster_set_uptodate(&clust->tc);
+ return 0;
+}
+
+/*
+ * fill one page with plain text.
+ */
+int do_readpage_ctail(struct inode * inode, struct cluster_handle * clust,
+ struct page *page, znode_lock_mode mode)
+{
+ int ret;
+ unsigned cloff;
+ char *data;
+ size_t to_page;
+ struct tfm_cluster * tc = &clust->tc;
+
+ assert("edward-212", PageLocked(page));
+
+ if (unlikely(page->mapping != inode->i_mapping))
+ return AOP_TRUNCATED_PAGE;
+ if (PageUptodate(page))
+ goto exit;
+ to_page = pbytes(page_index(page), inode);
+ if (to_page == 0) {
+ zero_user(page, 0, PAGE_SIZE);
+ SetPageUptodate(page);
+ goto exit;
+ }
+ if (!tfm_cluster_is_uptodate(&clust->tc)) {
+ clust->index = pg_to_clust(page->index, inode);
+
+ /* this will unlock/lock the page */
+ ret = ctail_read_disk_cluster(clust, inode, page, mode);
+
+ assert("edward-212", PageLocked(page));
+ if (ret)
+ return ret;
+
+ /* refresh bytes */
+ to_page = pbytes(page_index(page), inode);
+ if (to_page == 0) {
+ zero_user(page, 0, PAGE_SIZE);
+ SetPageUptodate(page);
+ goto exit;
+ }
+ }
+ if (PageUptodate(page))
+ /* somebody else filled it already */
+ goto exit;
+
+ assert("edward-119", tfm_cluster_is_uptodate(tc));
+ assert("edward-1529", znode_is_any_locked(clust->hint->lh.node));
+
+ switch (clust->dstat) {
+ case UNPR_DISK_CLUSTER:
+ /*
+ * Page is not uptodate and item cluster is unprepped:
+ * this must not ever happen.
+ */
+ warning("edward-1632",
+ "Bad item cluster %lu (Inode %llu). Fsck?",
+ clust->index,
+ (unsigned long long)get_inode_oid(inode));
+ return RETERR(-EIO);
+ case TRNC_DISK_CLUSTER:
+ /*
+ * Race with truncate!
+ * We resolve it in favour of the last one (the only way,
+ * as in this case plain text is unrecoverable)
+ */
+ case FAKE_DISK_CLUSTER:
+ /* fill the page by zeroes */
+ zero_user(page, 0, PAGE_SIZE);
+ SetPageUptodate(page);
+ break;
+ case PREP_DISK_CLUSTER:
+ /* fill page by transformed stream with plain text */
+ assert("edward-1058", !PageUptodate(page));
+ assert("edward-120", tc->len <= inode_cluster_size(inode));
+
+ /* page index in this logical cluster */
+ cloff = pg_to_off_to_cloff(page->index, inode);
+
+ data = kmap(page);
+ memcpy(data, tfm_stream_data(tc, OUTPUT_STREAM) + cloff, to_page);
+ memset(data + to_page, 0, (size_t) PAGE_SIZE - to_page);
+ flush_dcache_page(page);
+ kunmap(page);
+ SetPageUptodate(page);
+ break;
+ default:
+ impossible("edward-1169", "bad disk cluster state");
+ }
+ exit:
+ return 0;
+}
+
+/* plugin->u.item.s.file.readpage */
+int readpage_ctail(void *vp, struct page *page)
+{
+ int result;
+ hint_t * hint;
+ struct cluster_handle * clust = vp;
+
+ assert("edward-114", clust != NULL);
+ assert("edward-115", PageLocked(page));
+ assert("edward-116", !PageUptodate(page));
+ assert("edward-118", page->mapping && page->mapping->host);
+ assert("edward-867", !tfm_cluster_is_uptodate(&clust->tc));
+
+ hint = kmalloc(sizeof(*hint), reiser4_ctx_gfp_mask_get());
+ if (hint == NULL) {
+ unlock_page(page);
+ return RETERR(-ENOMEM);
+ }
+ clust->hint = hint;
+ result = load_file_hint(clust->file, hint);
+ if (result) {
+ kfree(hint);
+ unlock_page(page);
+ return result;
+ }
+ assert("vs-25", hint->ext_coord.lh == &hint->lh);
+
+ result = do_readpage_ctail(page->mapping->host, clust, page,
+ ZNODE_READ_LOCK);
+ assert("edward-213", PageLocked(page));
+ assert("edward-1163", ergo(!result, PageUptodate(page)));
+
+ unlock_page(page);
+ done_lh(&hint->lh);
+ hint->ext_coord.valid = 0;
+ save_file_hint(clust->file, hint);
+ kfree(hint);
+ tfm_cluster_clr_uptodate(&clust->tc);
+
+ return result;
+}
+
+/* Helper function for ->readpages() */
+static int ctail_read_page_cluster(struct cluster_handle * clust,
+ struct inode *inode)
+{
+ int i;
+ int result;
+ assert("edward-779", clust != NULL);
+ assert("edward-1059", clust->win == NULL);
+ assert("edward-780", inode != NULL);
+
+ result = prepare_page_cluster(inode, clust, READ_OP);
+ if (result)
+ return result;
+
+ assert("edward-781", !tfm_cluster_is_uptodate(&clust->tc));
+
+ for (i = 0; i < clust->nr_pages; i++) {
+ struct page *page = clust->pages[i];
+ lock_page(page);
+ result = do_readpage_ctail(inode, clust, page, ZNODE_READ_LOCK);
+ unlock_page(page);
+ if (result)
+ break;
+ }
+ tfm_cluster_clr_uptodate(&clust->tc);
+ put_page_cluster(clust, inode, READ_OP);
+ return result;
+}
+
+/* filler for read_cache_pages() */
+static int ctail_readpages_filler(void * data, struct page * page)
+{
+ int ret = 0;
+ struct cluster_handle * clust = data;
+ struct inode * inode = file_inode(clust->file);
+
+ assert("edward-1525", page->mapping == inode->i_mapping);
+
+ if (PageUptodate(page)) {
+ unlock_page(page);
+ return 0;
+ }
+ if (pbytes(page_index(page), inode) == 0) {
+ zero_user(page, 0, PAGE_SIZE);
+ SetPageUptodate(page);
+ unlock_page(page);
+ return 0;
+ }
+ move_cluster_forward(clust, inode, page->index);
+ unlock_page(page);
+ /*
+ * read the whole page cluster
+ */
+ ret = ctail_read_page_cluster(clust, inode);
+
+ assert("edward-869", !tfm_cluster_is_uptodate(&clust->tc));
+ return ret;
+}
+
+/*
+ * We populate a bit more than upper readahead suggests:
+ * with each nominated page we read the whole page cluster
+ * this page belongs to.
+ */
+int readpages_ctail(struct file *file, struct address_space *mapping,
+ struct list_head *pages)
+{
+ int ret = 0;
+ hint_t *hint;
+ struct cluster_handle clust;
+ struct inode *inode = mapping->host;
+
+ assert("edward-1521", inode == file_inode(file));
+
+ cluster_init_read(&clust, NULL);
+ clust.file = file;
+ hint = kmalloc(sizeof(*hint), reiser4_ctx_gfp_mask_get());
+ if (hint == NULL) {
+ warning("vs-28", "failed to allocate hint");
+ ret = RETERR(-ENOMEM);
+ goto exit1;
+ }
+ clust.hint = hint;
+ ret = load_file_hint(clust.file, hint);
+ if (ret) {
+ warning("edward-1522", "failed to load hint");
+ goto exit2;
+ }
+ assert("vs-26", hint->ext_coord.lh == &hint->lh);
+ ret = alloc_cluster_pgset(&clust, cluster_nrpages(inode));
+ if (ret) {
+ warning("edward-1523", "failed to alloc pgset");
+ goto exit3;
+ }
+ ret = read_cache_pages(mapping, pages, ctail_readpages_filler, &clust);
+
+ assert("edward-870", !tfm_cluster_is_uptodate(&clust.tc));
+ exit3:
+ done_lh(&hint->lh);
+ save_file_hint(file, hint);
+ hint->ext_coord.valid = 0;
+ exit2:
+ kfree(hint);
+ exit1:
+ put_cluster_handle(&clust);
+ return ret;
+}
+
+reiser4_key *append_key_ctail(const coord_t *coord, reiser4_key *key)
+{
+ assert("edward-1241", item_id_by_coord(coord) == CTAIL_ID);
+ assert("edward-1242", cluster_shift_ok(cluster_shift_by_coord(coord)));
+
+ item_key_by_coord(coord, key);
+ set_key_offset(key, ((__u64) (clust_by_coord(coord, NULL)) + 1)
+ << cluster_shift_by_coord(coord));
+ return key;
+}
+
+static int insert_unprepped_ctail(struct cluster_handle * clust,
+ struct inode *inode)
+{
+ int result;
+ char buf[UCTAIL_NR_UNITS];
+ reiser4_item_data data;
+ reiser4_key key;
+ int shift = (int)UCTAIL_SHIFT;
+
+ memset(buf, 0, (size_t) UCTAIL_NR_UNITS);
+ result = build_body_key_cryptcompress(inode,
+ clust_to_off(clust->index, inode),
+ &key);
+ if (result)
+ return result;
+ data.user = 0;
+ data.iplug = item_plugin_by_id(CTAIL_ID);
+ data.arg = &shift;
+ data.length = sizeof(ctail_item_format) + (size_t) UCTAIL_NR_UNITS;
+ data.data = buf;
+
+ result = insert_by_coord(&clust->hint->ext_coord.coord,
+ &data, &key, clust->hint->ext_coord.lh, 0);
+ return result;
+}
+
+static int
+insert_cryptcompress_flow(coord_t * coord, lock_handle * lh, flow_t * f,
+ int cluster_shift)
+{
+ int result;
+ carry_pool *pool;
+ carry_level *lowest_level;
+ reiser4_item_data *data;
+ carry_op *op;
+
+ pool =
+ init_carry_pool(sizeof(*pool) + 3 * sizeof(*lowest_level) +
+ sizeof(*data));
+ if (IS_ERR(pool))
+ return PTR_ERR(pool);
+ lowest_level = (carry_level *) (pool + 1);
+ init_carry_level(lowest_level, pool);
+ data = (reiser4_item_data *) (lowest_level + 3);
+
+ assert("edward-466", coord->between == AFTER_ITEM
+ || coord->between == AFTER_UNIT || coord->between == BEFORE_ITEM
+ || coord->between == EMPTY_NODE
+ || coord->between == BEFORE_UNIT);
+
+ if (coord->between == AFTER_UNIT) {
+ coord->unit_pos = 0;
+ coord->between = AFTER_ITEM;
+ }
+ op = reiser4_post_carry(lowest_level, COP_INSERT_FLOW, coord->node,
+ 0 /* operate directly on coord -> node */);
+ if (IS_ERR(op) || (op == NULL)) {
+ done_carry_pool(pool);
+ return RETERR(op ? PTR_ERR(op) : -EIO);
+ }
+ data->user = 0;
+ data->iplug = item_plugin_by_id(CTAIL_ID);
+ data->arg = &cluster_shift;
+
+ data->length = 0;
+ data->data = NULL;
+
+ op->u.insert_flow.flags =
+ COPI_SWEEP |
+ COPI_DONT_SHIFT_LEFT |
+ COPI_DONT_SHIFT_RIGHT;
+ op->u.insert_flow.insert_point = coord;
+ op->u.insert_flow.flow = f;
+ op->u.insert_flow.data = data;
+ op->u.insert_flow.new_nodes = 0;
+
+ lowest_level->track_type = CARRY_TRACK_CHANGE;
+ lowest_level->tracked = lh;
+
+ result = reiser4_carry(lowest_level, NULL);
+ done_carry_pool(pool);
+
+ return result;
+}
+
+/* Implementation of CRC_APPEND_ITEM mode of ctail conversion */
+static int insert_cryptcompress_flow_in_place(coord_t * coord,
+ lock_handle * lh, flow_t * f,
+ int cluster_shift)
+{
+ int ret;
+ coord_t pos;
+ lock_handle lock;
+
+ assert("edward-484",
+ coord->between == AT_UNIT || coord->between == AFTER_ITEM);
+ assert("edward-485", item_id_by_coord(coord) == CTAIL_ID);
+
+ coord_dup(&pos, coord);
+ pos.unit_pos = 0;
+ pos.between = AFTER_ITEM;
+
+ init_lh(&lock);
+ copy_lh(&lock, lh);
+
+ ret = insert_cryptcompress_flow(&pos, &lock, f, cluster_shift);
+ done_lh(&lock);
+ assert("edward-1347", znode_is_write_locked(lh->node));
+ assert("edward-1228", !ret);
+ return ret;
+}
+
+/* Implementation of CRC_OVERWRITE_ITEM mode of ctail conversion */
+static int overwrite_ctail(coord_t * coord, flow_t * f)
+{
+ unsigned count;
+
+ assert("edward-269", f->user == 0);
+ assert("edward-270", f->data != NULL);
+ assert("edward-271", f->length > 0);
+ assert("edward-272", coord_is_existing_unit(coord));
+ assert("edward-273", coord->unit_pos == 0);
+ assert("edward-274", znode_is_write_locked(coord->node));
+ assert("edward-275", reiser4_schedulable());
+ assert("edward-467", item_id_by_coord(coord) == CTAIL_ID);
+ assert("edward-1243", ctail_ok(coord));
+
+ count = nr_units_ctail(coord);
+
+ if (count > f->length)
+ count = f->length;
+ memcpy(first_unit(coord), f->data, count);
+ move_flow_forward(f, count);
+ coord->unit_pos += count;
+ return 0;
+}
+
+/* Implementation of CRC_CUT_ITEM mode of ctail conversion:
+ cut ctail (part or whole) starting from next unit position */
+static int cut_ctail(coord_t * coord)
+{
+ coord_t stop;
+
+ assert("edward-435", coord->between == AT_UNIT &&
+ coord->item_pos < coord_num_items(coord) &&
+ coord->unit_pos <= coord_num_units(coord));
+
+ if (coord->unit_pos == coord_num_units(coord))
+ /* nothing to cut */
+ return 0;
+ coord_dup(&stop, coord);
+ stop.unit_pos = coord_last_unit_pos(coord);
+
+ return cut_node_content(coord, &stop, NULL, NULL, NULL);
+}
+
+int ctail_insert_unprepped_cluster(struct cluster_handle * clust,
+ struct inode * inode)
+{
+ int result;
+ assert("edward-1244", inode != NULL);
+ assert("edward-1245", clust->hint != NULL);
+ assert("edward-1246", clust->dstat == FAKE_DISK_CLUSTER);
+ assert("edward-1247", clust->reserved == 1);
+
+ result = get_disk_cluster_locked(clust, inode, ZNODE_WRITE_LOCK);
+ if (cbk_errored(result))
+ return result;
+ assert("edward-1249", result == CBK_COORD_NOTFOUND);
+ assert("edward-1250", znode_is_write_locked(clust->hint->lh.node));
+
+ assert("edward-1295",
+ clust->hint->ext_coord.lh->node ==
+ clust->hint->ext_coord.coord.node);
+
+ coord_set_between_clusters(&clust->hint->ext_coord.coord);
+
+ result = insert_unprepped_ctail(clust, inode);
+ all_grabbed2free();
+
+ assert("edward-1251", !result);
+ assert("edward-1252", cryptcompress_inode_ok(inode));
+ assert("edward-1253", znode_is_write_locked(clust->hint->lh.node));
+ assert("edward-1254",
+ reiser4_subvol_clustered_blocks(get_meta_subvol()) != 0);
+ assert("edward-1255",
+ znode_convertible(clust->hint->ext_coord.coord.node));
+
+ return result;
+}
+
+/* plugin->u.item.f.scan */
+int scan_ctail(flush_scan * scan)
+{
+ int result = 0;
+ struct page *page;
+ struct inode *inode;
+ jnode *node = scan->node;
+
+ assert("edward-227", scan->node != NULL);
+ assert("edward-228", jnode_is_cluster_page(scan->node));
+ assert("edward-639", znode_is_write_locked(scan->parent_lock.node));
+
+ page = jnode_page(node);
+ inode = page->mapping->host;
+
+ if (!reiser4_scanning_left(scan))
+ return result;
+
+ if (!znode_convertible(scan->parent_lock.node)) {
+ if (JF_ISSET(scan->node, JNODE_DIRTY))
+ znode_set_convertible(scan->parent_lock.node);
+ else {
+ warning("edward-681",
+ "cluster page is already processed");
+ return -EAGAIN;
+ }
+ }
+ return result;
+}
+
+/* Returns true if convert data should be attached; in that case the
+ leftmost dirty child is referenced in pos->child */
+static int should_attach_convert_idata(flush_pos_t * pos)
+{
+ int result;
+ assert("edward-431", pos != NULL);
+ assert("edward-432", pos->child == NULL);
+ assert("edward-619", znode_is_write_locked(pos->coord.node));
+ assert("edward-470",
+ item_plugin_by_coord(&pos->coord) ==
+ item_plugin_by_id(CTAIL_ID));
+
+ /* check for leftmost child */
+ utmost_child_ctail(&pos->coord, LEFT_SIDE, &pos->child);
+
+ if (!pos->child)
+ return 0;
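+	/*
+	 * attach only if the child jnode is dirty and belongs to the
+	 * same atom as its parent twig node
+	 */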
+ spin_lock_jnode(pos->child);
+ result = (JF_ISSET(pos->child, JNODE_DIRTY) &&
+ pos->child->atom == ZJNODE(pos->coord.node)->atom);
+ spin_unlock_jnode(pos->child);
+ if (!result && pos->child) {
+ /* existing child isn't to attach, clear up this one */
+ jput(pos->child);
+ pos->child = NULL;
+ }
+ return result;
+}
+
+/**
+ * Collect all needed information about the object here,
+ * as the in-memory inode can be evicted from memory before
+ * the disk update completes.
+ */
+static int init_convert_data_ctail(struct convert_item_info * idata,
+ struct inode *inode)
+{
+ assert("edward-813", idata != NULL);
+ assert("edward-814", inode != NULL);
+
+ idata->cluster_shift = inode_cluster_shift(inode);
+ idata->d_cur = DC_FIRST_ITEM;
+ idata->d_next = DC_INVALID_STATE;
+
+ return 0;
+}
+
+static int alloc_item_convert_data(struct convert_info * sq)
+{
+ assert("edward-816", sq != NULL);
+ assert("edward-817", sq->itm == NULL);
+
+ sq->itm = kmalloc(sizeof(*sq->itm), reiser4_ctx_gfp_mask_get());
+ if (sq->itm == NULL)
+ return RETERR(-ENOMEM);
+ init_lh(&sq->right_lock);
+ sq->right_locked = 0;
+ return 0;
+}
+
+static void free_item_convert_data(struct convert_info * sq)
+{
+ assert("edward-818", sq != NULL);
+ assert("edward-819", sq->itm != NULL);
+ assert("edward-820", sq->iplug != NULL);
+
+ done_lh(&sq->right_lock);
+ sq->right_locked = 0;
+ kfree(sq->itm);
+ sq->itm = NULL;
+ return;
+}
+
+static struct convert_info *alloc_convert_data(void)
+{
+ struct convert_info *info;
+
+ info = kmalloc(sizeof(*info), reiser4_ctx_gfp_mask_get());
+ if (info != NULL) {
+ memset(info, 0, sizeof(*info));
+ cluster_init_write(&info->clust, NULL);
+ }
+ return info;
+}
+
+static void reset_convert_data(struct convert_info *info)
+{
+ info->clust.tc.hole = 0;
+}
+
+void free_convert_data(flush_pos_t * pos)
+{
+ struct convert_info *sq;
+
+ assert("edward-823", pos != NULL);
+ assert("edward-824", pos->sq != NULL);
+
+ sq = pos->sq;
+ if (sq->itm)
+ free_item_convert_data(sq);
+ put_cluster_handle(&sq->clust);
+ kfree(pos->sq);
+ pos->sq = NULL;
+ return;
+}
+
+static int init_item_convert_data(flush_pos_t * pos, struct inode *inode)
+{
+ struct convert_info *sq;
+
+ assert("edward-825", pos != NULL);
+ assert("edward-826", pos->sq != NULL);
+ assert("edward-827", item_convert_data(pos) != NULL);
+ assert("edward-828", inode != NULL);
+
+ sq = pos->sq;
+ memset(sq->itm, 0, sizeof(*sq->itm));
+
+ /* iplug->init_convert_data() */
+ return init_convert_data_ctail(sq->itm, inode);
+}
+
+/* create and attach disk cluster info used by 'convert' phase of the flush
+ squalloc() */
+static int attach_convert_idata(flush_pos_t * pos, struct inode *inode)
+{
+ int ret = 0;
+ struct convert_item_info *info;
+ struct cluster_handle *clust;
+
+ assert("edward-248", pos != NULL);
+ assert("edward-249", pos->child != NULL);
+ assert("edward-251", inode != NULL);
+ assert("edward-682", cryptcompress_inode_ok(inode));
+ assert("edward-252",
+ inode_file_plugin(inode) ==
+ file_plugin_by_id(CRYPTCOMPRESS_FILE_PLUGIN_ID));
+ assert("edward-473",
+ item_plugin_by_coord(&pos->coord) ==
+ item_plugin_by_id(CTAIL_ID));
+
+ if (!pos->sq) {
+ pos->sq = alloc_convert_data();
+ if (!pos->sq)
+ return RETERR(-ENOMEM);
+ }
+ else
+ reset_convert_data(pos->sq);
+
+ clust = &pos->sq->clust;
+
+ ret = set_cluster_by_page(clust,
+ jnode_page(pos->child),
+ MAX_CLUSTER_NRPAGES);
+ if (ret)
+ goto err;
+
+ assert("edward-829", pos->sq != NULL);
+ assert("edward-250", item_convert_data(pos) == NULL);
+
+ pos->sq->iplug = item_plugin_by_id(CTAIL_ID);
+
+ ret = alloc_item_convert_data(pos->sq);
+ if (ret)
+ goto err;
+ ret = init_item_convert_data(pos, inode);
+ if (ret)
+ goto err;
+ info = item_convert_data(pos);
+
+ ret = checkout_logical_cluster(clust, pos->child, inode);
+ if (ret)
+ goto err;
+
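+	/*
+	 * transform (compress and possibly encrypt) the checked-out logical
+	 * cluster; the result is placed into the output transform stream,
+	 * which is used to build the flow below
+	 */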
+ reiser4_deflate_cluster(clust, inode);
+ inc_item_convert_count(pos);
+
+ /* prepare flow for insertion */
+ flow_by_inode_cryptcompress(inode,
+ (const char __user *)tfm_stream_data(&clust->tc,
+ OUTPUT_STREAM),
+ 0, /* kernel space */
+ clust->tc.len,
+ clust_to_off(clust->index, inode),
+ WRITE_OP, &info->flow);
+ if (clust->tc.hole)
+ info->flow.length = 0;
+
+ jput(pos->child);
+ return 0;
+ err:
+ jput(pos->child);
+ free_convert_data(pos);
+ return ret;
+}
+
+/* clear up disk cluster info */
+static void detach_convert_idata(struct convert_info * sq)
+{
+ struct convert_item_info *info;
+
+ assert("edward-253", sq != NULL);
+ assert("edward-840", sq->itm != NULL);
+
+ info = sq->itm;
+ assert("edward-1212", info->flow.length == 0);
+
+ free_item_convert_data(sq);
+ return;
+}
+
+/* plugin->u.item.f.utmost_child */
+
+/* This function sets the leftmost child of the first item of a disk cluster,
+   if such a child exists, and NULL otherwise.
+   NOTE-EDWARD: Do not call this for RIGHT_SIDE */
+
+int utmost_child_ctail(const coord_t * coord, sideof side, jnode ** child)
+{
+ reiser4_key key;
+
+	assert("edward-257", coord != NULL);
+	assert("edward-258", child != NULL);
+	assert("edward-259", side == LEFT_SIDE);
+	assert("edward-260",
+	       item_plugin_by_coord(coord) == item_plugin_by_id(CTAIL_ID));
+
+	item_key_by_coord(coord, &key);
+
+ if (!is_disk_cluster_key(&key, coord))
+ *child = NULL;
+ else
+ *child = jlookup(get_key_objectid(item_key_by_coord(coord,
+ &key)),
+ off_to_pg(get_key_offset(&key)));
+ return 0;
+}
+
+/*
+ * Set the status (d_next) of the first item of the right neighbor.
+ *
+ * If the current position is the last item in the node, then
+ * look at the first item of the right neighbor (skipping empty nodes).
+ * Note that the right neighbor may be not dirty because of races.
+ * If so, set its convertible flag.
+ */
+static int pre_convert_ctail(flush_pos_t * pos)
+{
+ int ret = 0;
+ int stop = 0;
+ znode *slider;
+ lock_handle slider_lh;
+ lock_handle right_lh;
+
+ assert("edward-1232", !node_is_empty(pos->coord.node));
+ assert("edward-1014",
+ pos->coord.item_pos < coord_num_items(&pos->coord));
+ assert("edward-1015", convert_data_attached(pos));
+ assert("edward-1611",
+ item_convert_data(pos)->d_cur != DC_INVALID_STATE);
+ assert("edward-1017",
+ item_convert_data(pos)->d_next == DC_INVALID_STATE);
+
+ /*
+ * In the following two cases we don't need
+ * to look at right neighbor
+ */
+ if (item_convert_data(pos)->d_cur == DC_AFTER_CLUSTER) {
+ /*
+ * cluster is over, so the first item of the right
+ * neighbor doesn't belong to this cluster
+ */
+ return 0;
+ }
+ if (pos->coord.item_pos < coord_num_items(&pos->coord) - 1) {
+ /*
+ * current position is not the last item in the node,
+ * so the first item of the right neighbor doesn't
+ * belong to this cluster
+ */
+ return 0;
+ }
+ /*
+ * Look at right neighbor.
+ * Note that concurrent truncate is not a problem
+ * since we have locked the beginning of the cluster.
+ */
+ slider = pos->coord.node;
+ init_lh(&slider_lh);
+ init_lh(&right_lh);
+
+ while (!stop) {
+ coord_t coord;
+
+ ret = reiser4_get_right_neighbor(&right_lh,
+ slider,
+ ZNODE_WRITE_LOCK,
+ GN_CAN_USE_UPPER_LEVELS);
+ if (ret)
+ break;
+ slider = right_lh.node;
+ ret = zload(slider);
+ if (ret)
+ break;
+ coord_init_before_first_item(&coord, slider);
+
+ if (node_is_empty(slider)) {
+ warning("edward-1641", "Found empty right neighbor");
+ znode_set_convertible(slider);
+ /*
+ * skip this node,
+ * go rightward
+ */
+ stop = 0;
+ } else if (same_disk_cluster(&pos->coord, &coord)) {
+
+ item_convert_data(pos)->d_next = DC_CHAINED_ITEM;
+
+ if (!znode_convertible(slider)) {
+ /*
+ warning("edward-1272",
+ "next slum item mergeable, "
+ "but znode %p isn't convertible\n",
+ lh.node);
+ */
+ znode_set_convertible(slider);
+ }
+ stop = 1;
+ convert_data(pos)->right_locked = 1;
+ } else {
+ item_convert_data(pos)->d_next = DC_AFTER_CLUSTER;
+ stop = 1;
+ convert_data(pos)->right_locked = 1;
+ }
+ zrelse(slider);
+ done_lh(&slider_lh);
+ move_lh(&slider_lh, &right_lh);
+ }
+ if (convert_data(pos)->right_locked)
+ /*
+ * Store locked right neighbor in
+ * the conversion info. Otherwise,
+ * we won't be able to access it,
+ * if the current node gets deleted
+ * during conversion
+ */
+ move_lh(&convert_data(pos)->right_lock, &slider_lh);
+ done_lh(&slider_lh);
+ done_lh(&right_lh);
+
+ if (ret == -E_NO_NEIGHBOR) {
+ item_convert_data(pos)->d_next = DC_AFTER_CLUSTER;
+ ret = 0;
+ }
+ assert("edward-1610",
+ ergo(ret != 0,
+ item_convert_data(pos)->d_next == DC_INVALID_STATE));
+ return ret;
+}
+
+/*
+ * do some post-conversion actions;
+ * detach conversion data if there is nothing to convert anymore
+ */
+static void post_convert_ctail(flush_pos_t * pos,
+ ctail_convert_mode_t mode, int old_nr_items)
+{
+ switch (mode) {
+ case CTAIL_CUT_ITEM:
+ assert("edward-1214", item_convert_data(pos)->flow.length == 0);
+ assert("edward-1215",
+ coord_num_items(&pos->coord) == old_nr_items ||
+ coord_num_items(&pos->coord) == old_nr_items - 1);
+
+ if (item_convert_data(pos)->d_next == DC_CHAINED_ITEM)
+ /*
+			 * the next item belongs to this cluster
+			 * and should also be killed
+ */
+ break;
+ if (coord_num_items(&pos->coord) != old_nr_items) {
+ /*
+			 * the last item of the
+			 * cluster has been killed
+ */
+ detach_convert_idata(pos->sq);
+ if (!node_is_empty(pos->coord.node))
+ /*
+ * make sure the next item will be scanned
+ */
+ coord_init_before_item(&pos->coord);
+ break;
+ }
+ /* fall through */
+ case CTAIL_APPEND_ITEM:
+ /*
+ * in the append mode the whole flow has been inserted
+ * (see COP_INSERT_FLOW primitive)
+ */
+ assert("edward-434", item_convert_data(pos)->flow.length == 0);
+ detach_convert_idata(pos->sq);
+ break;
+ case CTAIL_OVERWRITE_ITEM:
+ if (coord_is_unprepped_ctail(&pos->coord)) {
+ /*
+ * the first (unprepped) ctail has been overwritten;
+ * convert it to the prepped one
+ */
+ assert("edward-1259",
+ cluster_shift_ok(item_convert_data(pos)->
+ cluster_shift));
+ put_unaligned((d8)item_convert_data(pos)->cluster_shift,
+ &ctail_formatted_at(&pos->coord)->
+ cluster_shift);
+ }
+ break;
+ default:
+ impossible("edward-1609", "Bad ctail conversion mode");
+ }
+}
+
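+/*
+ * Choose the conversion mode for the ctail item at the current flush
+ * position: attach new convert info and select OVERWRITE (or CUT, if the
+ * new content is a hole) for the first item of a disk cluster, reuse the
+ * attached info for subsequent items, and select APPEND once the end of
+ * the old cluster body has been reached. CTAIL_INVAL_CONVERT_MODE means
+ * the item is not to be converted.
+ */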
+static int assign_conversion_mode(flush_pos_t * pos, ctail_convert_mode_t *mode)
+{
+ int ret = 0;
+
+ *mode = CTAIL_INVAL_CONVERT_MODE;
+
+ if (!convert_data_attached(pos)) {
+ if (should_attach_convert_idata(pos)) {
+ struct inode *inode;
+ gfp_t old_mask = get_current_context()->gfp_mask;
+
+ assert("edward-264", pos->child != NULL);
+ assert("edward-265", jnode_page(pos->child) != NULL);
+ assert("edward-266",
+ jnode_page(pos->child)->mapping != NULL);
+
+ inode = jnode_page(pos->child)->mapping->host;
+
+ assert("edward-267", inode != NULL);
+ /*
+ * attach new convert item info
+ */
+ get_current_context()->gfp_mask |= __GFP_NOFAIL;
+ ret = attach_convert_idata(pos, inode);
+ get_current_context()->gfp_mask = old_mask;
+ pos->child = NULL;
+ if (ret == -E_REPEAT) {
+ /*
+				 * jnode became clean, or there are no dirty
+				 * pages (nothing to update in the disk cluster)
+ */
+ warning("edward-1021",
+ "convert_ctail: nothing to attach");
+ ret = 0;
+ goto dont_convert;
+ }
+ if (ret)
+ goto dont_convert;
+
+ if (pos->sq->clust.tc.hole) {
+ assert("edward-1634",
+ item_convert_data(pos)->flow.length == 0);
+ /*
+ * new content is filled with zeros -
+ * we punch a hole using cut (not kill)
+ * primitive, so attached pages won't
+ * be truncated
+ */
+ *mode = CTAIL_CUT_ITEM;
+ }
+ else
+ /*
+ * this is the first ctail in the cluster,
+				 * so it (possibly only its head) should be
+ * overwritten
+ */
+ *mode = CTAIL_OVERWRITE_ITEM;
+ } else
+ /*
+ * non-convertible item
+ */
+ goto dont_convert;
+ } else {
+ /*
+ * use old convert info
+ */
+ struct convert_item_info *idata;
+ idata = item_convert_data(pos);
+
+ switch (idata->d_cur) {
+ case DC_FIRST_ITEM:
+ case DC_CHAINED_ITEM:
+ if (idata->flow.length)
+ *mode = CTAIL_OVERWRITE_ITEM;
+ else
+ *mode = CTAIL_CUT_ITEM;
+ break;
+ case DC_AFTER_CLUSTER:
+ if (idata->flow.length)
+ *mode = CTAIL_APPEND_ITEM;
+ else {
+ /*
+ * nothing to update anymore
+ */
+ detach_convert_idata(pos->sq);
+ goto dont_convert;
+ }
+ break;
+ default:
+ impossible("edward-1018",
+ "wrong current item state");
+ ret = RETERR(-EIO);
+ goto dont_convert;
+ }
+ }
+ /*
+ * ok, ctail will be converted
+ */
+ assert("edward-433", convert_data_attached(pos));
+ assert("edward-1022",
+ pos->coord.item_pos < coord_num_items(&pos->coord));
+ return 0;
+ dont_convert:
+ return ret;
+}
+
+/*
+ * perform an operation on the ctail item in
+ * accordance with assigned conversion @mode
+ */
+static int do_convert_ctail(flush_pos_t * pos, ctail_convert_mode_t mode)
+{
+ int result = 0;
+ struct convert_item_info * info;
+
+ assert("edward-468", pos != NULL);
+ assert("edward-469", pos->sq != NULL);
+ assert("edward-845", item_convert_data(pos) != NULL);
+
+ info = item_convert_data(pos);
+ assert("edward-679", info->flow.data != NULL);
+
+ switch (mode) {
+ case CTAIL_APPEND_ITEM:
+ assert("edward-1229", info->flow.length != 0);
+ assert("edward-1256",
+ cluster_shift_ok(cluster_shift_by_coord(&pos->coord)));
+ /*
+ * insert flow without balancing
+ * (see comments to convert_node())
+ */
+ result = insert_cryptcompress_flow_in_place(&pos->coord,
+ &pos->lock,
+ &info->flow,
+ info->cluster_shift);
+ break;
+ case CTAIL_OVERWRITE_ITEM:
+ assert("edward-1230", info->flow.length != 0);
+ overwrite_ctail(&pos->coord, &info->flow);
+ if (info->flow.length != 0)
+ break;
+ /* fall through */
+ /* cut the rest of item (if any) */
+ case CTAIL_CUT_ITEM:
+ assert("edward-1231", info->flow.length == 0);
+ result = cut_ctail(&pos->coord);
+ break;
+ default:
+ result = RETERR(-EIO);
+ impossible("edward-244", "bad ctail conversion mode");
+ }
+ return result;
+}
+
+/*
+ * plugin->u.item.f.convert
+ *
+ * Convert ctail items at flush time
+ */
+int convert_ctail(flush_pos_t * pos)
+{
+ int ret;
+ int old_nr_items;
+ ctail_convert_mode_t mode;
+
+ assert("edward-1020", pos != NULL);
+ assert("edward-1213", coord_num_items(&pos->coord) != 0);
+ assert("edward-1257", item_id_by_coord(&pos->coord) == CTAIL_ID);
+ assert("edward-1258", ctail_ok(&pos->coord));
+ assert("edward-261", pos->coord.node != NULL);
+
+ old_nr_items = coord_num_items(&pos->coord);
+ /*
+ * detach old conversion data and
+ * attach a new one, if needed
+ */
+ ret = assign_conversion_mode(pos, &mode);
+ if (ret || mode == CTAIL_INVAL_CONVERT_MODE) {
+ assert("edward-1633", !convert_data_attached(pos));
+ return ret;
+ }
+ /*
+ * find out the status of the right neighbor
+ */
+ ret = pre_convert_ctail(pos);
+ if (ret) {
+ detach_convert_idata(pos->sq);
+ return ret;
+ }
+ ret = do_convert_ctail(pos, mode);
+ if (ret) {
+ detach_convert_idata(pos->sq);
+ return ret;
+ }
+ /*
+ * detach old conversion data if needed
+ */
+ post_convert_ctail(pos, mode, old_nr_items);
+ return 0;
+}
+
+/*
+ Local variables:
+ c-indentation-style: "K&R"
+ mode-name: "LC"
+ c-basic-offset: 8
+ tab-width: 8
+ fill-column: 120
+ End:
+*/
diff -urN --no-dereference linux-5.10.2.orig/fs/reiser4/plugin/item/ctail.h linux-5.10.2/fs/reiser4/plugin/item/ctail.h
--- linux-5.10.2.orig/fs/reiser4/plugin/item/ctail.h 1970-01-01 01:00:00.000000000 +0100
+++ linux-5.10.2/fs/reiser4/plugin/item/ctail.h 2020-12-23 16:07:46.127813261 +0100
@@ -0,0 +1,103 @@
+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
+
+/* Ctail items are fragments (or bodies) of a special type that provide
+   optimal storage of encrypted and/or compressed files. */
+
+
+#if !defined( __FS_REISER4_CTAIL_H__ )
+#define __FS_REISER4_CTAIL_H__
+
+/* Disk format of ctail item */
+typedef struct ctail_item_format {
+ /* packed shift;
+	   if its value is different from UCTAIL_SHIFT (see below), then the
+	   size of the disk cluster is calculated as (1 << cluster_shift) */
+ d8 cluster_shift;
+ /* ctail body */
+ d8 body[0];
+} __attribute__ ((packed)) ctail_item_format;
+
+/* "Unprepped" disk cluster is represented by a single ctail item
+ with the following "magic" attributes: */
+/* "magic" cluster_shift */
+#define UCTAIL_SHIFT 0xff
+/* How many units unprepped ctail item has */
+#define UCTAIL_NR_UNITS 1
+
+/* The following is a set of various item states in a disk cluster.
+   A disk cluster is a set of items whose keys belong to the interval
+   [dc_key, dc_key + disk_cluster_size - 1] */
+typedef enum {
+ DC_INVALID_STATE = 0,
+ DC_FIRST_ITEM = 1,
+ DC_CHAINED_ITEM = 2,
+ DC_AFTER_CLUSTER = 3
+} dc_item_stat;
+
+/* ctail-specific extension.
+   In particular, this describes parameters of the disk cluster an item belongs to */
+struct ctail_coord_extension {
+ int shift; /* this contains cluster_shift extracted from
+ ctail_item_format (above), or UCTAIL_SHIFT
+ (the last one is the "magic" of unprepped disk clusters)*/
+ int dsize; /* size of a prepped disk cluster */
+ int ncount; /* count of nodes occupied by a disk cluster */
+};
+
+struct cut_list;
+
+/* plugin->item.b.* */
+
+int can_contain_key_ctail(const coord_t *, const reiser4_key *,
+ const reiser4_item_data *);
+int mergeable_ctail(const coord_t * p1, const coord_t * p2);
+pos_in_node_t nr_units_ctail(const coord_t * coord);
+int estimate_ctail(const coord_t * coord, const reiser4_item_data * data);
+void print_ctail(const char *prefix, coord_t * coord);
+lookup_result lookup_ctail(const reiser4_key *, lookup_bias, coord_t *);
+
+int paste_ctail(coord_t * coord, reiser4_item_data * data,
+ carry_plugin_info * info UNUSED_ARG);
+int init_ctail(coord_t *, coord_t *, reiser4_item_data *);
+int can_shift_ctail(unsigned free_space, coord_t * coord,
+ znode * target, shift_direction pend, unsigned *size,
+ unsigned want);
+void copy_units_ctail(coord_t * target, coord_t * source, unsigned from,
+ unsigned count, shift_direction where_is_free_space,
+ unsigned free_space);
+int cut_units_ctail(coord_t * coord, pos_in_node_t from, pos_in_node_t to,
+ carry_cut_data *, reiser4_key * smallest_removed,
+ reiser4_key * new_first);
+int kill_units_ctail(coord_t * coord, pos_in_node_t from, pos_in_node_t to,
+ carry_kill_data *, reiser4_key * smallest_removed,
+ reiser4_key * new_first);
+int ctail_ok(const coord_t * coord);
+int check_ctail(const coord_t * coord, const char **error);
+
+/* plugin->u.item.s.* */
+int read_ctail(struct file *, flow_t *, hint_t *);
+int readpage_ctail(void *, struct page *);
+int readpages_ctail(struct file *, struct address_space *, struct list_head *);
+reiser4_key *append_key_ctail(const coord_t *, reiser4_key *);
+int create_hook_ctail(const coord_t * coord, void *arg);
+int kill_hook_ctail(const coord_t *, pos_in_node_t, pos_in_node_t,
+ carry_kill_data *);
+int shift_hook_ctail(const coord_t *, unsigned, unsigned, znode *);
+
+/* plugin->u.item.f */
+int utmost_child_ctail(const coord_t *, sideof, jnode **);
+int scan_ctail(flush_scan *);
+int convert_ctail(flush_pos_t *);
+size_t inode_scaled_cluster_size(struct inode *);
+
+#endif /* __FS_REISER4_CTAIL_H__ */
+
+/* Make Linus happy.
+ Local variables:
+ c-indentation-style: "K&R"
+ mode-name: "LC"
+ c-basic-offset: 8
+ tab-width: 8
+ fill-column: 120
+ End:
+*/
diff -urN --no-dereference linux-5.10.2.orig/fs/reiser4/plugin/item/extent.c linux-5.10.2/fs/reiser4/plugin/item/extent.c
--- linux-5.10.2.orig/fs/reiser4/plugin/item/extent.c 1970-01-01 01:00:00.000000000 +0100
+++ linux-5.10.2/fs/reiser4/plugin/item/extent.c 2020-12-23 16:07:46.127813261 +0100
@@ -0,0 +1,203 @@
+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
+
+#include "item.h"
+#include "../../key.h"
+#include "../../super.h"
+#include "../../carry.h"
+#include "../../inode.h"
+#include "../../page_cache.h"
+#include "../../flush.h"
+#include "../object.h"
+
+/* Prepare a reiser4_item_data structure. It is used to put one extent unit into the tree */
+/* Audited by: green(2002.06.13) */
+reiser4_item_data *init_new_extent(item_id extent_id, reiser4_item_data *data,
+ void *ext_unit, int nr_extents)
+{
+ data->data = ext_unit;
+ /* data->data is kernel space */
+ data->user = 0;
+ data->length = sizeof(reiser4_extent) * nr_extents;
+ data->arg = NULL;
+ data->iplug = item_plugin_by_id(extent_id);
+ return data;
+}
+
+/* how many bytes are addressed by the first @nr extent units of the extent item */
+reiser4_block_nr reiser4_extent_size_at(const coord_t * coord, pos_in_node_t nr)
+{
+ pos_in_node_t i;
+ reiser4_block_nr blocks;
+ reiser4_extent *ext;
+
+ ext = item_body_by_coord(coord);
+ assert("vs-263", nr <= nr_units_extent(coord));
+
+ blocks = 0;
+ for (i = 0; i < nr; i++, ext++) {
+ blocks += extent_get_width(ext);
+ }
+
+ return blocks * current_blocksize;
+}
+
+reiser4_block_nr reiser4_extent_size(const coord_t *coord)
+{
+ return reiser4_extent_size_at(coord, nr_units_extent(coord));
+}
+
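+/*
+ * The state of an extent unit is encoded in its start block number:
+ * 0 means a hole, 1 means an unallocated extent, any other value is
+ * the start of an allocated extent.
+ */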
+extent_state state_of_extent(reiser4_extent * ext)
+{
+ switch ((int)extent_get_start(ext)) {
+ case 0:
+ return HOLE_EXTENT;
+ case 1:
+ return UNALLOCATED_EXTENT;
+ default:
+ break;
+ }
+ return ALLOCATED_EXTENT;
+}
+
+int extent_is_unallocated(const coord_t * item)
+{
+ assert("jmacd-5133", item_is_extent(item));
+
+ return state_of_extent(extent_by_coord(item)) == UNALLOCATED_EXTENT;
+}
+
+/* set extent's start and width */
+void reiser4_set_extent(reiser4_subvol *subv, reiser4_extent *ext,
+ reiser4_block_nr start, reiser4_block_nr width)
+{
+ extent_set_start(subv, ext, start);
+ extent_set_width(subv, ext, width);
+}
+
+/**
+ * replace_extent_unit - overwrite extent unit and paste 1 or 2 after it
+ * @extent_id: item plugin id of the extent item being modified
+ * @h: replace handle carrying the coordinate of the extent unit to be
+ *     overwritten, its lock handle, the replacement extent, the new
+ *     extents to be pasted and the paste key
+ * @return_inserted_position: see below
+ *
+ * Overwrites one extent and pastes 1 or 2 more after the overwritten one. If
+ * @return_inserted_position is 1, h->coord and h->lh are returned set to the
+ * first of the newly inserted units; if it is 0, they are returned set to the
+ * extent which was overwritten.
+ */
+int replace_extent_unit(item_id extent_id, struct replace_handle *h,
+ int return_inserted_position)
+{
+ int result;
+ znode *orig_znode;
+ /*ON_DEBUG(reiser4_extent orig_ext);*/ /* this is for debugging */
+
+ assert("vs-990", coord_is_existing_unit(h->coord));
+ assert("vs-1375", znode_is_write_locked(h->coord->node));
+ assert("vs-1426", extent_get_width(&h->overwrite) != 0);
+ assert("vs-1427", extent_get_width(&h->new_extents[0]) != 0);
+ assert("vs-1427", ergo(h->nr_new_extents == 2,
+ extent_get_width(&h->new_extents[1]) != 0));
+
+ /* compose structure for paste */
+ init_new_extent(extent_id, &h->item,
+ &h->new_extents[0], h->nr_new_extents);
+
+ coord_dup(&h->coord_after, h->coord);
+ init_lh(&h->lh_after);
+ copy_lh(&h->lh_after, h->lh);
+ reiser4_tap_init(&h->watch, &h->coord_after, &h->lh_after, ZNODE_WRITE_LOCK);
+ reiser4_tap_monitor(&h->watch);
+
+ ON_DEBUG(h->orig_ext = *extent_by_coord(h->coord));
+ orig_znode = h->coord->node;
+
+#if REISER4_DEBUG
+ /* make sure that key is set properly */
+ unit_key_by_coord(h->coord, &h->tmp);
+ set_key_offset(&h->tmp,
+ get_key_offset(&h->tmp) +
+ extent_get_width(&h->overwrite) * current_blocksize);
+ assert("vs-1080", keyeq(&h->tmp, &h->paste_key));
+#endif
+
+ /* set insert point after unit to be replaced */
+ h->coord->between = AFTER_UNIT;
+
+ result = insert_into_item(h->coord, return_inserted_position ? h->lh : NULL,
+ &h->paste_key, &h->item, h->flags);
+ if (!result) {
+ /* now we have to replace the unit after which new units were
+ inserted. Its position is tracked by @watch */
+ reiser4_extent *ext;
+ znode *node;
+
+ node = h->coord_after.node;
+ if (node != orig_znode) {
+ coord_clear_iplug(&h->coord_after);
+ result = zload(node);
+ }
+
+ if (likely(!result)) {
+ ext = extent_by_coord(&h->coord_after);
+
+ assert("vs-987", znode_is_loaded(node));
+ assert("vs-988", !memcmp(ext, &h->orig_ext, sizeof(*ext)));
+
+ /* overwrite extent unit */
+ memcpy(ext, &h->overwrite, sizeof(reiser4_extent));
+ znode_make_dirty(node);
+
+ if (node != orig_znode)
+ zrelse(node);
+
+ if (return_inserted_position == 0) {
+ /* coord and lh are to be set to overwritten
+ extent */
+ assert("vs-1662",
+ WITH_DATA(node, !memcmp(&h->overwrite,
+ extent_by_coord(
+ &h->coord_after),
+ sizeof(reiser4_extent))));
+
+ *h->coord = h->coord_after;
+ done_lh(h->lh);
+ copy_lh(h->lh, &h->lh_after);
+ } else {
+ /* h->coord and h->lh are to be set to first of
+ inserted units */
+ assert("vs-1663",
+ WITH_DATA(h->coord->node,
+ !memcmp(&h->new_extents[0],
+ extent_by_coord(h->coord),
+ sizeof(reiser4_extent))));
+ assert("vs-1664", h->lh->node == h->coord->node);
+ }
+ }
+ }
+ reiser4_tap_done(&h->watch);
+
+ return result;
+}
+
+lock_handle *znode_lh(znode *node)
+{
+ assert("vs-1371", znode_is_write_locked(node));
+ assert("vs-1372", znode_is_wlocked_once(node));
+ return list_entry(node->lock.owners.next, lock_handle, owners_link);
+}
+
+/*
+ * Local variables:
+ * c-indentation-style: "K&R"
+ * mode-name: "LC"
+ * c-basic-offset: 8
+ * tab-width: 8
+ * fill-column: 79
+ * scroll-step: 1
+ * End:
+ */
diff -urN --no-dereference linux-5.10.2.orig/fs/reiser4/plugin/item/extent_file_ops.c linux-5.10.2/fs/reiser4/plugin/item/extent_file_ops.c
--- linux-5.10.2.orig/fs/reiser4/plugin/item/extent_file_ops.c 1970-01-01 01:00:00.000000000 +0100
+++ linux-5.10.2/fs/reiser4/plugin/item/extent_file_ops.c 2020-12-23 16:07:46.127813261 +0100
@@ -0,0 +1,1461 @@
+/* COPYRIGHT 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
+
+#include "item.h"
+#include "../../inode.h"
+#include "../../page_cache.h"
+#include "../object.h"
+#include "../volume/volume.h"
+#include <linux/swap.h>
+
+static inline reiser4_extent *ext_by_offset(const znode *node, int offset)
+{
+ reiser4_extent *ext;
+
+ ext = (reiser4_extent *) (zdata(node) + offset);
+ return ext;
+}
+
+static inline reiser4_extent *ext_by_ext_coord(const uf_coord_t *uf_coord)
+{
+ return ext_by_offset(uf_coord->coord.node,
+ uf_coord->extension.extent.ext_offset);
+}
+
+/**
+ * Verify the coord extension @uf_coord, optionally against @key.
+ *
+ * Make sure that all fields of @uf_coord are set properly.
+ * If @key is specified, check that @uf_coord corresponds to it.
+ */
+void check_uf_coord(const uf_coord_t *uf_coord, const reiser4_key *key)
+{
+#if REISER4_DEBUG
+ const coord_t *coord;
+ const struct extent_coord_extension *ext_coord;
+ reiser4_extent *ext;
+ reiser4_key coord_key;
+
+ coord = &uf_coord->coord;
+ unit_key_by_coord(coord, &coord_key);
+
+ ext_coord = &uf_coord->extension.extent;
+ ext = ext_by_offset(coord->node, uf_coord->extension.extent.ext_offset);
+
+ assert("edward-2047",
+ WITH_DATA(coord->node,
+ (uf_coord->valid == 1 &&
+ coord_is_iplug_set(coord) &&
+ item_is_extent(coord) &&
+ ext_coord->nr_units == nr_units_extent(coord) &&
+ ext == extent_by_coord(coord) &&
+ ext_coord->width == extent_get_width(ext) &&
+ coord->unit_pos < ext_coord->nr_units &&
+ ext_coord->pos_in_unit < ext_coord->width &&
+ memcmp(ext, &ext_coord->extent,
+ sizeof(reiser4_extent)) == 0)));
+ if (key) {
+ set_key_offset(&coord_key,
+ get_key_offset(&coord_key) +
+ (uf_coord->extension.extent.
+ pos_in_unit << PAGE_SHIFT));
+ set_key_ordering(&coord_key, get_key_ordering(key));
+ assert("edward-2326", keyeq(key, &coord_key));
+ }
+#endif
+}
+
+#if REISER4_DEBUG
+
+/**
+ * return 1 if offset @off is inside the extent unit pointed to by @coord,
+ * and 0 otherwise
+ */
+static int offset_is_in_unit(const coord_t *coord, loff_t off)
+{
+ reiser4_key unit_key;
+ __u64 unit_off;
+ reiser4_extent *ext;
+
+ ext = extent_by_coord(coord);
+
+ unit_key_extent(coord, &unit_key);
+ unit_off = get_key_offset(&unit_key);
+ if (off < unit_off)
+ return 0;
+ if (off >= (unit_off + (current_blocksize * extent_get_width(ext))))
+ return 0;
+ return 1;
+}
+
+static int coord_matches_key_extent(struct inode *inode,
+ const coord_t *coord,
+ const reiser4_key *key)
+{
+ reiser4_key item_key;
+ item_plugin *iplug;
+
+ assert("vs-771", coord_is_existing_unit(coord));
+ assert("edward-2090", item_is_extent(coord));
+
+ iplug = item_plugin_by_coord(coord);
+ /*
+ * check that in simple volumes logical order coincides with
+ * physical order
+ */
+ assert("vs-1258",
+ ergo(current_vol_plug() ==
+ volume_plugin_by_id(SIMPLE_VOLUME_ID),
+ keylt(key, iplug->s.file.append_key(coord, &item_key))));
+ assert("vs-1259",
+ ergo(current_vol_plug() ==
+ volume_plugin_by_id(SIMPLE_VOLUME_ID),
+ keyge(key, item_key_by_coord(coord, &item_key))));
+
+ return offset_is_in_unit(coord, get_key_offset(key));
+}
+#endif
+
+static int can_append(const reiser4_key *key, const coord_t *coord)
+{
+ reiser4_key append_key;
+
+ return keyeq(key, append_key_extent(coord, &append_key));
+}
+
+static int append_hole_unix_file(coord_t *coord, lock_handle *lh,
+ const reiser4_key *key)
+{
+ reiser4_key append_key;
+ reiser4_block_nr hole_width;
+ reiser4_extent *ext, new_ext;
+ reiser4_item_data idata;
+
+ /* last item of file may have to be appended with hole */
+ assert("vs-708", znode_get_level(coord->node) == TWIG_LEVEL);
+ assert("vs-714", item_id_by_coord(coord) == EXTENT40_POINTER_ID);
+ /*
+ * construct key of first byte which is not addressed by the
+ * last extent
+ */
+ append_key_extent(coord, &append_key);
+ assert("edward-2324", keyle(&append_key, key));
+ /*
+ * extent item has to be appended with hole. Calculate length of that
+ * hole
+ */
+ hole_width = ((get_key_offset(key) - get_key_offset(&append_key) +
+ current_blocksize - 1) >> current_blocksize_bits);
+ assert("vs-954", hole_width > 0);
+
+ /* set coord after last unit */
+ coord_init_after_item_end(coord);
+
+ /* get last extent in the item */
+ ext = extent_by_coord(coord);
+ if (state_of_extent(ext) == HOLE_EXTENT) {
+ /*
+ * last extent of a file is hole extent. Widen that extent by
+ * @hole_width blocks. Note that we do not worry about
+ * overflowing - extent width is 64 bits
+ */
+ reiser4_set_extent(get_meta_subvol(), ext, HOLE_EXTENT_START,
+ extent_get_width(ext) + hole_width);
+ znode_make_dirty(coord->node);
+ return 0;
+ }
+ /*
+ * append last item of the file with hole extent unit
+ */
+ assert("vs-713", (state_of_extent(ext) == ALLOCATED_EXTENT ||
+ state_of_extent(ext) == UNALLOCATED_EXTENT));
+
+ reiser4_set_extent(get_meta_subvol(), &new_ext,
+ HOLE_EXTENT_START, hole_width);
+ init_new_extent(EXTENT40_POINTER_ID, &idata, &new_ext, 1);
+ return insert_into_item(coord, lh, &append_key, &idata, 0);
+}
+
+/**
+ * Debugging check: verify that the long-term locked twig node @twig covers
+ * the key range of @count consecutive pages starting at @key.
+ * @twig: long-term locked twig node
+ */
+void check_jnodes(znode *twig, const reiser4_key *key, int count)
+{
+ coord_t coord;
+ reiser4_key node_key, jnode_key;
+
+ if (current_vol_plug() != volume_plugin_by_id(SIMPLE_VOLUME_ID))
+ return;
+
+ jnode_key = *key;
+
+ assert("edward-2094", twig != NULL);
+ assert("edward-2095", znode_get_level(twig) == TWIG_LEVEL);
+ assert("edward-2096", znode_is_write_locked(twig));
+
+ zload(twig);
+ /* get the smallest key in twig node */
+ coord_init_first_unit(&coord, twig);
+ assert("edward-2097",
+ item_is_extent(&coord) || item_is_internal(&coord));
+
+ unit_key_by_coord(&coord, &node_key);
+ assert("edward-2098", keyle(&node_key, &jnode_key));
+
+ /* get the greatest key in the twig node */
+ coord_init_last_unit(&coord, twig);
+ assert("edward-2099",
+ item_is_extent(&coord) || item_is_internal(&coord));
+
+ unit_key_by_coord(&coord, &node_key);
+
+ if (item_is_extent(&coord))
+ item_plugin_by_coord(&coord)->s.file.append_key(&coord,
+ &node_key);
+ set_key_offset(&jnode_key,
+ get_key_offset(&jnode_key) +
+ (loff_t)count * PAGE_SIZE - 1);
+ assert("edward-2100", keylt(&jnode_key, &node_key));
+ zrelse(twig);
+}
+
+/**
+ * append the last file item
+ * @uf_coord: coord to start insertion from
+ * @key: key of the first byte to be written (matches the first jnode)
+ * @jnodes: array of jnodes
+ * @count: number of jnodes in the array
+ *
+ * There is already at least one extent item of the file in the tree.
+ * Append the last of them with an unallocated extent unit of width @count.
+ * Assign fake block numbers to the jnodes corresponding to the inserted
+ * extent.
+ */
+static int append_last_extent(uf_coord_t *uf_coord, const reiser4_key *key,
+ jnode **jnodes, int count)
+{
+ int result;
+ reiser4_extent new_ext;
+ reiser4_item_data idata;
+ coord_t *coord;
+ struct extent_coord_extension *ext_coord;
+ reiser4_extent *ext;
+ reiser4_block_nr block;
+ jnode *node;
+ int i;
+ struct atom_brick_info *abi;
+
+ coord = &uf_coord->coord;
+ ext_coord = &uf_coord->extension.extent;
+ ext = ext_by_ext_coord(uf_coord);
+
+ /* check correctness of position in the item */
+ assert("vs-228", coord->unit_pos == coord_last_unit_pos(coord));
+ assert("vs-1311", coord->between == AFTER_UNIT);
+ assert("vs-1302", ext_coord->pos_in_unit == ext_coord->width - 1);
+
+ if (!can_append(key, coord)) {
+ /* hole extent has to be inserted */
+ result = append_hole_unix_file(coord, uf_coord->lh, key);
+ uf_coord->valid = 0;
+ return result;
+ }
+ if (count == 0)
+ return 0;
+ assert("", get_key_offset(key) == (loff_t)index_jnode(jnodes[0]) * PAGE_SIZE);
+
+ inode_add_blocks(mapping_jnode(jnodes[0])->host, count);
+
+ switch (state_of_extent(ext)) {
+ case UNALLOCATED_EXTENT:
+ /*
+ * last extent unit of the file is unallocated one
+ * Increase its width by @count
+ */
+ reiser4_set_extent(get_meta_subvol(), ext,
+ UNALLOCATED_EXTENT_START,
+ extent_get_width(ext) + count);
+ znode_make_dirty(coord->node);
+
+ /* update coord extension */
+ ext_coord->width += count;
+ ext_coord->pos_in_unit += count;
+ ON_DEBUG(extent_set_width(get_meta_subvol(),
+ &uf_coord->extension.extent.extent,
+ ext_coord->width));
+ break;
+ case HOLE_EXTENT:
+ case ALLOCATED_EXTENT:
+ /*
+ * last extent unit of the file is either hole or allocated
+ * one. Append one unallocated extent of width @count
+ */
+ reiser4_set_extent(get_meta_subvol(), &new_ext,
+ UNALLOCATED_EXTENT_START, count);
+ init_new_extent(EXTENT40_POINTER_ID, &idata, &new_ext, 1);
+ result = insert_into_item(coord, uf_coord->lh, key, &idata, 0);
+ uf_coord->valid = 0;
+ if (result)
+ return result;
+ break;
+
+ default:
+ return RETERR(-EIO);
+ }
+ /*
+ * make sure that we hold long term locked twig node containing all
+ * jnodes we are about to capture
+ */
+ ON_DEBUG(check_jnodes(uf_coord->lh->node, key, count));
+ /*
+	 * assign fake block numbers to all jnodes. FIXME: check whether the
+	 * twig node containing the inserted extent item is locked
+ */
+ result = check_insert_atom_brick_info(get_meta_subvol()->id, &abi);
+ if (result)
+ return result;
+
+ for (i = 0; i < count; i ++) {
+ node = jnodes[i];
+ block = fake_blocknr_unformatted(1, get_meta_subvol());
+ spin_lock_jnode(node);
+ JF_SET(node, JNODE_CREATED);
+
+ jnode_set_subvol(node, get_meta_subvol());
+ jnode_set_block(node, &block);
+
+ result = reiser4_try_capture(node, ZNODE_WRITE_LOCK, 0);
+ BUG_ON(result != 0);
+ jnode_make_dirty_locked(node);
+ spin_unlock_jnode(node);
+ }
+ return count;
+}
+
+static int insert_first_hole(coord_t *coord, lock_handle *lh,
+ const reiser4_key *key)
+{
+ reiser4_extent new_ext;
+ reiser4_item_data idata;
+ reiser4_key item_key;
+ reiser4_block_nr hole_width;
+
+ /* @coord must be set for inserting of new item */
+ assert("vs-711", coord_is_between_items(coord));
+
+ item_key = *key;
+ set_key_offset(&item_key, 0ull);
+
+ hole_width = ((get_key_offset(key) + current_blocksize - 1) >>
+ current_blocksize_bits);
+ assert("vs-710", hole_width > 0);
+ /*
+ * compose body of hole extent and insert item into tree
+ */
+ reiser4_set_extent(get_meta_subvol(), &new_ext,
+ HOLE_EXTENT_START, hole_width);
+ init_new_extent(EXTENT40_POINTER_ID, &idata, &new_ext, 1);
+ return insert_extent_by_coord(coord, &idata, &item_key, lh);
+}
+
+
+/**
+ * insert the first file item
+ * @uf_coord: coord to start insertion from
+ * @key: key of the first byte to be written
+ * @jnodes: array of jnodes
+ * @count: number of jnodes in the array
+ * @inode: inode of the file
+ *
+ * There are no items of file @inode in the tree yet. Insert an unallocated
+ * extent of width @count into the tree, or a hole extent if the write does
+ * not start at the beginning of the file. Assign fake block numbers to the
+ * jnodes corresponding to the inserted unallocated extent. Returns the
+ * number of jnodes or an error code.
+ */
+static int insert_first_extent(uf_coord_t *uf_coord, const reiser4_key *key,
+ jnode **jnodes, int count, struct inode *inode)
+{
+ int result;
+ int i;
+ reiser4_extent new_ext;
+ reiser4_item_data idata;
+ reiser4_block_nr block;
+ struct unix_file_info *uf_info;
+ jnode *node;
+ struct atom_brick_info *abi;
+
+ /* first extent insertion starts at leaf level */
+ assert("vs-719", znode_get_level(uf_coord->coord.node) == LEAF_LEVEL);
+ assert("vs-711", coord_is_between_items(&uf_coord->coord));
+
+ if (get_key_offset(key) != 0) {
+ result = insert_first_hole(&uf_coord->coord, uf_coord->lh, key);
+ uf_coord->valid = 0;
+ uf_info = unix_file_inode_data(inode);
+
+ /*
+ * first item insertion is only possible when writing to empty
+ * file or performing tail conversion
+ */
+ assert("", (uf_info->container == UF_CONTAINER_EMPTY ||
+ (reiser4_inode_get_flag(inode,
+ REISER4_PART_MIXED) &&
+ reiser4_inode_get_flag(inode,
+ REISER4_PART_IN_CONV))));
+
+ /* if file was empty - update its state */
+ if (result == 0 && uf_info->container == UF_CONTAINER_EMPTY)
+ uf_info->container = UF_CONTAINER_EXTENTS;
+ return result;
+ }
+ if (count == 0)
+ return 0;
+
+ inode_add_blocks(mapping_jnode(jnodes[0])->host, count);
+
+ /*
+ * prepare for tree modification: compose body of item and item data
+ * structure needed for insertion
+ */
+ reiser4_set_extent(get_meta_subvol(), &new_ext,
+ UNALLOCATED_EXTENT_START, count);
+ init_new_extent(EXTENT40_POINTER_ID, &idata, &new_ext, 1);
+
+ /* insert extent item into the tree */
+ result = insert_extent_by_coord(&uf_coord->coord, &idata, key,
+ uf_coord->lh);
+ if (result)
+ return result;
+
+ /*
+ * make sure that we hold long term locked twig node containing all
+ * jnodes we are about to capture
+ */
+ ON_DEBUG(check_jnodes(uf_coord->lh->node, key, count));
+ /*
+ * assign fake block numbers to all jnodes, capture and mark them dirty
+ */
+ result = check_insert_atom_brick_info(get_meta_subvol()->id, &abi);
+ if (result)
+ return result;
+
+ block = fake_blocknr_unformatted(count, get_meta_subvol());
+ for (i = 0; i < count; i ++, block ++) {
+ node = jnodes[i];
+ spin_lock_jnode(node);
+ JF_SET(node, JNODE_CREATED);
+ /*
+ * unix file plugin stores everything in meta-data brick
+ */
+ jnode_set_subvol(node, get_meta_subvol());
+ jnode_set_block(node, &block);
+
+ result = reiser4_try_capture(node, ZNODE_WRITE_LOCK, 0);
+ BUG_ON(result != 0);
+ jnode_make_dirty_locked(node);
+ spin_unlock_jnode(node);
+ }
+
+ /*
+	 * invalidate the coordinate; a re-search must be performed to continue
+	 * because the write will continue on the twig level
+ */
+ uf_coord->valid = 0;
+ return count;
+}
+
+/**
+ * replace a hole extent with an unallocated extent and holes
+ * @uf_coord: coordinate of the hole extent unit
+ * @key: key of the block being written
+ * @how: returns which of the plugging cases was taken (used for debugging)
+ *
+ * Creates an unallocated extent of width 1 within a hole. In the worst case
+ * two additional extents can be created.
+ */
+static int plug_hole(uf_coord_t *uf_coord, const reiser4_key *key, int *how)
+{
+ struct replace_handle rh;
+ reiser4_extent *ext;
+ reiser4_block_nr width, pos_in_unit;
+ coord_t *coord;
+ struct extent_coord_extension *ext_coord;
+ int return_inserted_position;
+
+ check_uf_coord(uf_coord, key);
+
+ rh.coord = coord_by_uf_coord(uf_coord);
+ rh.lh = uf_coord->lh;
+ rh.flags = 0;
+
+ coord = coord_by_uf_coord(uf_coord);
+ ext_coord = ext_coord_by_uf_coord(uf_coord);
+ ext = ext_by_ext_coord(uf_coord);
+
+ width = ext_coord->width;
+ pos_in_unit = ext_coord->pos_in_unit;
+
+ *how = 0;
+ if (width == 1) {
+ reiser4_set_extent(get_meta_subvol(), ext,
+ UNALLOCATED_EXTENT_START, 1);
+ znode_make_dirty(coord->node);
+ /* update uf_coord */
+ ON_DEBUG(ext_coord->extent = *ext);
+ *how = 1;
+ return 0;
+ } else if (pos_in_unit == 0) {
+ /* we deal with first element of extent */
+ if (coord->unit_pos) {
+ /* there is an extent to the left */
+ if (state_of_extent(ext - 1) == UNALLOCATED_EXTENT) {
+ /*
+ * left neighboring unit is an unallocated
+ * extent. Increase its width and decrease
+ * width of hole
+ */
+ extent_set_width(get_meta_subvol(), ext - 1,
+ extent_get_width(ext - 1) + 1);
+ extent_set_width(get_meta_subvol(), ext,
+ width - 1);
+ znode_make_dirty(coord->node);
+
+ /* update coord extension */
+ coord->unit_pos--;
+ ext_coord->width = extent_get_width(ext - 1);
+ ext_coord->pos_in_unit = ext_coord->width - 1;
+ ext_coord->ext_offset -= sizeof(reiser4_extent);
+ ON_DEBUG(ext_coord->extent =
+ *extent_by_coord(coord));
+ *how = 2;
+ return 0;
+ }
+ }
+ /* extent for replace */
+ reiser4_set_extent(get_meta_subvol(), &rh.overwrite,
+ UNALLOCATED_EXTENT_START, 1);
+ /* extent to be inserted */
+ reiser4_set_extent(get_meta_subvol(), &rh.new_extents[0],
+ HOLE_EXTENT_START,
+ width - 1);
+ rh.nr_new_extents = 1;
+
+ /* have replace_extent_unit() to return with @coord and
+ @uf_coord->lh set to unit which was replaced */
+ return_inserted_position = 0;
+ *how = 3;
+ } else if (pos_in_unit == width - 1) {
+ /* we deal with last element of extent */
+ if (coord->unit_pos < nr_units_extent(coord) - 1) {
+ /* there is an extent unit to the right */
+ if (state_of_extent(ext + 1) == UNALLOCATED_EXTENT) {
+ /*
+ * right neighboring unit is an unallocated
+ * extent. Increase its width and decrease
+ * width of hole
+ */
+ extent_set_width(get_meta_subvol(),
+ ext + 1,
+ extent_get_width(ext + 1) + 1);
+ extent_set_width(get_meta_subvol(),
+ ext, width - 1);
+ znode_make_dirty(coord->node);
+
+ /* update coord extension */
+ coord->unit_pos++;
+ ext_coord->width = extent_get_width(ext + 1);
+ ext_coord->pos_in_unit = 0;
+ ext_coord->ext_offset += sizeof(reiser4_extent);
+ ON_DEBUG(ext_coord->extent =
+ *extent_by_coord(coord));
+ *how = 4;
+ return 0;
+ }
+ }
+ /* extent for replace */
+ reiser4_set_extent(get_meta_subvol(), &rh.overwrite,
+ HOLE_EXTENT_START, width - 1);
+ /* extent to be inserted */
+ reiser4_set_extent(get_meta_subvol(), &rh.new_extents[0],
+ UNALLOCATED_EXTENT_START, 1);
+ rh.nr_new_extents = 1;
+
+ /* have replace_extent_unit() to return with @coord and
+ @uf_coord->lh set to unit which was inserted */
+ return_inserted_position = 1;
+ *how = 5;
+ } else {
+ /* extent for replace */
+ reiser4_set_extent(get_meta_subvol(), &rh.overwrite,
+ HOLE_EXTENT_START, pos_in_unit);
+ /* extents to be inserted */
+ reiser4_set_extent(get_meta_subvol(), &rh.new_extents[0],
+ UNALLOCATED_EXTENT_START, 1);
+ reiser4_set_extent(get_meta_subvol(), &rh.new_extents[1],
+ HOLE_EXTENT_START, width - pos_in_unit - 1);
+ rh.nr_new_extents = 2;
+
+ /* have replace_extent_unit() to return with @coord and
+ @uf_coord->lh set to first of units which were inserted */
+ return_inserted_position = 1;
+ *how = 6;
+ }
+ unit_key_by_coord(coord, &rh.paste_key);
+ set_key_offset(&rh.paste_key, get_key_offset(&rh.paste_key) +
+ extent_get_width(&rh.overwrite) * current_blocksize);
+
+ uf_coord->valid = 0;
+ return replace_extent_unit(EXTENT40_POINTER_ID,
+ &rh, return_inserted_position);
+}
+
+/**
+ * If @node corresponds to hole extent - create unallocated extent for it and
+ * assign fake block number. If @node corresponds to allocated extent - assign
+ * block number of jnode
+ */
+static int overwrite_one_block(uf_coord_t *uf_coord, const reiser4_key *key,
+ jnode *node, int *hole_plugged)
+{
+ int result;
+ struct extent_coord_extension *ext_coord;
+ reiser4_extent *ext;
+ reiser4_block_nr block;
+ int how;
+
+ assert("vs-1312", uf_coord->coord.between == AT_UNIT);
+
+ result = 0;
+ ext_coord = ext_coord_by_uf_coord(uf_coord);
+ check_uf_coord(uf_coord, NULL);
+ ext = ext_by_ext_coord(uf_coord);
+ assert("", state_of_extent(ext) != UNALLOCATED_EXTENT);
+
+ switch (state_of_extent(ext)) {
+ case ALLOCATED_EXTENT:
+ assert("edward-2215", node->subvol == get_meta_subvol());
+
+ block = extent_get_start(ext) + ext_coord->pos_in_unit;
+ break;
+
+ case HOLE_EXTENT:
+ assert("edward-2216", node->subvol == NULL);
+
+ inode_add_blocks(mapping_jnode(node)->host, 1);
+ result = plug_hole(uf_coord, key, &how);
+ if (result)
+ return result;
+ block = fake_blocknr_unformatted(1, get_meta_subvol());
+ if (hole_plugged)
+ *hole_plugged = 1;
+ JF_SET(node, JNODE_CREATED);
+ jnode_set_subvol(node, get_meta_subvol());
+ break;
+
+ default:
+ return RETERR(-EIO);
+ }
+
+ jnode_set_block(node, &block);
+ return 0;
+}
+
+/**
+ * move_coord - move coordinate forward
+ * @uf_coord:
+ *
+ * Move the coordinate one data block pointer forward. Return 1 if the end of
+ * the item has been reached or the coordinate is invalid, 0 otherwise.
+ */
+static int move_coord(uf_coord_t *uf_coord)
+{
+ struct extent_coord_extension *ext_coord;
+
+ if (uf_coord->valid == 0)
+ return 1;
+ ext_coord = &uf_coord->extension.extent;
+ ext_coord->pos_in_unit ++;
+ if (ext_coord->pos_in_unit < ext_coord->width)
+ /* coordinate moved within the unit */
+ return 0;
+
+ /* end of unit is reached. Try to move to next unit */
+ ext_coord->pos_in_unit = 0;
+ uf_coord->coord.unit_pos ++;
+ if (uf_coord->coord.unit_pos < ext_coord->nr_units) {
+ /* coordinate moved to next unit */
+ ext_coord->ext_offset += sizeof(reiser4_extent);
+ ext_coord->width =
+ extent_get_width(ext_by_offset
+ (uf_coord->coord.node,
+ ext_coord->ext_offset));
+ ON_DEBUG(ext_coord->extent =
+ *ext_by_offset(uf_coord->coord.node,
+ ext_coord->ext_offset));
+ return 0;
+ }
+ /* end of item is reached */
+ uf_coord->valid = 0;
+ return 1;
+}
+
+/**
+ * Process @count logical blocks of a file.
+ * Returns number of handled jnodes.
+ */
+static int overwrite_extent(uf_coord_t *uf_coord, const reiser4_key *key,
+ jnode **jnodes, int count, int *plugged_hole)
+{
+ int result;
+ reiser4_key k;
+ int i;
+ jnode *node;
+ struct atom_brick_info *abi;
+ reiser4_subvol *subv = get_meta_subvol();
+
+ result = check_insert_atom_brick_info(subv->id, &abi);
+ if (result)
+ return result;
+
+ k = *key;
+ for (i = 0; i < count; i ++) {
+ node = jnodes[i];
+ if (*jnode_get_block(node) == 0) {
+ result = overwrite_one_block(uf_coord, &k,
+ node, plugged_hole);
+ if (result)
+ return result;
+ }
+ /*
+ * make sure that we hold long term locked twig node containing
+ * all jnodes we are about to capture
+ */
+ ON_DEBUG(check_jnodes(uf_coord->lh->node, &k, 1));
+ /*
+ * assign fake block numbers to all jnodes, capture and mark
+ * them dirty
+ */
+ spin_lock_jnode(node);
+ result = reiser4_try_capture(node, ZNODE_WRITE_LOCK, 0);
+ BUG_ON(result != 0);
+ jnode_make_dirty_locked(node);
+ spin_unlock_jnode(node);
+
+ if (uf_coord->valid == 0)
+ return i + 1;
+
+ check_uf_coord(uf_coord, &k);
+
+ if (move_coord(uf_coord)) {
+ /*
+			 * failed to move to the next block pointer. Either the
+			 * end of the file or the end of the twig node has been
+			 * reached. In the latter case we might go to the right
+			 * neighbor.
+ */
+ uf_coord->valid = 0;
+ return i + 1;
+ }
+ /* update key for next iteration */
+
+ set_key_offset(&k, get_key_offset(&k) + PAGE_SIZE);
+ }
+ return count;
+}
+
+int update_extent_unix_file(struct inode *inode, jnode *node,
+ loff_t pos, int *plugged_hole)
+{
+ int result;
+ znode *loaded;
+ uf_coord_t uf_coord;
+ coord_t *coord;
+ lock_handle lh;
+ reiser4_key key;
+
+ assert("", reiser4_lock_counters()->d_refs == 0);
+
+ build_body_key_unix_file(inode, pos, &key);
+
+ init_uf_coord(&uf_coord, &lh);
+ coord = &uf_coord.coord;
+ result = find_file_item_nohint(coord, &lh, &key,
+ ZNODE_WRITE_LOCK, inode);
+ if (IS_CBKERR(result)) {
+ assert("", reiser4_lock_counters()->d_refs == 0);
+ return result;
+ }
+
+ result = zload(coord->node);
+ BUG_ON(result != 0);
+ loaded = coord->node;
+
+ if (coord->between == AFTER_UNIT) {
+ /*
+ * append existing extent item with unallocated extent
+ */
+ init_coord_extension_extent(&uf_coord,
+ get_key_offset(&key));
+ result = append_last_extent(&uf_coord, &key, &node, 1);
+ } else if (coord->between == AT_UNIT) {
+ /*
+ * overwrite existing extent
+		 * FIXME: not optimal yet. Will be optimized if the new
+		 * write path shows a performance win.
+ */
+ init_coord_extension_extent(&uf_coord,
+ get_key_offset(&key));
+ result = overwrite_extent(&uf_coord, &key,
+ &node, 1, plugged_hole);
+ } else {
+ /*
+ * there are no items of this file in the tree yet.
+ * Create first item of the file inserting one
+ * unallocated extent
+ */
+ result = insert_first_extent(&uf_coord, &key, &node, 1, inode);
+ }
+ assert("edward-2048", result == 1 || result < 0);
+
+ zrelse(loaded);
+ done_lh(&lh);
+ assert("edward-2049", reiser4_lock_counters()->d_refs == 0);
+
+ return (result == 1) ? 0 : result;
+}
+
+static int update_extents_unix_file(struct file *file, struct inode *inode,
+ jnode **jnodes, int count, loff_t pos)
+{
+ struct hint hint;
+ reiser4_key key;
+ int result;
+ znode *loaded;
+
+ result = load_file_hint(file, &hint);
+ BUG_ON(result != 0);
+
+ if (count != 0)
+ /*
+ * count == 0 is special case: expanding truncate
+ */
+ pos = (loff_t)index_jnode(jnodes[0]) << PAGE_SHIFT;
+ build_body_key_unix_file(inode, pos, &key);
+
+ assert("", reiser4_lock_counters()->d_refs == 0);
+
+ do {
+ result = find_file_item(&hint, &key, ZNODE_WRITE_LOCK, inode);
+ if (IS_CBKERR(result)) {
+ assert("", reiser4_lock_counters()->d_refs == 0);
+ return result;
+ }
+
+ result = zload(hint.ext_coord.coord.node);
+ BUG_ON(result != 0);
+ loaded = hint.ext_coord.coord.node;
+
+ if (hint.ext_coord.coord.between == AFTER_UNIT) {
+ /*
+ * append existing extent item with unallocated extent
+ * of width nr_jnodes
+ */
+ if (hint.ext_coord.valid == 0)
+ /* NOTE: get statistics on this */
+ init_coord_extension_extent(&hint.ext_coord,
+ get_key_offset(&key));
+ result = append_last_extent(&hint.ext_coord,
+ &key, jnodes, count);
+ } else if (hint.ext_coord.coord.between == AT_UNIT) {
+ /*
+ * overwrite
+			 * not optimal yet. Will be optimized if the new write
+			 * path shows a performance win.
+ */
+ if (hint.ext_coord.valid == 0)
+ /* NOTE: get statistics on this */
+ init_coord_extension_extent(&hint.ext_coord,
+ get_key_offset(&key));
+ result = overwrite_extent(&hint.ext_coord, &key,
+ jnodes, count, NULL);
+ } else {
+ /*
+			 * there are no items of this file in the tree
+			 * yet. Create the first item of the file by inserting
+			 * one unallocated extent of width nr_jnodes
+ */
+ result = insert_first_extent(&hint.ext_coord, &key,
+ jnodes, count, inode);
+ }
+ zrelse(loaded);
+ if (result < 0) {
+ done_lh(hint.ext_coord.lh);
+ break;
+ }
+
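+		/*
+		 * a non-negative result is the number of jnodes handled;
+		 * advance the jnode array and the key accordingly and
+		 * continue until all jnodes are processed
+		 */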
+ jnodes += result;
+ count -= result;
+ set_key_offset(&key, get_key_offset(&key) + result * PAGE_SIZE);
+
+ /* seal and unlock znode */
+ if (hint.ext_coord.valid)
+ reiser4_set_hint(&hint, &key, ZNODE_WRITE_LOCK);
+ else
+ reiser4_unset_hint(&hint);
+
+ } while (count > 0);
+
+ save_file_hint(file, &hint);
+ assert("", reiser4_lock_counters()->d_refs == 0);
+ return result;
+}
+
+/**
+ * Estimate and reserve space for extent write operation
+ * @inode: inode of the file to write to;
+ * @count: number of pages to be written.
+ */
+static int reserve_write_extent(struct inode *inode, int count)
+{
+ reiser4_subvol *subv = get_meta_subvol();
+ reiser4_tree *tree = &subv->tree;
+ /*
+ * to write @count pages to a file by extents we have to reserve disk
+ * space for:
+ *
+ * 1. find_file_item() may have to insert empty node to the tree
+ * (empty leaf node between two extent items). This requires:
+ * (a) 1 block for the leaf node;
+ * (b) number of formatted blocks which are necessary to perform
+ * insertion of an internal item into twig level.
+ *
+ * 2. for each of written pages there might be needed:
+ * (a) 1 unformatted block for the page itself;
+ * (b) number of blocks which might be necessary to insert or
+ * paste to an extent item.
+ *
+ * 3. stat data update
+ */
+ grab_space_enable();
+ return reiser4_grab_space(count /* for 2(a) */ +
+ estimate_one_insert_item(tree) +
+ count * estimate_one_insert_into_item(tree) +
+ estimate_one_insert_item(tree), BA_CAN_COMMIT, subv);
+}
+
+/*
+ * filemap_copy_from_user no longer exists in generic code, because it
+ * is deadlock-prone (copying from user while holding the page lock is bad).
+ * As a temporary fix for reiser4, just define it here.
+ */
+size_t filemap_copy_from_user(struct page *page, unsigned long offset,
+ const char __user *buf, unsigned bytes)
+{
+ char *kaddr;
+ int left;
+
+ kaddr = kmap_atomic(page);
+ left = __copy_from_user_inatomic(kaddr + offset, buf, bytes);
+ kunmap_atomic(kaddr);
+
+ if (left != 0) {
+ /* Do it the slow way */
+ kaddr = kmap(page);
+ left = __copy_from_user(kaddr + offset, buf, bytes);
+ kunmap(page);
+ }
+ return bytes - left;
+}
+
+/**
+ * @file: file to write to
+ * @inode: inode of the file
+ * @buf: address of user-space buffer
+ * @count: number of bytes to write
+ * @pos: position in the file to write to
+ */
+ssize_t write_extent_unix_file(struct file *file, struct inode *inode,
+ const char __user *buf, size_t count,
+ loff_t *pos)
+{
+ int have_to_update_extent;
+ int nr_pages;
+ int nr_dirty = 0;
+ struct page *page;
+ jnode *jnodes[DEFAULT_WRITE_GRANULARITY + 1];
+ unsigned long index;
+ unsigned long end;
+ int i;
+ int to_page, page_off;
+ size_t written;
+ size_t left = count;
+ int result = 0;
+ /*
+ * calculate number of pages which are to be written
+ */
+ index = *pos >> PAGE_SHIFT;
+ end = ((*pos + count - 1) >> PAGE_SHIFT);
+ nr_pages = end - index + 1;
+ assert("edward-2293", nr_pages <= DEFAULT_WRITE_GRANULARITY + 1);
+
+ if (reserve_write_extent(inode, nr_pages))
+ return RETERR(-ENOSPC);
+
+ if (count == 0) {
+ /* case of expanding truncate */
+ update_extents_unix_file(file, inode, jnodes, 0, *pos);
+ return 0;
+ }
+ BUG_ON(get_current_context()->trans->atom != NULL);
+
+ /* get pages and jnodes */
+ for (i = 0; i < nr_pages; i ++) {
+ page = find_or_create_page(inode->i_mapping, index + i,
+ reiser4_ctx_gfp_mask_get());
+ if (page == NULL) {
+ nr_pages = i;
+ result = RETERR(-ENOMEM);
+ goto out;
+ }
+ jnodes[i] = jnode_of_page(page);
+ if (IS_ERR(jnodes[i])) {
+ unlock_page(page);
+ put_page(page);
+ nr_pages = i;
+ result = RETERR(-ENOMEM);
+ goto out;
+ }
+ /* prevent jnode and page from disconnecting */
+ JF_SET(jnodes[i], JNODE_WRITE_PREPARED);
+ unlock_page(page);
+ }
+ BUG_ON(get_current_context()->trans->atom != NULL);
+
+ have_to_update_extent = 0;
+
+ page_off = (*pos & (PAGE_SIZE - 1));
+ for (i = 0; i < nr_pages; i ++) {
+ to_page = PAGE_SIZE - page_off;
+ if (to_page > left)
+ to_page = left;
+ page = jnode_page(jnodes[i]);
+ if (page_offset(page) < inode->i_size &&
+ !PageUptodate(page) && to_page != PAGE_SIZE) {
+ /*
+			 * the above is not optimal for a partial write to the
+			 * last page of the file when the file size is not at
+			 * a page boundary
+ */
+ lock_page(page);
+ if (!PageUptodate(page)) {
+ result = readpage_unix_file(NULL, page);
+ assert("edward-2050", result == 0);
+ BUG_ON(result != 0);
+ /* wait for read completion */
+ lock_page(page);
+ BUG_ON(!PageUptodate(page));
+ } else
+ result = 0;
+ unlock_page(page);
+ }
+
+ BUG_ON(get_current_context()->trans->atom != NULL);
+ fault_in_pages_readable(buf, to_page);
+ BUG_ON(get_current_context()->trans->atom != NULL);
+
+ lock_page(page);
+ if (!PageUptodate(page) && to_page != PAGE_SIZE)
+ zero_user_segments(page, 0, page_off,
+ page_off + to_page,
+ PAGE_SIZE);
+
+ written = filemap_copy_from_user(page, page_off, buf, to_page);
+ if (unlikely(written != to_page)) {
+ unlock_page(page);
+ result = RETERR(-EFAULT);
+ break;
+ }
+
+ flush_dcache_page(page);
+ set_page_dirty_notag(page);
+ unlock_page(page);
+ nr_dirty++;
+
+ mark_page_accessed(page);
+ SetPageUptodate(page);
+
+ if (jnodes[i]->blocknr == 0)
+ have_to_update_extent ++;
+
+ page_off = 0;
+ buf += to_page;
+ left -= to_page;
+ BUG_ON(get_current_context()->trans->atom != NULL);
+ }
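+	/*
+	 * if at least one of the dirtied pages has no block number assigned
+	 * yet, the extent items have to be created or extended; otherwise
+	 * just capture the jnodes and mark them dirty
+	 */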
+ if (have_to_update_extent) {
+ result = update_extents_unix_file(file, inode,
+ jnodes, nr_dirty, *pos);
+
+ assert("edward-2278", result == -ENOSPC || result >= 0);
+ if (result < 0)
+ goto out;
+ } else {
+ for (i = 0; i < nr_dirty; i ++) {
+ struct atom_brick_info *abi;
+ assert("edward-1983", jnodes[i]->subvol != NULL);
+
+ spin_lock_jnode(jnodes[i]);
+ result = reiser4_try_capture(jnodes[i],
+ ZNODE_WRITE_LOCK, 0);
+ spin_unlock_jnode(jnodes[i]);
+ BUG_ON(result != 0);
+
+ result = check_insert_atom_brick_info(jnodes[i]->subvol->id,
+ &abi);
+ if (result)
+ goto out;
+
+ spin_lock_jnode(jnodes[i]);
+ jnode_make_dirty_locked(jnodes[i]);
+ spin_unlock_jnode(jnodes[i]);
+ }
+ }
+ out:
+ for (i = 0; i < nr_pages; i ++) {
+ put_page(jnode_page(jnodes[i]));
+ JF_CLR(jnodes[i], JNODE_WRITE_PREPARED);
+ jput(jnodes[i]);
+ }
+ /*
+	 * the only errors handled so far are ENOMEM and
+	 * EFAULT from copy_from_user
+ */
+ return (count - left) ? (count - left) : result;
+}
+
+int __reiser4_readpage_extent(const coord_t *coord,
+ reiser4_extent *ext, reiser4_block_nr pos,
+ struct page *page)
+{
+ jnode *j;
+ struct address_space *mapping;
+ unsigned long index;
+ oid_t oid;
+ int state;
+ reiser4_block_nr block;
+
+ mapping = page->mapping;
+ oid = get_inode_oid(mapping->host);
+ index = page->index;
+ state = (ext != NULL ? state_of_extent(ext) : HOLE_EXTENT);
+
+ switch (state) {
+ case HOLE_EXTENT:
+ j = jfind(mapping, index);
+ if (j == NULL) {
+ zero_user(page, 0, PAGE_SIZE);
+ SetPageUptodate(page);
+ unlock_page(page);
+ return 0;
+ }
+ spin_lock_jnode(j);
+ if (!jnode_page(j)) {
+ jnode_attach_page(j, page);
+ } else {
+ BUG_ON(jnode_page(j) != page);
+ assert("vs-1504", jnode_page(j) == page);
+ }
+ block = *jnode_get_io_block(j);
+ spin_unlock_jnode(j);
+ if (block == 0) {
+ zero_user(page, 0, PAGE_SIZE);
+ SetPageUptodate(page);
+ unlock_page(page);
+ jput(j);
+ return 0;
+ }
+ /*
+ * page was eflushed previously
+ * (currently eflush is not supported)
+ */
+ assert("edward-2213", 0);
+ break;
+ case ALLOCATED_EXTENT:
+ j = jnode_of_page(page);
+ if (IS_ERR(j))
+ return PTR_ERR(j);
+ /*
+ * set "IO address" - a pair (subvolume, block number)
+ */
+ if (j->subvol == NULL)
+ jnode_set_subvol(j, find_data_subvol(coord));
+ else
+ assert("edward-2217",
+ j->subvol == find_data_subvol(coord));
+
+ if (*jnode_get_block(j) == 0) {
+ reiser4_block_nr blocknr;
+
+ blocknr = extent_get_start(ext) + pos;
+ jnode_set_block(j, &blocknr);
+ } else
+ assert("vs-1403",
+ j->blocknr == extent_get_start(ext) + pos);
+ break;
+
+ case UNALLOCATED_EXTENT:
+ assert("edward-2214", 0);
+ j = jfind(mapping, index);
+ assert("nikita-2688", j);
+ assert("vs-1426", jnode_page(j) == NULL);
+
+ spin_lock_jnode(j);
+ jnode_attach_page(j, page);
+ spin_unlock_jnode(j);
+ break;
+
+ default:
+ warning("vs-957", "wrong extent\n");
+ return RETERR(-EIO);
+ }
+ BUG_ON(j == 0);
+ reiser4_page_io(page, j, READ, reiser4_ctx_gfp_mask_get());
+ jput(j);
+ return 0;
+}
+
+int read_extent_unix_file(struct file *file, flow_t *flow, hint_t *hint)
+{
+ int result;
+ struct page *page;
+ unsigned long page_idx;
+ unsigned long page_off; /* offset within the page to start read from */
+ unsigned long page_cnt; /* bytes which can be read from the page which
+ contains file_off */
+ struct address_space *mapping;
+ loff_t file_off; /* offset in a file to start read from */
+ uf_coord_t *uf_coord;
+ coord_t *coord;
+ struct extent_coord_extension *ext_coord;
+ char *kaddr;
+
+ assert("vs-1353", current_blocksize == PAGE_SIZE);
+ assert("vs-572", flow->user == 1);
+ assert("vs-1351", flow->length > 0);
+
+ uf_coord = &hint->ext_coord;
+
+ check_uf_coord(uf_coord, NULL);
+ assert("vs-33", uf_coord->lh == &hint->lh);
+
+ coord = &uf_coord->coord;
+ assert("vs-1119", znode_is_rlocked(coord->node));
+ assert("vs-1120", znode_is_loaded(coord->node));
+ assert("vs-1256", coord_matches_key_extent(file_inode(file),
+ coord, &flow->key));
+ mapping = file_inode(file)->i_mapping;
+ ext_coord = &uf_coord->extension.extent;
+
+ file_off = get_key_offset(&flow->key);
+ page_off = (unsigned long)(file_off & (PAGE_SIZE - 1));
+ page_cnt = PAGE_SIZE - page_off;
+
+ page_idx = (unsigned long)(file_off >> PAGE_SHIFT);
+
+	/* We start with the twig node read-locked. However, we do not want
+	   to keep that lock for the whole time readahead is working, so set
+	   a seal and release the twig node. */
+ reiser4_set_hint(hint, &flow->key, ZNODE_READ_LOCK);
+ /* &hint->lh is done-ed */
+
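+	/*
+	 * Copy the requested range to user space one page at a time. The
+	 * transaction is restarted before each page is read so that no atom
+	 * is held across the page cache read and the copy to user space.
+	 */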
+ do {
+ reiser4_txn_restart_current();
+ page = read_mapping_page(mapping, page_idx, file);
+ if (IS_ERR(page))
+ return PTR_ERR(page);
+ lock_page(page);
+ if (!PageUptodate(page)) {
+ unlock_page(page);
+ put_page(page);
+ warning("jmacd-97178",
+ "extent_read: page is not up to date");
+ return RETERR(-EIO);
+ }
+ mark_page_accessed(page);
+ unlock_page(page);
+
+ /* If users can be writing to this page using arbitrary virtual
+ addresses, take care about potential aliasing before reading
+ the page on the kernel side.
+ */
+ if (mapping_writably_mapped(mapping))
+ flush_dcache_page(page);
+
+ assert("nikita-3034", reiser4_schedulable());
+
+ /* number of bytes which are to be read from the page */
+ if (page_cnt > flow->length)
+ page_cnt = flow->length;
+
+ result = fault_in_pages_writeable(flow->data, page_cnt);
+ if (result) {
+ put_page(page);
+ return RETERR(-EFAULT);
+ }
+
+ kaddr = kmap_atomic(page);
+ result = __copy_to_user_inatomic(flow->data,
+ kaddr + page_off, page_cnt);
+ kunmap_atomic(kaddr);
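+		/*
+		 * The atomic copy may fail if the user page is not resident
+		 * (e.g. it was reclaimed after fault_in_pages_writeable());
+		 * fall back to a sleeping copy under a regular kmap().
+		 */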
+ if (result != 0) {
+ kaddr = kmap(page);
+ result = __copy_to_user(flow->data,
+ kaddr + page_off, page_cnt);
+ kunmap(page);
+ if (unlikely(result))
+ return RETERR(-EFAULT);
+ }
+ put_page(page);
+
+ /* increase (flow->key) offset,
+ * update (flow->data) user area pointer
+ */
+ move_flow_forward(flow, page_cnt);
+
+ page_off = 0;
+ page_idx++;
+
+ } while (flow->length);
+ return 0;
+}
+
+/**
+ * reiser4_read->unix_file_read->page_cache_readahead->
+ * ->reiser4_readpage_dispatch->readpage_unix_file->
+ * ->reiser4_readpage_extent
+ * or
+ * filemap_fault->reiser4_readpage_dispatch->readpage_unix_file->
+ * ->reiser4_readpage_extent
+ *
+ * At the beginning: coord->node is read locked, zloaded, page is
+ * locked, coord is set to existing unit inside of extent item (it
+ * is not necessary that coord matches to page->index)
+ */
+int reiser4_readpage_extent(void *vp, struct page *page)
+{
+ uf_coord_t *uf_coord = vp;
+ ON_DEBUG(coord_t * coord = &uf_coord->coord);
+ ON_DEBUG(reiser4_key key);
+
+ assert("vs-1040", PageLocked(page));
+ assert("vs-1050", !PageUptodate(page));
+ assert("vs-1039", page->mapping && page->mapping->host);
+
+ assert("vs-1044", znode_is_loaded(coord->node));
+ assert("vs-758", item_is_extent(coord));
+ assert("vs-1046", coord_is_existing_unit(coord));
+ assert("vs-1045", znode_is_rlocked(coord->node));
+ assert("vs-1047",
+ page->mapping->host->i_ino ==
+ get_key_objectid(item_key_by_coord(coord, &key)));
+ check_uf_coord(uf_coord, NULL);
+
+ return __reiser4_readpage_extent(&uf_coord->coord,
+ ext_by_ext_coord(uf_coord),
+ uf_coord->extension.extent.pos_in_unit,
+ page);
+}
+
+int get_block_address_extent(const coord_t *coord, sector_t block,
+ sector_t *result)
+{
+ reiser4_extent *ext;
+
+ if (!coord_is_existing_unit(coord))
+ return RETERR(-EINVAL);
+
+ ext = extent_by_coord(coord);
+
+ if (state_of_extent(ext) != ALLOCATED_EXTENT)
+ /* FIXME: bad things may happen if it is unallocated extent */
+ *result = 0;
+ else {
+ reiser4_key key;
+
+ unit_key_by_coord(coord, &key);
+ assert("vs-1645",
+ block >= get_key_offset(&key) >> current_blocksize_bits);
+ assert("vs-1646",
+ block <
+ (get_key_offset(&key) >> current_blocksize_bits) +
+ extent_get_width(ext));
+ *result =
+ extent_get_start(ext) + (block -
+ (get_key_offset(&key) >>
+ current_blocksize_bits));
+ }
+ return 0;
+}
+
+reiser4_key *append_key_extent(const coord_t *coord, reiser4_key *key)
+{
+ item_key_by_coord(coord, key);
+ set_key_offset(key, get_key_offset(key) + reiser4_extent_size(coord));
+
+ assert("vs-610", get_key_offset(key) &&
+ (get_key_offset(key) & (current_blocksize - 1)) == 0);
+ return key;
+}
+
+/* plugin->u.item.s.file.init_coord_extension */
+void init_coord_extension_extent(uf_coord_t * uf_coord, loff_t lookuped)
+{
+ coord_t *coord;
+ struct extent_coord_extension *ext_coord;
+ reiser4_key key;
+ loff_t offset;
+
+ assert("vs-1295", uf_coord->valid == 0);
+
+ coord = &uf_coord->coord;
+ assert("vs-1288", coord_is_iplug_set(coord));
+ assert("vs-1327", znode_is_loaded(coord->node));
+
+ if (coord->between != AFTER_UNIT && coord->between != AT_UNIT)
+ return;
+
+ ext_coord = &uf_coord->extension.extent;
+ ext_coord->nr_units = nr_units_extent(coord);
+ ext_coord->ext_offset =
+ (char *)extent_by_coord(coord) - zdata(coord->node);
+ ext_coord->width = extent_get_width(extent_by_coord(coord));
+ ON_DEBUG(ext_coord->extent = *extent_by_coord(coord));
+ uf_coord->valid = 1;
+
+ /* pos_in_unit is the only uninitialized field in extended coord */
+ if (coord->between == AFTER_UNIT) {
+ assert("vs-1330",
+ coord->unit_pos == nr_units_extent(coord) - 1);
+
+ ext_coord->pos_in_unit = ext_coord->width - 1;
+ } else {
+ /* AT_UNIT */
+ unit_key_by_coord(coord, &key);
+ offset = get_key_offset(&key);
+
+ assert("vs-1328", offset <= lookuped);
+ assert("vs-1329",
+ lookuped <
+ offset + ext_coord->width * current_blocksize);
+ ext_coord->pos_in_unit =
+ ((lookuped - offset) >> current_blocksize_bits);
+ }
+}
+
+/*
+ * Local variables:
+ * c-indentation-style: "K&R"
+ * mode-name: "LC"
+ * c-basic-offset: 8
+ * tab-width: 8
+ * fill-column: 79
+ * scroll-step: 1
+ * End:
+ */
diff -urN --no-dereference linux-5.10.2.orig/fs/reiser4/plugin/item/extent_flush_ops.c linux-5.10.2/fs/reiser4/plugin/item/extent_flush_ops.c
--- linux-5.10.2.orig/fs/reiser4/plugin/item/extent_flush_ops.c 1970-01-01 01:00:00.000000000 +0100
+++ linux-5.10.2/fs/reiser4/plugin/item/extent_flush_ops.c 2020-12-23 16:07:46.127813261 +0100
@@ -0,0 +1,759 @@
+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
+
+#include "item.h"
+#include "../../tree.h"
+#include "../../jnode.h"
+#include "../../super.h"
+#include "../../flush.h"
+#include "../../carry.h"
+#include "../object.h"
+
+#include <linux/pagemap.h>
+
+static reiser4_block_nr extent_unit_start(const coord_t * item);
+
+/* Return either first or last extent (depending on @side) of the item
+ @coord is set to. Set @pos_in_unit either to first or to last block
+ of extent. */
+static reiser4_extent *extent_utmost_ext(const coord_t * coord, sideof side,
+ reiser4_block_nr * pos_in_unit)
+{
+ reiser4_extent *ext;
+
+ if (side == LEFT_SIDE) {
+ /* get first extent of item */
+ ext = extent_item(coord);
+ *pos_in_unit = 0;
+ } else {
+ /* get last extent of item and last position within it */
+ assert("vs-363", side == RIGHT_SIDE);
+ ext = extent_item(coord) + coord_last_unit_pos(coord);
+ *pos_in_unit = extent_get_width(ext) - 1;
+ }
+
+ return ext;
+}
+
+/**
+ * item_plugin->f.utmost_child
+ *
+ * Return the child. Coord is set to extent item.
+ * Find jnode corresponding either to first or to
+ * last unformatted node pointed by the item
+ */
+int utmost_child_extent(const coord_t *coord, sideof side, jnode **childp)
+{
+ reiser4_extent *ext;
+ reiser4_block_nr pos_in_unit;
+ reiser4_subvol *data_subv;
+
+ assert("edward-1851", item_is_extent(coord));
+
+ data_subv = find_data_subvol(coord);
+ ext = extent_utmost_ext(coord, side, &pos_in_unit);
+
+ switch (state_of_extent(ext)) {
+ case HOLE_EXTENT:
+ *childp = NULL;
+ return 0;
+ case ALLOCATED_EXTENT:
+ case UNALLOCATED_EXTENT:
+ break;
+ default:
+ impossible("vs-1417", "Bad state of extent (%d)",
+ state_of_extent(ext));
+ BUG_ON(1);
+ }
+ {
+ reiser4_key key;
+ loff_t offset;
+ unsigned long index;
+ /*
+ * offset of the first or next after last (depending on
+ * @side) byte addressed by the extent
+ */
+ offset = get_key_offset(item_key_by_coord(coord, &key));
+ if (side == RIGHT_SIDE)
+ offset += reiser4_extent_size(coord);
+
+ assert("vs-544", (offset >> PAGE_SHIFT) < ~0ul);
+ /*
+ * index of first or last (depending on @side) page
+ * addressed by the extent
+ */
+ index = (unsigned long)(offset >> PAGE_SHIFT);
+ if (side == RIGHT_SIDE)
+ index--;
+
+ *childp = jlookup(get_key_objectid(&key), index);
+ }
+ return 0;
+}
+
+/* item_plugin->f.utmost_child_real_block */
+/* Return the child's block, if allocated. */
+int
+utmost_child_real_block_extent(const coord_t * coord, sideof side,
+ reiser4_block_nr * block)
+{
+ reiser4_extent *ext;
+
+ ext = extent_by_coord(coord);
+
+ switch (state_of_extent(ext)) {
+ case ALLOCATED_EXTENT:
+ *block = extent_get_start(ext);
+ if (side == RIGHT_SIDE)
+ *block += extent_get_width(ext) - 1;
+ break;
+ case HOLE_EXTENT:
+ case UNALLOCATED_EXTENT:
+ *block = 0;
+ break;
+ default:
+ /* this should never happen */
+ assert("vs-1418", 0);
+ }
+
+ return 0;
+}
+
+/* item_plugin->f.scan */
+/* Performs leftward scanning starting from an unformatted node and its parent coordinate.
+ This scan continues, advancing the parent coordinate, until either it encounters a
+ formatted child or it finishes scanning this node.
+
+   If unallocated, the entire extent must be dirty and in the same atom. (Actually, I'm
+   not sure this last property (same atom) is enforced, but it should be the case since
+ one atom must write the parent and the others must read the parent, thus fusing?). In
+ any case, the code below asserts this case for unallocated extents. Unallocated
+ extents are thus optimized because we can skip to the endpoint when scanning.
+
+   Control then returns to the caller of reiser4_scan_extent, which handles these
+   terminating conditions, e.g., by loading the next twig.
+*/
+int reiser4_scan_extent(flush_scan * scan)
+{
+ coord_t coord;
+ jnode *neighbor;
+ unsigned long scan_index, unit_index, unit_width, scan_max, scan_dist;
+ reiser4_block_nr unit_start;
+ __u64 oid;
+ reiser4_key key;
+ int ret = 0, allocated, incr;
+
+ if (!JF_ISSET(scan->node, JNODE_DIRTY)) {
+ /*
+ * Race with truncate, this node is already truncated
+ */
+ scan->stop = 1;
+ return 0;
+ }
+ coord_dup(&coord, &scan->parent_coord);
+
+ assert("jmacd-1404", !reiser4_scan_finished(scan));
+ assert("jmacd-1405", jnode_get_level(scan->node) == LEAF_LEVEL);
+ assert("jmacd-1406", jnode_is_unformatted(scan->node));
+ /*
+ * The scan_index variable corresponds to the current page index
+ * of the unformatted block scan position
+ */
+ scan_index = index_jnode(scan->node);
+
+ assert("jmacd-7889", item_is_extent(&coord));
+ repeat:
+ oid = get_key_objectid(item_key_by_coord(&coord, &key));
+
+ allocated = !extent_is_unallocated(&coord);
+ /*
+ * Get the values of this extent unit:
+ */
+ unit_index = extent_unit_index(&coord);
+ unit_width = extent_unit_width(&coord);
+ unit_start = extent_unit_start(&coord);
+
+ assert("jmacd-7187", unit_width > 0);
+ assert("jmacd-7188", scan_index >= unit_index);
+ assert("jmacd-7189", scan_index <= unit_index + unit_width - 1);
+ /*
+ * Depending on the scan direction, we set different maximum values
+ * for scan_index (scan_max) and the number of nodes that would be
+ * passed if the scan goes the entire way (scan_dist). Incr is an
+ * integer reflecting the incremental direction of scan_index
+ */
+ if (reiser4_scanning_left(scan)) {
+ scan_max = unit_index;
+ scan_dist = scan_index - unit_index;
+ incr = -1;
+ } else {
+ scan_max = unit_index + unit_width - 1;
+ scan_dist = scan_max - unit_index;
+ incr = +1;
+ }
+ /*
+ * If the extent is allocated we have to check each of its blocks.
+ * If the extent is unallocated we can skip to the scan_max
+ */
+ if (allocated) {
+ do {
+ neighbor = jlookup(oid, scan_index);
+ if (neighbor == NULL)
+ goto stop_same_parent;
+
+ if (scan->node != neighbor &&
+ !reiser4_scan_goto(scan, neighbor)) {
+ /*
+ * @neighbor was jput() by reiser4_scan_goto
+ */
+ goto stop_same_parent;
+ }
+ ret = move_scan_pos(scan, neighbor, 1, &coord);
+ if (ret != 0) {
+ goto exit;
+ }
+ /*
+ * reference to @neighbor is stored in @scan, no need
+ * to jput()
+ */
+ scan_index += incr;
+ } while (incr + scan_max != scan_index);
+ } else {
+ /*
+ * Optimized case for unallocated extents, skip to the end
+ */
+ neighbor = jlookup(oid, scan_max /*index */);
+ if (neighbor == NULL) {
+ /*
+ * Race with truncate
+ */
+ scan->stop = 1;
+ ret = 0;
+ goto exit;
+
+ } else if (!reiser4_scan_goto(scan, neighbor)) {
+ /*
+ * @neighbor was jput() by reiser4_scan_goto
+ */
+ goto stop_same_parent;
+ }
+ assert("zam-1043",
+ reiser4_blocknr_is_fake(jnode_get_block(neighbor)));
+
+ ret = move_scan_pos(scan, neighbor, scan_dist, &coord);
+ if (ret != 0) {
+ goto exit;
+ }
+ }
+ if (coord_sideof_unit(&coord, scan->direction) == 0 &&
+ item_is_extent(&coord)) {
+ /*
+ * Continue as long as there are more extent units
+ */
+ scan_index = extent_unit_index(&coord) +
+ (reiser4_scanning_left(scan) ?
+ extent_unit_width(&coord) - 1 : 0);
+ goto repeat;
+ }
+ if (0) {
+ stop_same_parent:
+ /*
+ * In this case, we leave coord set to the parent of scan->node
+ */
+ scan->stop = 1;
+ } else {
+ /*
+ * scan to be continued,
+ * coord is set to the next item which is either off-the-end
+ * of the node or not an extent
+ */
+ assert("jmacd-8912", scan->stop == 0);
+ assert("jmacd-7812",
+ (coord_is_after_sideof_unit(&coord, scan->direction) ||
+ !item_is_extent(&coord)));
+ }
+ ret = 0;
+ exit:
+ return ret;
+}
+
+/**
+ * When, at flush time, an unallocated extent is to be replaced with an
+ * allocated one, it may happen that one unallocated extent has to be replaced
+ * with a set of allocated extents. In this case insert_into_item will be
+ * called, which may have to add new nodes to the tree. Space for that is
+ * taken from the inviolable reserve (5%).
+ */
+static reiser4_block_nr reserve_replace(reiser4_subvol *subv)
+{
+ reiser4_block_nr grabbed, needed;
+
+ grabbed = ctx_subvol_grabbed(get_current_context(), subv->id);
+ needed = estimate_one_insert_into_item(&subv->tree);
+ check_me("vpf-340",
+ !reiser4_grab_space_force(needed, BA_RESERVED, subv));
+ return grabbed;
+}
+
+/* Block offset of first block addressed by unit */
+__u64 extent_unit_index(const coord_t * item)
+{
+ reiser4_key key;
+
+ assert("vs-648", coord_is_existing_unit(item));
+ unit_key_by_coord(item, &key);
+ return get_key_offset(&key) >> current_blocksize_bits;
+}
+
+/* AUDIT shouldn't return value be of reiser4_block_nr type?
+ Josh's answer: who knows? Is a "number of blocks" the same type as "block offset"? */
+__u64 extent_unit_width(const coord_t * item)
+{
+ assert("vs-649", coord_is_existing_unit(item));
+ return width_by_coord(item);
+}
+
+/* Starting block location of this unit */
+static reiser4_block_nr extent_unit_start(const coord_t * item)
+{
+ return extent_get_start(extent_by_coord(item));
+}
+
+/**
+ * Split extent unit specified by @coord into 2 extent units
+ *
+ * @pos_in_unit: position within the unit to split at;
+ * @return_inserted_pos: if true, set @coord to the inserted (right) extent
+ * unit, otherwise leave it at the left one
+ */
+int split_extent_unit(coord_t *coord, reiser4_block_nr pos_in_unit,
+ int return_inserted_pos)
+{
+ int result;
+ struct replace_handle *h;
+ item_id extent_id;
+ reiser4_extent *ext;
+ reiser4_dblock_nr start1, start2;
+ reiser4_block_nr was_grabbed;
+
+ ext = extent_by_coord(coord);
+ extent_id = item_id_by_coord(coord);
+
+ assert("edward-2119", extent_get_width(ext) > pos_in_unit);
+
+ start1 = start2 = extent_get_start(ext);
+ if (unlikely(start1 == HOLE_EXTENT_START))
+ return -EIO;
+ else if (start1 != UNALLOCATED_EXTENT_START)
+ /* allocated extent */
+ start2 += pos_in_unit;
+
+ h = kmalloc(sizeof(*h), reiser4_ctx_gfp_mask_get());
+ if (h == NULL)
+ return RETERR(-ENOMEM);
+ h->coord = coord;
+ h->lh = znode_lh(coord->node);
+ h->pkey = &h->key;
+ unit_key_by_coord(coord, h->pkey);
+ set_key_offset(h->pkey,
+ (get_key_offset(h->pkey) +
+ pos_in_unit * current_blocksize));
+
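+	/*
+	 * The original unit is replaced by two: an "overwrite" part of width
+	 * pos_in_unit and a new unit covering the remaining width of the
+	 * original extent.
+	 */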
+ reiser4_set_extent(find_data_subvol(coord), &h->overwrite,
+ start1, pos_in_unit);
+ reiser4_set_extent(find_data_subvol(coord), &h->new_extents[0],
+ start2, extent_get_width(ext) - pos_in_unit);
+ h->nr_new_extents = 1;
+ h->flags = COPI_DONT_SHIFT_LEFT;
+ h->paste_key = h->key;
+	/*
+	 * reserve space for the extent unit paste; the amount grabbed so far
+	 * is saved and restored below
+	 */
+ was_grabbed = reserve_replace(get_meta_subvol());
+ result = replace_extent_unit(extent_id, h, return_inserted_pos);
+ /* restore reserved */
+ grabbed2free_mark(was_grabbed, get_meta_subvol());
+ kfree(h);
+ return result;
+}
+
+/**
+ * Pre-condition: We want to replace extent @ext by extent @replace.
+ * Try to merge @replace with previous extent of the item (if there is one).
+ * Return 1 if merging succeeded, 0 otherwise.
+ */
+static int try_to_merge_with_left(coord_t *coord, reiser4_extent *ext,
+ reiser4_extent *replace)
+{
+ reiser4_key key;
+
+ assert("vs-1415", extent_by_coord(coord) == ext);
+
+ if (coord->unit_pos == 0 ||
+ state_of_extent(ext - 1) != ALLOCATED_EXTENT)
+ /*
+ * left neighbor of @ext either does not exist
+ * or is not allocated extent
+ */
+ return 0;
+ unit_key_by_coord(coord, &key);
+
+ if (extent_get_start(ext - 1) + extent_get_width(ext - 1) !=
+ extent_get_start(replace))
+ return 0;
+ /*
+ * we can glue, widen previous unit
+ */
+ extent_set_width(find_data_subvol(coord), ext - 1,
+ extent_get_width(ext - 1) + extent_get_width(replace));
+
+ if (extent_get_width(ext) != extent_get_width(replace)) {
+ /* make current extent narrower */
+ if (state_of_extent(ext) == ALLOCATED_EXTENT)
+ extent_set_start(find_data_subvol(coord), ext,
+ extent_get_start(ext) +
+ extent_get_width(replace));
+ extent_set_width(find_data_subvol(coord), ext,
+ extent_get_width(ext) -
+ extent_get_width(replace));
+ } else {
+ /*
+ * current extent completely glued with its left
+ * neighbor, remove it
+ */
+ coord_t from, to;
+
+ coord_dup(&from, coord);
+ from.unit_pos = nr_units_extent(coord) - 1;
+ coord_dup(&to, &from);
+			/*
+			 * Currently an extent item can only be cut from
+			 * its beginning or from its end, while our unit
+			 * can be in the middle. Work around this by
+			 * moving the space freed by the unit removal to
+			 * the end of the item
+			 */
+ memmove(ext, ext + 1,
+ (from.unit_pos -
+ coord->unit_pos) * sizeof(reiser4_extent));
+ /*
+ * wipe part of item which is going to be cut, so that
+ * check_node() will not be confused
+ */
+ cut_node_content(&from, &to, NULL, NULL, NULL);
+ }
+ znode_make_dirty(coord->node);
+ /* move coord back */
+ coord->unit_pos--;
+ return 1;
+}
+
+/**
+ * convert_extent_unit - replace extent with 2 ones
+ * @coord: coordinate of extent to be replaced
+ * @replace: extent to overwrite the one @coord is set to
+ *
+ * Overwrites the extent @coord is set to and pastes one extent unit after
+ * the overwritten one if @replace is shorter than the initial extent
+ */
+int convert_extent_unit(coord_t *coord, reiser4_extent *replace)
+{
+ int result;
+ struct replace_handle *h;
+ reiser4_extent *ext;
+ item_id extent_id;
+ reiser4_block_nr start, width, new_width;
+ reiser4_block_nr was_grabbed;
+ extent_state state;
+
+ ext = extent_by_coord(coord);
+ extent_id = item_id_by_coord(coord);
+ state = state_of_extent(ext);
+ start = extent_get_start(ext);
+ width = extent_get_width(ext);
+ new_width = extent_get_width(replace);
+
+ assert("vs-1458", (state == UNALLOCATED_EXTENT ||
+ state == ALLOCATED_EXTENT));
+ assert("vs-1459", width >= new_width);
+
+ if (try_to_merge_with_left(coord, ext, replace)) {
+ /*
+ * @replace was merged with left neighbor.
+ * Current unit is either removed or narrowed
+ */
+ if (width == new_width &&
+ coord->unit_pos < coord_last_unit_pos(coord)) {
+ coord_t tcoord;
+ coord_dup(&tcoord, coord);
+ tcoord.unit_pos ++;
+ /*
+			 * The current unit has been removed and @coord
+			 * now points to the unit it was merged with.
+			 * That unit may in turn be mergeable with the
+			 * right unit (pointed to by @tcoord). If so,
+			 * merge them.
+ */
+ try_to_merge_with_left(&tcoord,
+ extent_by_coord(&tcoord),
+ extent_by_coord(&tcoord));
+ }
+ return 0;
+ }
+ if (width == new_width) {
+ /*
+ * replace current extent with @replace
+ */
+ *ext = *replace;
+ /*
+ * After replacing it can happen that the unit is
+ * mergeable with the right unit (if there is one).
+ * If so, then merge them.
+ */
+ if (coord->unit_pos < coord_last_unit_pos(coord)) {
+ coord_t tcoord;
+ coord_dup(&tcoord, coord);
+ tcoord.unit_pos ++;
+
+ try_to_merge_with_left(&tcoord,
+ extent_by_coord(&tcoord),
+ extent_by_coord(&tcoord));
+ }
+ znode_make_dirty(coord->node);
+ return 0;
+ }
+
+ h = kmalloc(sizeof(*h), reiser4_ctx_gfp_mask_get());
+ if (h == NULL)
+ return RETERR(-ENOMEM);
+ h->coord = coord;
+ h->lh = znode_lh(coord->node);
+ h->pkey = &h->key;
+ unit_key_by_coord(coord, h->pkey);
+ set_key_offset(h->pkey,
+ (get_key_offset(h->pkey) + new_width * current_blocksize));
+ h->overwrite = *replace;
+
+ /* replace @ext with @replace and padding extent */
+ reiser4_set_extent(find_data_subvol(coord),
+ &h->new_extents[0],
+ (state == ALLOCATED_EXTENT) ?
+ (start + new_width) :
+ UNALLOCATED_EXTENT_START,
+ width - new_width);
+ h->nr_new_extents = 1;
+ h->flags = COPI_DONT_SHIFT_LEFT;
+ h->paste_key = h->key;
+
+	/* reserve space for the extent unit paste; the amount grabbed so far
+	   is saved and restored below */
+ was_grabbed = reserve_replace(get_meta_subvol());
+ result = replace_extent_unit(extent_id, h, 0 /* leave @coord set
+ to overwritten
+ extent */);
+ /* restore reserved */
+ grabbed2free_mark(was_grabbed, get_meta_subvol());
+ kfree(h);
+ return result;
+}
+
+/**
+ * assign_real_blocknrs
+ * @flush_pos: flush position
+ * @oid: objectid of the file whose jnodes get block numbers assigned
+ * @index: first jnode on the range
+ * @count: number of jnodes to assign block numbers to
+ * @first: start of allocated block range
+ *
+ * Assigns block numbers to each of @count jnodes. The index of the first
+ * jnode is @index. Jnodes are looked up with jlookup.
+ */
+void assign_real_blocknrs(flush_pos_t *flush_pos, oid_t oid,
+ unsigned long index, reiser4_block_nr count,
+ reiser4_block_nr first, reiser4_subvol *subv)
+{
+ unsigned long i;
+ txn_atom *atom;
+ int nr;
+
+ atom = atom_locked_by_fq(flush_pos->fq);
+ assert("vs-1468", atom);
+ BUG_ON(atom == NULL);
+
+ nr = 0;
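+	/*
+	 * Walk the file's jnodes by page index and hand each one a block
+	 * number from the freshly allocated contiguous range starting at
+	 * @first.
+	 */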
+ for (i = 0; i < count; ++i, ++index) {
+ jnode *node;
+
+ node = jlookup(oid, index);
+ assert("", node != NULL);
+ BUG_ON(node == NULL);
+
+ spin_lock_jnode(node);
+ assert("", !jnode_is_flushprepped(node));
+ assert("vs-1475", node->atom == atom);
+ assert("vs-1476", atomic_read(&node->x_count) > 0);
+
+ JF_CLR(node, JNODE_FLUSH_RESERVED);
+ assert("edward-2218", node->subvol != NULL);
+ jnode_set_block(node, &first);
+ unformatted_make_reloc(node, flush_pos->fq);
+ ON_DEBUG(count_jnode(node->atom, node, NODE_LIST(node),
+ FQ_LIST, 0));
+ spin_unlock_jnode(node);
+ first++;
+
+ atomic_dec(&node->x_count);
+ nr ++;
+ }
+
+ spin_unlock_atom(atom);
+ return;
+}
+
+/**
+ * Find out how many adjacent blocks of an allocated extent (specified
+ * by @index and @count) belong to the atom and are not "flushprepped".
+ * It is used by the flush procedure when making reallocation decisions
+ */
+int allocated_extent_slum_size(flush_pos_t *flush_pos, oid_t oid,
+ unsigned long index, unsigned long count)
+{
+ unsigned long i;
+ txn_atom *atom;
+ int nr;
+
+ atom = atom_locked_by_fq(reiser4_pos_fq(flush_pos));
+ assert("vs-1468", atom);
+
+ nr = 0;
+
+ for (i = 0; i < count; ++i, ++index) {
+ jnode *node;
+
+ node = jlookup(oid, index);
+ if (!node)
+ break;
+
+ if (jnode_check_flushprepped(node)) {
+ atomic_dec(&node->x_count);
+ break;
+ }
+ if (node->atom != atom) {
+ /*
+ * this is possible on overwrite: extent_write may
+ * capture several unformatted nodes without capturing
+ * any formatted nodes.
+ */
+ atomic_dec(&node->x_count);
+ break;
+ }
+ assert("vs-1476", atomic_read(&node->x_count) > 1);
+ atomic_dec(&node->x_count);
+ nr ++;
+ }
+ spin_unlock_atom(atom);
+ return nr;
+}
+
+static inline int are_units_mergeable(reiser4_extent *left,
+ reiser4_extent *right)
+{
+ if (state_of_extent(left) != state_of_extent(right))
+ return 0;
+ switch (state_of_extent(left)) {
+ case HOLE_EXTENT:
+ return 1;
+ case ALLOCATED_EXTENT:
+ return extent_get_start(left) + extent_get_width(left) ==
+ extent_get_start(right);
+ default:
+ impossible("edward-2092", "Bad extent state (%d)",
+ state_of_extent(left));
+ return 0;
+ }
+}
+
+/**
+ * Copy an extent unit @ext at position @coord to the end of
+ * node @dst.
+ * @key is the key of that extent unit.
+ *
+ * It may have to either insert a new item after the last one,
+ * append to the last item, or modify the last unit of the last
+ * item to have greater width. If there is not enough space in
+ * @dst, return -E_NODE_FULL
+ */
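+	/*
+	 * A page backed by a hole extent is simply zero-filled; a page backed
+	 * by an allocated extent gets its jnode bound to the on-disk block
+	 * before the read is submitted below.
+	 */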
+int shift_extent_left_begin(znode *dst, const coord_t *coord,
+ const reiser4_key *key, reiser4_extent *ext)
+{
+ int result;
+ coord_t dst_coord;
+ cop_insert_flag flags;
+ reiser4_item_data data;
+
+ coord_init_last_unit(&dst_coord, dst);
+ dst_coord.between = AFTER_UNIT;
+
+ flags = COPI_DONT_SHIFT_LEFT |
+ COPI_DONT_SHIFT_RIGHT | COPI_DONT_ALLOCATE;
+
+ if (!are_items_mergeable(&dst_coord, coord))
+ /*
+ * create a new item
+ */
+ result = insert_by_coord(&dst_coord,
+ init_new_extent(item_id_by_coord(coord),
+ &data, ext, 1),
+ key, NULL /*lh */ , flags);
+ else {
+ /*
+ * push to existing item
+ */
+ reiser4_extent *dst_ext;
+ assert("edward-2091", item_is_extent(&dst_coord));
+
+ dst_ext = extent_by_coord(&dst_coord);
+
+ if (are_units_mergeable(dst_ext, ext)) {
+ /*
+ * fast paste
+ */
+ extent_set_width(find_data_subvol(&dst_coord), dst_ext,
+ extent_get_width(dst_ext) +
+ extent_get_width(ext));
+ znode_make_dirty(dst);
+ return 0;
+ }
+ /* paste */
+ result = insert_into_item(&dst_coord, NULL /*lh */, key,
+ init_new_extent(item_id_by_coord(coord),
+ &data, ext, 1),
+ flags);
+ }
+ assert("vs-438", result == 0 || result == -E_NODE_FULL);
+ return result;
+}
+
+/*
+ * Complete the shift started by shift_extent_left_begin(): cut the original unit.
+ */
+int shift_extent_left_complete(coord_t *to, reiser4_key *to_key,
+ znode *left)
+{
+ coord_t from;
+ reiser4_key from_key;
+
+ coord_init_first_unit(&from, to->node);
+ item_key_by_coord(&from, &from_key);
+
+ return cut_node_content(&from, to, &from_key, to_key, NULL);
+}
+
+/*
+ * Local variables:
+ * c-indentation-style: "K&R"
+ * mode-name: "LC"
+ * c-basic-offset: 8
+ * tab-width: 8
+ * fill-column: 79
+ * scroll-step: 1
+ * End:
+ */
diff -urN --no-dereference linux-5.10.2.orig/fs/reiser4/plugin/item/extent.h linux-5.10.2/fs/reiser4/plugin/item/extent.h
--- linux-5.10.2.orig/fs/reiser4/plugin/item/extent.h 1970-01-01 01:00:00.000000000 +0100
+++ linux-5.10.2/fs/reiser4/plugin/item/extent.h 2020-12-23 16:07:46.127813261 +0100
@@ -0,0 +1,252 @@
+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
+
+#ifndef __REISER4_EXTENT_H__
+#define __REISER4_EXTENT_H__
+
+/* on disk extent */
+typedef struct {
+ reiser4_dblock_nr start;
+ reiser4_dblock_nr width;
+} reiser4_extent;
+
+struct extent_stat {
+ int unallocated_units;
+ int unallocated_blocks;
+ int allocated_units;
+ int allocated_blocks;
+ int hole_units;
+ int hole_blocks;
+};
+
+/* extents in an extent item can be holes, unallocated extents, or
+   allocated extents */
+typedef enum {
+ HOLE_EXTENT,
+ UNALLOCATED_EXTENT,
+ ALLOCATED_EXTENT
+} extent_state;
+
+#define HOLE_EXTENT_START 0
+#define UNALLOCATED_EXTENT_START 1
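+
+/*
+ * The extent state is encoded in the start field: start == HOLE_EXTENT_START
+ * marks a hole, start == UNALLOCATED_EXTENT_START marks an unallocated
+ * extent, and any larger value is the first block of an allocated extent.
+ */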
+
+struct extent_coord_extension {
+ reiser4_block_nr pos_in_unit;
+ reiser4_block_nr width; /* width of current unit */
+ pos_in_node_t nr_units; /* number of units */
+ int ext_offset; /* offset from the beginning of zdata() */
+ unsigned long expected_page;
+#if REISER4_DEBUG
+ reiser4_extent extent;
+#endif
+};
+
+/* macros to set/get fields of on-disk extent */
+static inline reiser4_block_nr extent_get_start(const reiser4_extent * ext)
+{
+ return le64_to_cpu(ext->start);
+}
+
+static inline reiser4_block_nr extent_get_width(const reiser4_extent * ext)
+{
+ return le64_to_cpu(ext->width);
+}
+
+extern __u64 reiser4_subvol_block_count(const reiser4_subvol *);
+
+static inline void extent_set_start(reiser4_subvol *subv, reiser4_extent *ext,
+ reiser4_block_nr start)
+{
+ static_assert(sizeof(ext->start) == 8);
+
+ assert("edward-2269", subv != NULL);
+ assert("nikita-2510", ergo(start > 1,
+ start < reiser4_subvol_block_count(subv)));
+
+ put_unaligned(cpu_to_le64(start), &ext->start);
+}
+
+static inline void extent_set_width(reiser4_subvol *subv,
+ reiser4_extent *ext,
+ reiser4_block_nr width)
+{
+ static_assert(sizeof(ext->width) == 8);
+ assert("edward-2270", width > 0);
+ assert("edward-2271", subv != NULL);
+
+ put_unaligned(cpu_to_le64(width), &ext->width);
+
+ assert("nikita-2511",
+ ergo(extent_get_start(ext) > 1,
+ extent_get_start(ext) + width <=
+ reiser4_subvol_block_count(subv)));
+}
+
+#define extent_item(coord) \
+({ \
+ assert("nikita-3143", item_is_extent(coord)); \
+ ((reiser4_extent *)item_body_by_coord (coord)); \
+})
+
+#define extent_by_coord(coord) \
+({ \
+ assert("nikita-3144", item_is_extent(coord)); \
+ (extent_item (coord) + (coord)->unit_pos); \
+})
+
+#define width_by_coord(coord) \
+({ \
+ assert("nikita-3145", item_is_extent(coord)); \
+ extent_get_width (extent_by_coord(coord)); \
+})
+
+struct carry_cut_data;
+struct carry_kill_data;
+
+/* plugin->u.item.b.* */
+reiser4_key *max_key_inside_extent(const coord_t *, reiser4_key *);
+int can_contain_key_extent(const coord_t * coord, const reiser4_key * key,
+ const reiser4_item_data *);
+int mergeable_extent40(const coord_t *p1, const coord_t *p2);
+int mergeable_extent41(const coord_t *p1, const coord_t *p2);
+pos_in_node_t nr_units_extent(const coord_t *);
+lookup_result lookup_extent(const reiser4_key *, lookup_bias, coord_t *);
+void init_coord_extent(coord_t *);
+int init_extent(coord_t *, reiser4_item_data *);
+int paste_extent(coord_t *, reiser4_item_data *, carry_plugin_info *);
+int can_shift_extent(unsigned free_space,
+ coord_t * source, znode * target, shift_direction,
+ unsigned *size, unsigned want);
+void copy_units_extent(coord_t * target, coord_t * source, unsigned from,
+ unsigned count, shift_direction where_is_free_space,
+ unsigned free_space);
+size_t merge_units_extent(coord_t *left, coord_t *right);
+int kill_hook_extent(const coord_t *, pos_in_node_t from, pos_in_node_t count,
+ struct carry_kill_data *);
+int create_hook_extent(const coord_t * coord, void *arg);
+int cut_units_extent(coord_t * coord, pos_in_node_t from, pos_in_node_t to,
+ struct carry_cut_data *, reiser4_key * smallest_removed,
+ reiser4_key * new_first);
+int kill_units_extent(coord_t * coord, pos_in_node_t from, pos_in_node_t to,
+ struct carry_kill_data *, reiser4_key * smallest_removed,
+ reiser4_key * new_first);
+reiser4_key *unit_key_extent(const coord_t *, reiser4_key *);
+reiser4_key *max_unit_key_extent(const coord_t *, reiser4_key *);
+void print_extent(const char *, coord_t *);
+int utmost_child_extent(const coord_t * coord, sideof side, jnode ** child);
+int utmost_child_real_block_extent(const coord_t * coord, sideof side,
+ reiser4_block_nr * block);
+void item_stat_extent(const coord_t * coord, void *vp);
+int reiser4_check_extent(const coord_t * coord, const char **error);
+
+/* plugin->u.item.s.file */
+ssize_t write_extent_unix_file(struct file *, struct inode * inode,
+ const char __user *, size_t, loff_t *);
+ssize_t write_extent_stripe(struct file *, struct inode * inode,
+ const char __user *, size_t, loff_t *,
+ unsigned flags);
+int read_extent_unix_file(struct file *, flow_t *, hint_t *);
+int read_extent_stripe(struct file *, flow_t *, hint_t *);
+int readpage_extent_stripe(void *, struct page *);
+int reiser4_readpage_extent(void *, struct page *);
+int __reiser4_readpage_extent(const coord_t *coord, reiser4_extent *,
+ reiser4_block_nr, struct page *);
+reiser4_key *append_key_extent(const coord_t *, reiser4_key *);
+void init_coord_extension_extent(uf_coord_t *, loff_t offset);
+int get_block_address_extent(const coord_t *, sector_t block,
+ sector_t * result);
+
+/* plugin->u.item.s.vol */
+int reiser4_migrate_extent(coord_t *coord, reiser4_key *, lock_handle *lh,
+ struct inode *inode, loff_t *done_off, u64 *dst_id);
+/* these are used in flush.c
+ FIXME-VS: should they be somewhere in item_plugin? */
+int allocate_extent_item_in_place(coord_t *, lock_handle *, flush_pos_t * pos);
+int allocate_and_copy_extent(znode * left, coord_t * right, flush_pos_t * pos,
+ reiser4_key * stop_key);
+
+int extent_is_unallocated(const coord_t * item); /* True if this extent is unallocated (i.e., not a hole, not allocated). */
+__u64 extent_unit_index(const coord_t * item); /* Block offset of this unit. */
+__u64 extent_unit_width(const coord_t * item); /* Number of blocks in this unit. */
+
+/* plugin->u.item.f. */
+int reiser4_scan_extent(flush_scan * scan);
+
+reiser4_item_data *init_new_extent(item_id extent_id, reiser4_item_data *data,
+ void *ext_unit, int nr_extents);
+reiser4_block_nr reiser4_extent_size_at(const coord_t *coord, pos_in_node_t nr);
+reiser4_block_nr reiser4_extent_size(const coord_t *coord);
+extent_state state_of_extent(reiser4_extent * ext);
+void reiser4_set_extent(reiser4_subvol *subv, reiser4_extent *,
+ reiser4_block_nr start, reiser4_block_nr width);
+int update_extent_unix_file(struct inode *, jnode *, loff_t pos,
+ int *plugged_hole);
+int update_extent_stripe(hint_t *hint, struct inode *, jnode *, int *plugged_hole,
+ unsigned flags);
+
+#include "../../coord.h"
+#include "../../lock.h"
+#include "../../tap.h"
+
+struct replace_handle {
+ /* these are to be set before calling replace_extent_unit */
+ coord_t *coord;
+ lock_handle *lh;
+ reiser4_key key;
+ reiser4_key *pkey;
+ reiser4_extent overwrite;
+ reiser4_extent new_extents[2];
+ int nr_new_extents;
+ unsigned flags;
+
+ /* these are used by replace_extent_unit */
+ reiser4_item_data item;
+ coord_t coord_after;
+ lock_handle lh_after;
+ tap_t watch;
+ reiser4_key paste_key;
+#if REISER4_DEBUG
+ reiser4_extent orig_ext;
+ reiser4_key tmp;
+#endif
+};
+
+/*
+ * this structure is kmalloced before calling make_extent to avoid
+ * excessive stack consumption on plug_hole->replace_extent_unit()
+ */
+struct make_extent_handle {
+ uf_coord_t *uf_coord;
+ reiser4_block_nr blocknr;
+ int created;
+ struct inode *inode;
+ union {
+ struct {
+ } append;
+ struct replace_handle replace;
+ } u;
+};
+
+int replace_extent_unit(item_id extent_id, struct replace_handle *,
+ int return_inserted_position);
+lock_handle *znode_lh(znode *);
+
+/* the reiser4 repacker support */
+struct repacker_cursor;
+extern int process_extent_backward_for_repacking(tap_t *,
+ struct repacker_cursor *);
+extern int mark_extent_for_repacking(tap_t *, int);
+
+#define coord_by_uf_coord(uf_coord) (&((uf_coord)->coord))
+#define ext_coord_by_uf_coord(uf_coord) (&((uf_coord)->extension.extent))
+
+/* __REISER4_EXTENT_H__ */
+#endif
+/*
+ Local variables:
+ c-indentation-style: "K&R"
+ mode-name: "LC"
+ c-basic-offset: 8
+ tab-width: 8
+ fill-column: 120
+ End:
+*/
diff -urN --no-dereference linux-5.10.2.orig/fs/reiser4/plugin/item/extent_item_ops.c linux-5.10.2/fs/reiser4/plugin/item/extent_item_ops.c
--- linux-5.10.2.orig/fs/reiser4/plugin/item/extent_item_ops.c 1970-01-01 01:00:00.000000000 +0100
+++ linux-5.10.2/fs/reiser4/plugin/item/extent_item_ops.c 2020-12-23 16:07:46.128813275 +0100
@@ -0,0 +1,989 @@
+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
+
+#include "item.h"
+#include "../../inode.h"
+#include "../../tree_walk.h" /* check_sibling_list() */
+#include "../../page_cache.h"
+#include "../../carry.h"
+
+/* item_plugin->b.max_key_inside */
+reiser4_key *max_key_inside_extent(const coord_t * coord, reiser4_key * key)
+{
+ item_key_by_coord(coord, key);
+ set_key_offset(key, get_key_offset(reiser4_max_key()));
+ return key;
+}
+
+/* item_plugin->b.can_contain_key
+   this checks whether @key of @data matches the position set by @coord */
+int
+can_contain_key_extent(const coord_t * coord, const reiser4_key * key,
+ const reiser4_item_data * data)
+{
+ reiser4_key item_key;
+
+ if (item_plugin_by_coord(coord) != data->iplug)
+ return 0;
+
+ item_key_by_coord(coord, &item_key);
+ if (get_key_locality(key) != get_key_locality(&item_key) ||
+ get_key_objectid(key) != get_key_objectid(&item_key) ||
+ get_key_ordering(key) != get_key_ordering(&item_key))
+ return 0;
+
+ return 1;
+}
+
+/**
+ * Check if extent @p1 of type @extent_id is mergeable with @p2
+ */
+static inline int mergeable_extent(item_id extent_id,
+ const coord_t *p1, const coord_t *p2)
+{
+ reiser4_key key1, key2;
+
+ assert("vs-299", item_id_by_coord(p1) == extent_id);
+ /*
+ * FIXME-VS: Which is it? Assert or return 0
+ */
+ if (item_id_by_coord(p2) != extent_id)
+ return 0;
+
+ item_key_by_coord(p1, &key1);
+ item_key_by_coord(p2, &key2);
+
+ if (get_key_locality(&key1) != get_key_locality(&key2) ||
+ get_key_objectid(&key1) != get_key_objectid(&key2) ||
+ get_key_ordering(&key1) != get_key_ordering(&key2) ||
+ get_key_type(&key1) != get_key_type(&key2))
+ return 0;
+ if (get_key_offset(&key1) +
+ reiser4_extent_size_at(p1, nr_units_extent(p1)) !=
+ get_key_offset(&key2))
+ return 0;
+ return 1;
+}
+
+/**
+ * item_plugin->b.mergeable
+ */
+int mergeable_extent40(const coord_t *p1, const coord_t *p2)
+{
+ return mergeable_extent(EXTENT40_POINTER_ID, p1, p2);
+}
+
+int mergeable_extent41(const coord_t *p1, const coord_t *p2)
+{
+ return mergeable_extent(EXTENT41_POINTER_ID, p1, p2);
+}
+
+/* item_plugin->b.nr_units */
+pos_in_node_t nr_units_extent(const coord_t * coord)
+{
+ /* length of extent item has to be multiple of extent size */
+ assert("vs-1424",
+ (item_length_by_coord(coord) % sizeof(reiser4_extent)) == 0);
+ return item_length_by_coord(coord) / sizeof(reiser4_extent);
+}
+
+/* item_plugin->b.lookup */
+lookup_result
+lookup_extent(const reiser4_key * key, lookup_bias bias UNUSED_ARG,
+ coord_t * coord)
+{ /* znode and item_pos are
+ set to an extent item to
+ look through */
+ reiser4_key item_key;
+ reiser4_block_nr lookuped, offset;
+ unsigned i, nr_units;
+ reiser4_extent *ext;
+ unsigned blocksize;
+ unsigned char blocksize_bits;
+
+ item_key_by_coord(coord, &item_key);
+ offset = get_key_offset(&item_key);
+
+ /* key we are looking for must be greater than key of item @coord */
+ assert("vs-414", keygt(key, &item_key));
+
+ assert("umka-99945",
+ !keygt(key, max_key_inside_extent(coord, &item_key)));
+
+ ext = extent_item(coord);
+ assert("vs-1350", (char *)ext == (zdata(coord->node) + coord->offset));
+
+ blocksize = current_blocksize;
+ blocksize_bits = current_blocksize_bits;
+
+ /* offset we are looking for */
+ lookuped = get_key_offset(key);
+
+ nr_units = nr_units_extent(coord);
+	/* go through all extents until the one which addresses the given offset */
+ for (i = 0; i < nr_units; i++, ext++) {
+ offset += (extent_get_width(ext) << blocksize_bits);
+ if (offset > lookuped) {
+ /* desired byte is somewhere in this extent */
+ coord->unit_pos = i;
+ coord->between = AT_UNIT;
+ return CBK_COORD_FOUND;
+ }
+ }
+
+ /* set coord after last unit */
+ coord->unit_pos = nr_units - 1;
+ coord->between = AFTER_UNIT;
+ return CBK_COORD_FOUND;
+}
+
+/* item_plugin->b.paste
+   The item @coord is set to has been appended with @data->length bytes of
+   free space. data->data contains the data to be pasted into the item at
+   position @coord->in_item.unit_pos; it must fit into that free space.
+   @coord must be set between units.
+*/
+int
+paste_extent(coord_t * coord, reiser4_item_data * data,
+ carry_plugin_info * info UNUSED_ARG)
+{
+ unsigned old_nr_units;
+ reiser4_extent *ext;
+ int item_length;
+
+ ext = extent_item(coord);
+ item_length = item_length_by_coord(coord);
+ old_nr_units = (item_length - data->length) / sizeof(reiser4_extent);
+
+ /* this is also used to copy extent into newly created item, so
+ old_nr_units could be 0 */
+ assert("vs-260", item_length >= data->length);
+
+ /* make sure that coord is set properly */
+ assert("vs-35",
+ ((!coord_is_existing_unit(coord))
+ || (!old_nr_units && !coord->unit_pos)));
+
+ /* first unit to be moved */
+ switch (coord->between) {
+ case AFTER_UNIT:
+ coord->unit_pos++;
+ /* fall through */
+ case BEFORE_UNIT:
+ coord->between = AT_UNIT;
+ break;
+ case AT_UNIT:
+ assert("vs-331", !old_nr_units && !coord->unit_pos);
+ break;
+ default:
+ impossible("vs-330", "coord is set improperly");
+ }
+
+ /* prepare space for new units */
+ memmove(ext + coord->unit_pos + data->length / sizeof(reiser4_extent),
+ ext + coord->unit_pos,
+ (old_nr_units - coord->unit_pos) * sizeof(reiser4_extent));
+
+ /* copy new data from kernel space */
+ assert("vs-556", data->user == 0);
+ memcpy(ext + coord->unit_pos, data->data, (unsigned)data->length);
+
+ /* after paste @coord is set to first of pasted units */
+ assert("vs-332", coord_is_existing_unit(coord));
+ assert("vs-333",
+ !memcmp(data->data, extent_by_coord(coord),
+ (unsigned)data->length));
+ return 0;
+}
+
+/* item_plugin->b.can_shift */
+int
+can_shift_extent(unsigned free_space, coord_t * source,
+ znode * target UNUSED_ARG, shift_direction pend UNUSED_ARG,
+ unsigned *size, unsigned want)
+{
+ *size = item_length_by_coord(source);
+ if (*size > free_space)
+ /* never split a unit of extent item */
+ *size = free_space - free_space % sizeof(reiser4_extent);
+
+	/* we can shift *size bytes, calculate how many we want to shift */
+ if (*size > want * sizeof(reiser4_extent))
+ *size = want * sizeof(reiser4_extent);
+
+ if (*size % sizeof(reiser4_extent) != 0)
+ impossible("vs-119", "Wrong extent size: %i %zd", *size,
+ sizeof(reiser4_extent));
+ return *size / sizeof(reiser4_extent);
+
+}
+
+/* item_plugin->b.copy_units */
+void
+copy_units_extent(coord_t * target, coord_t * source,
+ unsigned from, unsigned count,
+ shift_direction where_is_free_space, unsigned free_space)
+{
+ char *from_ext, *to_ext;
+
+ assert("vs-217", free_space == count * sizeof(reiser4_extent));
+
+ from_ext = item_body_by_coord(source);
+ to_ext = item_body_by_coord(target);
+
+ if (where_is_free_space == SHIFT_LEFT) {
+ assert("vs-215", from == 0);
+
+		/* At this moment, the item length has already been updated in
+		   the item header by the shifting code, hence nr_units_extent()
+		   will return the "new" number of units, i.e. the one we obtain
+		   after copying the units.
+		 */
+ to_ext +=
+ (nr_units_extent(target) - count) * sizeof(reiser4_extent);
+ } else {
+ reiser4_key key;
+ coord_t coord;
+
+ assert("vs-216",
+ from + count == coord_last_unit_pos(source) + 1);
+
+ from_ext += item_length_by_coord(source) - free_space;
+
+ /* new units are inserted before first unit in an item,
+ therefore, we have to update item key */
+ coord = *source;
+ coord.unit_pos = from;
+ unit_key_extent(&coord, &key);
+
+ node_plugin_by_node(target->node)->update_item_key(target, &key,
+ NULL /*info */);
+ }
+
+ memcpy(to_ext, from_ext, free_space);
+}
+
+/**
+ * item_plugin->b.merge_units. See comment in item.h
+ * Don't use this function in other contexts.
+ */
+size_t merge_units_extent(coord_t *left, coord_t *right)
+{
+ coord_t uleft, uright;
+ reiser4_extent *ext_left;
+ reiser4_extent *ext_right;
+ size_t tail_size;
+
+ assert("edward-2129", item_is_extent(left));
+ assert("edward-2135", item_is_extent(right));
+
+ coord_dup(&uleft, left);
+ uleft.unit_pos = coord_num_units(left) - 1;
+ uleft.between = AT_UNIT;
+
+ coord_dup(&uright, right);
+ uright.unit_pos = 0;
+ uright.between = AT_UNIT;
+
+ ext_left = extent_by_coord(&uleft);
+ ext_right = extent_by_coord(&uright);
+
+ assert("edward-2136", ext_right == ext_left + 1);
+
+ if ((state_of_extent(ext_left) != state_of_extent(ext_right)) ||
+ ((state_of_extent(ext_left) == ALLOCATED_EXTENT) &&
+ (extent_get_start(ext_left) + extent_get_width(ext_left) !=
+ extent_get_start(ext_right))))
+ /* units are not mergeable */
+ return 0;
+ /*
+ * widen @ext_left
+ */
+ extent_set_width(find_data_subvol(&uleft), ext_left,
+ extent_get_width(ext_left) +
+ extent_get_width(ext_right));
+ /*
+ * move units at the right of @ext_right to the left.
+ * This will drop @ext_right and make the node inconsistent
+ * (see the comment above)
+ */
+ tail_size = sizeof(reiser4_extent) * (coord_num_units(right) - 1);
+ memmove(ext_right, ext_right + 1, tail_size);
+ return sizeof(reiser4_extent);
+}
+
+/* item_plugin->b.create_hook
+ @arg is znode of leaf node for which we need to update right delimiting key */
+int create_hook_extent(const coord_t * coord, void *arg)
+{
+ coord_t *child_coord;
+ znode *node;
+ reiser4_key key;
+ reiser4_tree *tree;
+
+ if (!arg)
+ return 0;
+
+ child_coord = arg;
+ tree = znode_get_tree(coord->node);
+
+ assert("nikita-3246", znode_get_level(child_coord->node) == LEAF_LEVEL);
+
+ write_lock_tree();
+ write_lock_dk(tree);
+	/* find the leaf-level node whose right delimiting key has to
+	   be updated */
+ if (coord_wrt(child_coord) == COORD_ON_THE_LEFT) {
+ assert("vs-411", znode_is_left_connected(child_coord->node));
+ node = child_coord->node->left;
+ } else {
+ assert("vs-412", coord_wrt(child_coord) == COORD_ON_THE_RIGHT);
+ node = child_coord->node;
+ assert("nikita-3314", node != NULL);
+ }
+
+ if (node != NULL) {
+ znode_set_rd_key(node, item_key_by_coord(coord, &key));
+
+ assert("nikita-3282", check_sibling_list(node));
+ /* break sibling links */
+ if (ZF_ISSET(node, JNODE_RIGHT_CONNECTED) && node->right) {
+ ON_DEBUG(node->right->left_version =
+ atomic_inc_return(&delim_key_version);
+ node->right_version =
+ atomic_inc_return(&delim_key_version););
+
+ node->right->left = NULL;
+ node->right = NULL;
+ }
+ }
+ write_unlock_dk(tree);
+ write_unlock_tree();
+ return 0;
+}
+
+#define ITEM_TAIL_KILLED 0
+#define ITEM_HEAD_KILLED 1
+#define ITEM_KILLED 2
+
+/**
+ * item_plugin->b.kill_hook
+ * this is called when @count units starting from @from-th
+ * one are going to be removed
+ */
+int kill_hook_extent(const coord_t *coord, pos_in_node_t from,
+ pos_in_node_t count, struct carry_kill_data *kdata)
+{
+ reiser4_extent *ext;
+ reiser4_block_nr start, length;
+ const reiser4_key *pfrom_key, *pto_key;
+ struct inode *inode;
+ reiser4_tree *tree;
+ pgoff_t from_off, to_off, offset, skip;
+ int retval;
+ reiser4_subvol *subv;
+
+ /* these are located in memory kmalloc-ed by kill_node_content */
+ reiser4_key *min_item_key, *max_item_key, *from_key, *to_key, *key;
+ coord_t *dup, *next;
+
+ assert("edward-1850", item_is_extent(coord));
+ assert("zam-811", znode_is_write_locked(coord->node));
+ assert("nikita-3315", kdata != NULL);
+ assert("vs-34", kdata->buf != NULL);
+
+ subv = find_data_subvol(coord);
+ tree = &subv->tree;
+
+ /* map structures to kdata->buf */
+ min_item_key = (reiser4_key *) (kdata->buf);
+ max_item_key = min_item_key + 1;
+ from_key = max_item_key + 1;
+ to_key = from_key + 1;
+ key = to_key + 1;
+ dup = (coord_t *) (key + 1);
+ next = dup + 1;
+
+ item_key_by_coord(coord, min_item_key);
+ max_item_key_by_coord(coord, max_item_key);
+
+ if (kdata->params.from_key) {
+ pfrom_key = kdata->params.from_key;
+ pto_key = kdata->params.to_key;
+ } else {
+ assert("vs-1549", from == coord->unit_pos);
+ unit_key_by_coord(coord, from_key);
+ pfrom_key = from_key;
+
+ coord_dup(dup, coord);
+ dup->unit_pos = from + count - 1;
+ max_unit_key_by_coord(dup, to_key);
+ pto_key = to_key;
+ }
+ /*
+	 * Evaluate which part of the item is to be removed.
+ * Only 3 cases are possible:
+ *
+ * 1. pfrom_key <= min_item_key && max_item_key <= pto_key
+ *
+ * from/to: ***********
+ * item: +++++
+ *
+ * item to be removed completely
+ *
+ * 2. min_item_key < pfrom_key
+ *
+ * from/to: ***********
+ * item: +++++
+ *
+ * tail of the item to be removed
+ *
+ * 3. max_item_key > pto_key;
+ *
+ * from/to: ***********
+ * item: +++++
+ *
+ * head of the item to be removed
+ */
+ if (keyle(pfrom_key, min_item_key) && keyle(max_item_key, pto_key)) {
+ znode *left, *right;
+
+ /* item is to be removed completely */
+ assert("nikita-3316", kdata->left != NULL
+ && kdata->right != NULL);
+
+ left = kdata->left->node;
+ right = kdata->right->node;
+
+ /* we have to do two things:
+ *
+ * 1. link left and right formatted neighbors of
+ * extent being removed, and
+ *
+ * 2. update their delimiting keys.
+ *
+ * atomicity of these operations is protected by
+ * taking dk-lock and tree-lock.
+ */
+ /* if neighbors of item being removed are znodes -
+ * link them */
+ write_lock_tree();
+ write_lock_dk(tree);
+ link_left_and_right(left, right);
+ if (left) {
+ /* update right delimiting key of left
+ * neighbor of extent item */
+ /*coord_t next;
+ reiser4_key key; */
+
+ coord_dup(next, coord);
+
+ if (coord_next_item(next))
+ *key = *znode_get_rd_key(coord->node);
+ else
+ item_key_by_coord(next, key);
+ znode_set_rd_key(left, key);
+ }
+ write_unlock_dk(tree);
+ write_unlock_tree();
+
+ from_off =
+ get_key_offset(min_item_key) >> PAGE_SHIFT;
+ to_off =
+ (get_key_offset(max_item_key) +
+ 1) >> PAGE_SHIFT;
+ retval = ITEM_KILLED;
+
+ } else if (keylt(min_item_key, pfrom_key)) {
+ /*
+ * tail of item is to be removed
+ */
+ from_off =
+ (get_key_offset(pfrom_key) + PAGE_SIZE -
+ 1) >> PAGE_SHIFT;
+ to_off =
+ (get_key_offset(max_item_key) +
+ 1) >> PAGE_SHIFT;
+ retval = ITEM_TAIL_KILLED;
+ } else {
+ assert("edward-2419", keylt(pto_key, max_item_key));
+ /*
+ * head of item is to be removed
+ */
+ assert("vs-1572",
+ (get_key_offset(pfrom_key) & (PAGE_SIZE - 1)) ==
+ 0);
+ assert("vs-1573",
+ ((get_key_offset(pto_key) + 1) & (PAGE_SIZE -
+ 1)) == 0);
+
+ if (kdata->left->node) {
+ /* update right delimiting key of left neighbor of extent item */
+ /*reiser4_key key; */
+
+ *key = *pto_key;
+ set_key_offset(key, get_key_offset(pto_key) + 1);
+
+ write_lock_dk(tree);
+ znode_set_rd_key(kdata->left->node, key);
+ write_unlock_dk(tree);
+ }
+
+ from_off = get_key_offset(pfrom_key) >> PAGE_SHIFT;
+ to_off = (get_key_offset(pto_key) + 1) >> PAGE_SHIFT;
+ retval = ITEM_HEAD_KILLED;
+ }
+
+ inode = kdata->inode;
+ assert("vs-1545", inode != NULL);
+ if (inode != NULL &&
+ !reiser4_inode_get_flag(inode, REISER4_FILE_IN_MIGRATION))
+ /*
+ * take care of pages and jnodes corresponding
+ * to the part of item being killed
+ */
+ reiser4_invalidate_pages(inode->i_mapping, from_off,
+ to_off - from_off,
+ kdata->params.truncate);
+ ext = extent_item(coord) + from;
+ offset =
+ (get_key_offset(min_item_key) +
+ reiser4_extent_size_at(coord, from)) >> PAGE_SHIFT;
+
+ assert("vs-1551", from_off >= offset);
+ assert("vs-1552", from_off - offset <= extent_get_width(ext));
+ skip = from_off - offset;
+ offset = from_off;
+
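+	/*
+	 * Walk the extent units overlapping the killed range: hole extents
+	 * are skipped, unallocated extents give back their fake-allocated
+	 * reservation, and allocated extents have their blocks deallocated
+	 * (deferred via BA_DEFER).
+	 */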
+ while (offset < to_off) {
+ length = extent_get_width(ext) - skip;
+ if (state_of_extent(ext) == HOLE_EXTENT) {
+ skip = 0;
+ offset += length;
+ ext++;
+ continue;
+ }
+
+ if (offset + length > to_off) {
+ length = to_off - offset;
+ }
+
+ inode_sub_blocks(inode, length);
+
+ if (state_of_extent(ext) == UNALLOCATED_EXTENT) {
+			/*
+			 * release fake-allocated space reserved for jnodes
+			 * corresponding to this unallocated extent
+			 */
+ fake_allocated2free(length, 0 /* unformatted */, subv);
+ skip = 0;
+ offset += length;
+ ext++;
+ continue;
+ }
+
+ assert("vs-1218", state_of_extent(ext) == ALLOCATED_EXTENT);
+
+ if (length != 0) {
+ start = extent_get_start(ext) + skip;
+			/*
+			 * the BA_DEFER flag is set because the blocks being
+			 * freed are not safe to reuse immediately
+			 */
+ reiser4_dealloc_blocks(&start, &length,
+ 0, /* not used */
+ BA_DEFER, /* unformatted with defer */
+ subv);
+ }
+ skip = 0;
+ offset += length;
+ ext++;
+ }
+ return retval;
+}
+
+/* item_plugin->b.kill_units */
+int
+kill_units_extent(coord_t * coord, pos_in_node_t from, pos_in_node_t to,
+ struct carry_kill_data *kdata, reiser4_key * smallest_removed,
+ reiser4_key * new_first)
+{
+ reiser4_extent *ext;
+ reiser4_key item_key;
+ pos_in_node_t count;
+ reiser4_key from_key, to_key;
+ const reiser4_key *pfrom_key, *pto_key;
+ loff_t off;
+ int result;
+
+ assert("vs-1541",
+ ((kdata->params.from_key == NULL && kdata->params.to_key == NULL)
+ || (kdata->params.from_key != NULL
+ && kdata->params.to_key != NULL)));
+
+ if (kdata->params.from_key) {
+ pfrom_key = kdata->params.from_key;
+ pto_key = kdata->params.to_key;
+ } else {
+ coord_t dup;
+
+ /* calculate key range of kill */
+ assert("vs-1549", from == coord->unit_pos);
+ unit_key_by_coord(coord, &from_key);
+ pfrom_key = &from_key;
+
+ coord_dup(&dup, coord);
+ dup.unit_pos = to;
+ max_unit_key_by_coord(&dup, &to_key);
+ pto_key = &to_key;
+ }
+
+ item_key_by_coord(coord, &item_key);
+
+#if REISER4_DEBUG
+ {
+ reiser4_key max_item_key;
+
+ max_item_key_by_coord(coord, &max_item_key);
+
+ if (new_first) {
+ /* head of item is to be cut */
+ assert("vs-1542", keyeq(pfrom_key, &item_key));
+ assert("vs-1538", keylt(pto_key, &max_item_key));
+ } else {
+ /* tail of item is to be cut */
+ assert("vs-1540", keygt(pfrom_key, &item_key));
+ assert("vs-1543", !keylt(pto_key, &max_item_key));
+ }
+ }
+#endif
+
+ if (smallest_removed)
+ *smallest_removed = *pfrom_key;
+
+ if (new_first) {
+ /* item head is cut. Item key will change. This new key is calculated here */
+ assert("vs-1556",
+ (get_key_offset(pto_key) & (PAGE_SIZE - 1)) ==
+ (PAGE_SIZE - 1));
+ *new_first = *pto_key;
+ set_key_offset(new_first, get_key_offset(new_first) + 1);
+ }
+
+ count = to - from + 1;
+ result = kill_hook_extent(coord, from, count, kdata);
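+	/*
+	 * kill_hook_extent() has invalidated pages and freed blocks. If only
+	 * part of the boundary unit falls into the killed range, that unit is
+	 * narrowed below instead of being removed, so it is excluded from the
+	 * count of units to cut.
+	 */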
+ if (result == ITEM_TAIL_KILLED) {
+ assert("vs-1553",
+ get_key_offset(pfrom_key) >=
+ get_key_offset(&item_key) +
+ reiser4_extent_size_at(coord, from));
+ off =
+ get_key_offset(pfrom_key) -
+ (get_key_offset(&item_key) +
+ reiser4_extent_size_at(coord, from));
+ if (off) {
+ /* unit @from is to be cut partially. Its width decreases */
+ ext = extent_item(coord) + from;
+ extent_set_width(find_data_subvol(coord), ext,
+ (off + PAGE_SIZE -
+ 1) >> PAGE_SHIFT);
+ count--;
+ }
+ } else {
+ __u64 max_to_offset;
+ __u64 rest;
+
+ assert("vs-1575", result == ITEM_HEAD_KILLED);
+ assert("", from == 0);
+ assert("",
+ ((get_key_offset(pto_key) + 1) & (PAGE_SIZE -
+ 1)) == 0);
+ assert("",
+ get_key_offset(pto_key) + 1 >
+ get_key_offset(&item_key) +
+ reiser4_extent_size_at(coord, to));
+ max_to_offset =
+ get_key_offset(&item_key) +
+ reiser4_extent_size_at(coord, to + 1) - 1;
+ assert("", get_key_offset(pto_key) <= max_to_offset);
+
+ rest =
+ (max_to_offset -
+ get_key_offset(pto_key)) >> PAGE_SHIFT;
+ if (rest) {
+ /* unit @to is to be cut partially */
+ ext = extent_item(coord) + to;
+
+ assert("", extent_get_width(ext) > rest);
+
+ if (state_of_extent(ext) == ALLOCATED_EXTENT)
+ extent_set_start(find_data_subvol(coord), ext,
+ extent_get_start(ext) +
+ (extent_get_width(ext) -
+ rest));
+
+ extent_set_width(find_data_subvol(coord), ext, rest);
+ count--;
+ }
+ }
+ return count * sizeof(reiser4_extent);
+}
+
+/* item_plugin->b.cut_units
+ this is too similar to kill_units_extent */
+int
+cut_units_extent(coord_t * coord, pos_in_node_t from, pos_in_node_t to,
+ struct carry_cut_data *cdata, reiser4_key * smallest_removed,
+ reiser4_key * new_first)
+{
+ reiser4_extent *ext;
+ reiser4_key item_key;
+ pos_in_node_t count;
+ reiser4_key from_key, to_key;
+ const reiser4_key *pfrom_key, *pto_key;
+ loff_t off;
+
+ assert("vs-1541",
+ ((cdata->params.from_key == NULL && cdata->params.to_key == NULL)
+ || (cdata->params.from_key != NULL
+ && cdata->params.to_key != NULL)));
+
+ if (cdata->params.from_key) {
+ pfrom_key = cdata->params.from_key;
+ pto_key = cdata->params.to_key;
+ } else {
+ coord_t dup;
+
+		/* calculate key range of cut */
+ coord_dup(&dup, coord);
+ dup.unit_pos = from;
+ unit_key_by_coord(&dup, &from_key);
+
+ dup.unit_pos = to;
+ max_unit_key_by_coord(&dup, &to_key);
+
+ pfrom_key = &from_key;
+ pto_key = &to_key;
+ }
+
+ assert("vs-1555",
+ (get_key_offset(pfrom_key) & (PAGE_SIZE - 1)) == 0);
+ assert("vs-1556",
+ (get_key_offset(pto_key) & (PAGE_SIZE - 1)) ==
+ (PAGE_SIZE - 1));
+
+ item_key_by_coord(coord, &item_key);
+
+#if REISER4_DEBUG
+ {
+ reiser4_key max_item_key;
+
+ assert("vs-1584",
+ get_key_locality(pfrom_key) ==
+ get_key_locality(&item_key));
+ assert("vs-1585",
+ get_key_type(pfrom_key) == get_key_type(&item_key));
+ assert("vs-1586",
+ get_key_objectid(pfrom_key) ==
+ get_key_objectid(&item_key));
+ assert("vs-1587",
+ get_key_ordering(pfrom_key) ==
+ get_key_ordering(&item_key));
+
+ max_item_key_by_coord(coord, &max_item_key);
+
+ if (new_first != NULL) {
+ /* head of item is to be cut */
+ assert("vs-1542", keyeq(pfrom_key, &item_key));
+ assert("vs-1538", keylt(pto_key, &max_item_key));
+ } else {
+ /* tail of item is to be cut */
+ assert("vs-1540", keygt(pfrom_key, &item_key));
+ assert("vs-1543", keyeq(pto_key, &max_item_key));
+ }
+ }
+#endif
+
+ if (smallest_removed)
+ *smallest_removed = *pfrom_key;
+
+ if (new_first) {
+ /* item head is cut. Item key will change. This new key is calculated here */
+ *new_first = *pto_key;
+ set_key_offset(new_first, get_key_offset(new_first) + 1);
+ }
+
+ count = to - from + 1;
+
+ assert("vs-1553",
+ get_key_offset(pfrom_key) >=
+ get_key_offset(&item_key) + reiser4_extent_size_at(coord, from));
+ off =
+ get_key_offset(pfrom_key) - (get_key_offset(&item_key) +
+ reiser4_extent_size_at(coord, from));
+ if (off) {
+ /* tail of unit @from is to be cut partially. Its width decreases */
+ assert("vs-1582", new_first == NULL);
+ ext = extent_item(coord) + from;
+ extent_set_width(find_data_subvol(coord), ext,
+ off >> PAGE_SHIFT);
+ count--;
+ }
+
+ assert("vs-1554",
+ get_key_offset(pto_key) <=
+ get_key_offset(&item_key) +
+ reiser4_extent_size_at(coord, to + 1) - 1);
+ off =
+ (get_key_offset(&item_key) +
+ reiser4_extent_size_at(coord, to + 1) - 1) -
+ get_key_offset(pto_key);
+ if (off) {
+		/* @to_key is smaller than the max key of unit @to. Unit @to will not be removed;
+		   its start is increased and its width decreased. */
+ assert("vs-1583", (off & (PAGE_SIZE - 1)) == 0);
+ ext = extent_item(coord) + to;
+ if (state_of_extent(ext) == ALLOCATED_EXTENT)
+ extent_set_start(find_data_subvol(coord), ext,
+ extent_get_start(ext) +
+ (extent_get_width(ext) -
+ (off >> PAGE_SHIFT)));
+
+ extent_set_width(find_data_subvol(coord), ext,
+ (off >> PAGE_SHIFT));
+ count--;
+ }
+ return count * sizeof(reiser4_extent);
+}
+
+/* item_plugin->b.unit_key */
+reiser4_key *unit_key_extent(const coord_t * coord, reiser4_key * key)
+{
+ assert("vs-300", coord_is_existing_unit(coord));
+
+ item_key_by_coord(coord, key);
+ set_key_offset(key,
+ (get_key_offset(key) +
+ reiser4_extent_size_at(coord, coord->unit_pos)));
+
+ return key;
+}
+
+/* item_plugin->b.max_unit_key */
+reiser4_key *max_unit_key_extent(const coord_t * coord, reiser4_key * key)
+{
+ assert("vs-300", coord_is_existing_unit(coord));
+
+ item_key_by_coord(coord, key);
+ set_key_offset(key,
+ (get_key_offset(key) +
+ reiser4_extent_size_at(coord, coord->unit_pos + 1) - 1));
+ return key;
+}
+
+#if REISER4_DEBUG
+
+/* item_plugin->b.check
+   used for debugging; every item should provide here the most complete
+   possible check of the consistency of the item that its author can
+   construct
+*/
+int reiser4_check_extent(const coord_t *coord /* coord of item to check */,
+ const char **error /* where to store error message */)
+{
+ reiser4_extent *ext, *first;
+ unsigned i, j;
+ reiser4_block_nr blk_cnt;
+ unsigned num_units;
+ oid_t oid;
+ reiser4_key key;
+ coord_t scan;
+ reiser4_subvol *subv;
+
+ assert("vs-933", REISER4_DEBUG);
+
+ if (znode_get_level(coord->node) != TWIG_LEVEL) {
+ *error = "Extent on the wrong level";
+ return -1;
+ }
+ if (item_length_by_coord(coord) % sizeof(reiser4_extent) != 0) {
+ *error = "Wrong item size";
+ return -1;
+ }
+ ext = first = extent_item(coord);
+ num_units = coord_num_units(coord);
+ item_key_by_coord(coord, &key);
+ oid = get_key_objectid(&key);
+ coord_dup(&scan, coord);
+ subv = find_data_subvol(coord);
+ blk_cnt = reiser4_subvol_block_count(subv);
+
+ for (i = 0; i < num_units; ++i, ++ext) {
+ __u64 index;
+ reiser4_block_nr start, width;
+
+ scan.unit_pos = i;
+ index = extent_unit_index(&scan);
+ /*
+ * check that all jnodes are present for the unallocated
+ * extent and that subvolumes are set properly
+ */
+ if (state_of_extent(ext) == UNALLOCATED_EXTENT) {
+ for (j = 0; j < extent_get_width(ext); j++) {
+ jnode *node;
+
+ node = jlookup(oid, index + j);
+ if (node == NULL) {
+ print_coord("scan", &scan, 0);
+ *error = "Jnode missing";
+ return -1;
+ }
+ jput(node);
+ }
+ }
+ start = extent_get_start(ext);
+ if (start < 2)
+ continue;
+ /*
+		 * this is an allocated extent
+ */
+ width = extent_get_width(ext);
+ if (start >= blk_cnt) {
+ *error = "Start too large";
+ return -1;
+ }
+ if (start + width > blk_cnt) {
+ *error = "End too large";
+ return -1;
+ }
+ /*
+ * make sure that this extent does not overlap
+		 * with other allocated extents
+ */
+ for (j = 0; j < i; j++) {
+ if (state_of_extent(first + j) != ALLOCATED_EXTENT)
+ continue;
+ if (!((extent_get_start(ext) >=
+ extent_get_start(first + j) +
+ extent_get_width(first + j)) ||
+ (extent_get_start(ext) +
+ extent_get_width(ext) <=
+ extent_get_start(first + j)))) {
+ *error = "Extent overlaps with others";
+ return -1;
+ }
+ }
+ }
+ return 0;
+}
+
+#endif /* REISER4_DEBUG */
+
+/*
+ Local variables:
+ c-indentation-style: "K&R"
+ mode-name: "LC"
+ c-basic-offset: 8
+ tab-width: 8
+ fill-column: 120
+ scroll-step: 1
+ End:
+*/
diff -urN --no-dereference linux-5.10.2.orig/fs/reiser4/plugin/item/extent_stripe_ops.c linux-5.10.2/fs/reiser4/plugin/item/extent_stripe_ops.c
--- linux-5.10.2.orig/fs/reiser4/plugin/item/extent_stripe_ops.c 1970-01-01 01:00:00.000000000 +0100
+++ linux-5.10.2/fs/reiser4/plugin/item/extent_stripe_ops.c 2020-12-23 16:07:46.128813275 +0100
@@ -0,0 +1,701 @@
+/*
+ Copyright (c) 2018-2020 Eduard O. Shishkin
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+#include "item.h"
+#include "../../inode.h"
+#include "../../page_cache.h"
+#include "../object.h"
+#include "../volume/volume.h"
+#include <linux/swap.h>
+
+void check_uf_coord(const uf_coord_t *uf_coord, const reiser4_key *key);
+void check_jnodes(znode *twig, const reiser4_key *key, int count);
+size_t filemap_copy_from_user(struct page *page, unsigned long offset,
+ const char __user *buf, unsigned bytes);
+int find_stripe_item(hint_t *hint, const reiser4_key *key,
+ znode_lock_mode lock_mode, struct inode *inode);
+reiser4_block_nr estimate_write_stripe_meta(int count);
+int update_item_key(coord_t *target, const reiser4_key *key);
+
+int try_merge_with_right_item(coord_t *left)
+{
+ coord_t right;
+
+ coord_dup(&right, left);
+
+ if (coord_next_item(&right))
+ /*
+		 * there are no items to the right
+ */
+ return 0;
+ if (are_items_mergeable(left, &right)) {
+ node_plugin_by_node(left->node)->merge_items(left, &right);
+ znode_make_dirty(left->node);
+ }
+ return 0;
+}
+
+int try_merge_with_left_item(coord_t *right)
+{
+ coord_t left;
+
+ coord_dup(&left, right);
+
+ if (coord_prev_item(&left))
+ /*
+		 * there are no items to the left
+ */
+ return 0;
+
+ if (are_items_mergeable(&left, right)) {
+ node_plugin_by_node(left.node)->merge_items(&left, right);
+ znode_make_dirty(right->node);
+ }
+ return 0;
+}
+
+static inline int can_push_left(const coord_t *coord, const reiser4_key *key)
+{
+ reiser4_key akey;
+
+ return keyeq(key, append_key_extent(coord, &akey));
+}
+
+static inline int can_push_right(const coord_t *coord, const reiser4_key *key)
+{
+ coord_t right;
+ reiser4_key ikey;
+ reiser4_key pkey;
+
+ coord_dup(&right, coord);
+
+ if (coord_next_item(&right))
+ /*
+		 * there are no items to the right
+ */
+ return 0;
+
+ memcpy(&pkey, key, sizeof(*key));
+ set_key_offset(&pkey, get_key_offset(key) + PAGE_SIZE);
+
+ return keyeq(&pkey, item_key_by_coord(&right, &ikey));
+}
+
+/**
+ * Place a pointer to one unallocated physical block to the storage tree
+ *
+ * @key: key of the pointer to push
+ * @uf_coord: location to push (was found by coord_by_key())
+ *
+ * Pre-condition: the logical block is not yet represented by any pointer
+ * in the storage tree (thus, the procedure looks like "plugging a hole")
+ *
+ * First, try to push the pointer to existing items. If impossible, then
+ * create a new extent item
+ */
+static int plug_hole_stripe(coord_t *coord, lock_handle *lh,
+ const reiser4_key *key)
+{
+ int ret = 0;
+ reiser4_extent *ext;
+ reiser4_extent new_ext;
+ reiser4_item_data idata;
+ //ON_DEBUG(const char *error);
+
+ assert("edward-2052", !coord_is_existing_unit(coord));
+
+ if (coord->between != AFTER_UNIT) {
+ /*
+		 * there are no file items at the left (in physical
+ * order), thus we are on the leaf level, where the
+ * search procedure has landed. So, use a carry_extent
+ * primitive to insert a new extent item.
+ */
+ znode *twig_node;
+ assert("edward-2053", znode_is_loaded(coord->node));
+ assert("edward-2054", coord->node->level == LEAF_LEVEL);
+ BUG_ON(coord->node->level != LEAF_LEVEL);
+
+ reiser4_set_extent(subvol_by_key(key), &new_ext,
+ UNALLOCATED_EXTENT_START, 1);
+ init_new_extent(EXTENT41_POINTER_ID, &idata, &new_ext, 1);
+ ret = insert_extent_by_coord(coord, &idata, key, lh);
+ if (ret)
+ return ret;
+ /*
+ * A new extent item has been inserted on the twig level.
+ * To merge it with an item at the right we need to find
+ * the insertion point, as carry_extent primitive doesn't
+ * provide it (only lock handle).
+ */
+ twig_node = lh->node;
+ assert("edward-2073", twig_node != coord->node);
+
+ ret = zload(twig_node);
+ if (ret)
+ return ret;
+ coord_init_zero(coord);
+ ret = node_plugin_by_node(twig_node)->lookup(twig_node,
+ key,
+ FIND_EXACT,
+ coord);
+ BUG_ON(ret != NS_FOUND);
+ assert("edward-2074", twig_node == coord->node);
+
+ try_merge_with_right_item(coord);
+#if 0
+ assert("edward-2352",
+ check_node40(twig_node,
+ REISER4_NODE_TREE_STABLE, &error) == 0);
+#endif
+ zrelse(twig_node);
+ return 0;
+ }
+ /*
+ * We are on the twig level.
+ * First, try to push the pointer to existing extent items
+ */
+ assert("edward-2057", item_is_extent(coord));
+
+ if (can_push_left(coord, key)) {
+ /*
+ * push to the end of current item
+ */
+ coord->unit_pos = coord_last_unit_pos(coord);
+ ext = extent_by_coord(coord);
+
+ assert("edward-2267",
+ subvol_by_key(key) == find_data_subvol(coord));
+
+ if ((state_of_extent(ext) == UNALLOCATED_EXTENT)) {
+ /*
+ * fast paste without carry
+ */
+ extent_set_width(subvol_by_key(key), ext,
+ extent_get_width(ext) + 1);
+ znode_make_dirty(coord->node);
+ } else {
+ /*
+ * paste with possible carry
+ */
+ coord->between = AFTER_UNIT;
+ reiser4_set_extent(subvol_by_key(key), &new_ext,
+ UNALLOCATED_EXTENT_START, 1);
+ init_new_extent(EXTENT41_POINTER_ID,
+ &idata, &new_ext, 1);
+ ret = insert_into_item(coord, lh, key, &idata, 0);
+ if (ret)
+ return ret;
+ }
+ return WITH_DATA(lh->node, try_merge_with_right_item(coord));
+
+ } else if (can_push_right(coord, key)) {
+ /*
+ * push to the beginning of the item at right
+ */
+ coord_next_item(coord);
+ ext = extent_by_coord(coord);
+
+ if ((state_of_extent(ext) == UNALLOCATED_EXTENT)) {
+ /*
+ * fast paste
+ */
+ extent_set_width(subvol_by_key(key), ext,
+ extent_get_width(ext) + 1);
+ /*
+ * since we push to the beginning of item,
+ * we need to update its key
+ */
+ return update_item_key(coord, key);
+ } else {
+ /*
+ * paste with possible carry
+ */
+ coord->between = BEFORE_UNIT;
+ reiser4_set_extent(subvol_by_key(key), &new_ext,
+ UNALLOCATED_EXTENT_START, 1);
+ init_new_extent(EXTENT41_POINTER_ID,
+ &idata, &new_ext, 1);
+ return insert_into_item(coord, lh, key, &idata, 0);
+ }
+ /*
+		 * note that the resulting item is not mergeable with the item
+		 * at the left (otherwise we would fall into the can_push_left()
+ * branch above)
+ */
+ } else {
+ /*
+ * we can't push to existing items, so create a new one
+ */
+ reiser4_set_extent(subvol_by_key(key), &new_ext,
+ UNALLOCATED_EXTENT_START, 1);
+ init_new_extent(EXTENT41_POINTER_ID, &idata, &new_ext, 1);
+ ret = insert_by_coord(coord, &idata, key, lh, 0);
+ if (ret)
+ return ret;
+ /*
+		 * it could happen that the newly created item ended up
+		 * in a neighbor node, where it is mergeable with the
+		 * item at the right
+ */
+ return WITH_DATA(lh->node, try_merge_with_right_item(coord));
+ }
+}
+
+static int __update_extent_stripe(uf_coord_t *uf_coord, const reiser4_key *key,
+ jnode *node, int *hole_plugged,
+ reiser4_subvol *subv)
+{
+ int ret;
+ reiser4_block_nr block;
+ struct atom_brick_info *abi;
+
+ assert("edward-2468", subv == current_origin(get_key_ordering(key)));
+ assert("edward-2220", node->subvol == NULL || node->subvol == subv);
+
+ if (uf_coord->coord.between != AT_UNIT) {
+ /*
+ * block pointer is not represented by any item in the tree
+ */
+ if (*jnode_get_block(node)) {
+ /*
+			 * FIXME: explain in detail how such jnodes appear
+ */
+ spin_lock_jnode(node);
+ node->blocknr = 0;
+ node->subvol = NULL;
+ reiser4_uncapture_jnode(node);
+ }
+ assert("edward-2469", node->subvol == NULL);
+
+ uf_coord->valid = 0;
+ inode_add_blocks(mapping_jnode(node)->host, 1);
+ ret = plug_hole_stripe(&uf_coord->coord, uf_coord->lh, key);
+ if (ret)
+ return ret;
+
+ block = fake_blocknr_unformatted(1, subv);
+ jnode_set_block(node, &block);
+ jnode_set_subvol(node, subv);
+
+ if (hole_plugged)
+ *hole_plugged = 1;
+ JF_SET(node, JNODE_CREATED);
+
+ } else if (*jnode_get_block(node) == 0) {
+ reiser4_extent *ext;
+ struct extent_coord_extension *ext_coord;
+
+ assert("edward-2470", node->subvol == NULL);
+
+ ext_coord = ext_coord_by_uf_coord(uf_coord);
+ check_uf_coord(uf_coord, NULL);
+ ext = (reiser4_extent *)(zdata(uf_coord->coord.node) +
+ uf_coord->extension.extent.ext_offset);
+ if (state_of_extent(ext) != ALLOCATED_EXTENT)
+ return RETERR(-EIO);
+
+ block = extent_get_start(ext) + ext_coord->pos_in_unit;
+ jnode_set_block(node, &block);
+ jnode_set_subvol(node, subv);
+ }
+ /*
+	 * make sure that the locked twig node contains the jnode
+ * we are about to capture
+ */
+ ON_DEBUG(check_jnodes(uf_coord->lh->node, key, 1));
+
+ ret = check_insert_atom_brick_info(node->subvol->id, &abi);
+ if (ret)
+ return ret;
+
+ spin_lock_jnode(node);
+ ret = reiser4_try_capture(node, ZNODE_WRITE_LOCK, 0);
+ BUG_ON(ret != 0);
+ jnode_make_dirty_locked(node);
+ spin_unlock_jnode(node);
+#if REISER4_DEBUG
+ if (uf_coord->valid)
+ check_uf_coord(uf_coord, key);
+#endif
+ return 0;
+}
+
+/**
+ * Determine on which brick a data page will be stored,
+ * and reserve space on that brick.
+ */
+static int locate_reserve_data(coord_t *coord, lock_handle *lh,
+ reiser4_key *key, struct inode *inode,
+ loff_t pos, jnode *node,
+ reiser4_subvol **loc, unsigned flags)
+{
+ int ret;
+
+ if (coord->between == AT_UNIT) {
+ ret = zload(coord->node);
+ if (ret)
+ return ret;
+ *loc = find_data_subvol(coord);
+ zrelse(coord->node);
+ assert("edward-2360",
+ ergo(node->subvol, node->subvol == *loc));
+ } else if (reiser4_is_set(reiser4_get_current_sb(),
+ REISER4_PROXY_IO) &&
+ !(flags & UPX_PROXY_FULL))
+ *loc = get_proxy_subvol();
+ else
+ *loc = calc_data_subvol(inode, pos);
+
+ assert("edward-2361", *loc != NULL);
+ /*
+ * Now we can reserve space on @loc.
+ * Note that in the case of truncate the space
+	 * has already been reserved in shorten_stripe()
+ */
+ if (flags & UPX_TRUNCATE)
+ return 0;
+ grab_space_enable();
+ return reiser4_grab_space(1 /* count */,
+ 0 /* flags */,
+ *loc /* where */);
+}
+
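+/*
+ * When non-zero, update_extent_stripe() locates the item with the
+ * hint-based find_stripe_item() instead of find_file_item_nohint()
+ */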
+#define FAST_SEQ_WRITE (1)
+
+/**
+ * Update the file body for the block described by jnode @node.
+ * Return 0 on success.
+ */
+int update_extent_stripe(struct hint *hint, struct inode *inode,
+ jnode *node, int *plugged_hole, unsigned flags)
+{
+ int ret = 0;
+ reiser4_key key;
+ loff_t off;
+ znode *loaded;
+ reiser4_subvol *dsubv = NULL;
+
+ off = ((loff_t)index_jnode(node) << PAGE_SHIFT);
+ /*
+ * construct non-precise key
+ */
+ build_body_key_stripe(inode, off, &key);
+
+#if FAST_SEQ_WRITE
+ ret = find_stripe_item(hint, &key, ZNODE_WRITE_LOCK, inode);
+#else
+ ret = find_file_item_nohint(&hint->ext_coord.coord,
+ hint->ext_coord.lh, &key,
+ ZNODE_WRITE_LOCK, inode);
+#endif
+ if (IS_CBKERR(ret))
+ return RETERR(-EIO);
+ /*
+ * reserve space for data
+ */
+ ret = locate_reserve_data(&hint->ext_coord.coord,
+ hint->ext_coord.lh, &key,
+ inode, off, node,
+ &dsubv, flags);
+ if (ret) {
+ done_lh(hint->ext_coord.lh);
+ return ret;
+ }
+ assert("edward-2284", dsubv != NULL);
+ assert("edward-2362",
+ ergo(node->subvol, node->subvol == dsubv));
+ /*
+	 * Now that we know the location of the data block, make the key precise
+ */
+ set_key_ordering(&key, dsubv->id);
+
+ loaded = hint->ext_coord.coord.node;
+ ret = zload(loaded);
+ if (ret) {
+ done_lh(hint->ext_coord.lh);
+ return ret;
+ }
+ if (hint->ext_coord.coord.between == AT_UNIT &&
+ !hint->ext_coord.valid)
+ init_coord_extension_extent(&hint->ext_coord,
+ get_key_offset(&key));
+ /*
+ * "overwrite" a block pointer, or create a new one,
+ * if it doesn't exist
+ */
+ ret = __update_extent_stripe(&hint->ext_coord, &key, node,
+ plugged_hole, dsubv);
+ zrelse(loaded);
+ if (ret == -ENOSPC) {
+ done_lh(hint->ext_coord.lh);
+ return ret;
+ } else if (ret) {
+ reiser4_unset_hint(hint);
+ return ret;
+ }
+ loaded = hint->lh.node;
+ ret = zload(loaded);
+ if (unlikely(ret)) {
+ done_lh(hint->ext_coord.lh);
+ return ret;
+ }
+ /*
+ * at this point a block pointer with @key always
+ * exists in the storage tree
+ */
+ if (hint->ext_coord.valid == 0) {
+ hint->ext_coord.coord.between = AT_UNIT;
+ init_coord_extension_extent(&hint->ext_coord,
+ get_key_offset(&key));
+ }
+ /*
+	 * @hint->ext_coord points to the block pointer
+	 * we have just processed.
+	 * Seal the coord and unlock the znode.
+ */
+ reiser4_set_hint(hint, &key, ZNODE_WRITE_LOCK);
+ zrelse(loaded);
+ /*
+ * Update key for the next iteration.
+ * We don't know location of the next data block,
+ * so set maximal ordering value.
+ */
+ set_key_offset(&key, off + PAGE_SIZE);
+ set_key_ordering(&key, KEY_ORDERING_MASK);
+ return 0;
+}
+
+int find_or_create_extent_stripe(struct page *page, unsigned flags)
+{
+ int ret;
+ struct inode *inode;
+ int plugged_hole = 0;
+ struct hint hint;
+ jnode *node;
+
+ assert("edward-2372", page->mapping && page->mapping->host);
+
+ hint_init_zero(&hint);
+ inode = page->mapping->host;
+
+ lock_page(page);
+ node = jnode_of_page(page);
+ if (IS_ERR(node)) {
+ unlock_page(page);
+ return PTR_ERR(node);
+ }
+ JF_SET(node, JNODE_WRITE_PREPARED);
+ unlock_page(page);
+
+ ret = update_extent_stripe(&hint, inode, node,
+ &plugged_hole, flags);
+ JF_CLR(node, JNODE_WRITE_PREPARED);
+
+ if (ret) {
+ jput(node);
+ warning("edward-1549",
+ "failed to update extent (%d)", ret);
+ return ret;
+ }
+ if (plugged_hole)
+ reiser4_update_sd(inode);
+
+ BUG_ON(node->atom == NULL);
+
+ if (get_current_context()->entd) {
+ entd_context *ent = get_entd_context(inode->i_sb);
+
+ if (ent->cur_request->page == page)
+ /*
+ * the following reference will be
+ * dropped in reiser4_writeout
+ */
+ ent->cur_request->node = jref(node);
+ }
+ jput(node);
+ return 0;
+}
+
+/*
+ * Non-exclusive access to the file must be acquired
+ */
+ssize_t write_extent_stripe(struct file *file, struct inode *inode,
+ const char __user *buf, size_t count,
+ loff_t *pos, unsigned flags)
+{
+ int nr_pages;
+ int nr_dirty = 0;
+ struct page *page;
+ jnode *jnodes[DEFAULT_WRITE_GRANULARITY + 1];
+ unsigned long index;
+ unsigned long end;
+ int i;
+ int to_page, page_off;
+ size_t left = count;
+ int ret = 0;
+ struct hint hint;
+
+ if (count == 0)
+ return 0;
+
+ ret = load_file_hint(file, &hint);
+ if (ret)
+ return ret;
+ /*
+ * calculate number of pages which are to be written
+ */
+ index = *pos >> PAGE_SHIFT;
+ end = ((*pos + count - 1) >> PAGE_SHIFT);
+ nr_pages = end - index + 1;
+ assert("edward-2363", nr_pages <= DEFAULT_WRITE_GRANULARITY + 1);
+
+ if (count == 0)
+ return 0;
+ /*
+	 * First of all, reserve space on the meta-data brick.
+	 * In particular, it is needed by the search procedure
+	 * to "drill" down to the leaf level.
+ */
+ grab_space_enable();
+ ret = reiser4_grab_space(estimate_write_stripe_meta(nr_pages),
+ 0, /* flags */
+ get_meta_subvol() /* where */);
+ if (ret)
+ return ret;
+ BUG_ON(get_current_context()->trans->atom != NULL);
+
+ /* get pages and jnodes */
+ for (i = 0; i < nr_pages; i ++) {
+ page = find_or_create_page(inode->i_mapping, index + i,
+ reiser4_ctx_gfp_mask_get());
+ if (page == NULL) {
+ nr_pages = i;
+ ret = RETERR(-ENOMEM);
+ goto out;
+ }
+ jnodes[i] = jnode_of_page(page);
+ if (IS_ERR(jnodes[i])) {
+ unlock_page(page);
+ put_page(page);
+ nr_pages = i;
+ ret = RETERR(-ENOMEM);
+ goto out;
+ }
+ /* prevent jnode and page from disconnecting */
+ JF_SET(jnodes[i], JNODE_WRITE_PREPARED);
+ unlock_page(page);
+ }
+ BUG_ON(get_current_context()->trans->atom != NULL);
+
+ page_off = (*pos & (PAGE_SIZE - 1));
+ for (i = 0; i < nr_pages; i ++) {
+ size_t written;
+ to_page = PAGE_SIZE - page_off;
+ if (to_page > left)
+ to_page = left;
+ page = jnode_page(jnodes[i]);
+ if (page_offset(page) < inode->i_size &&
+ !PageUptodate(page) && to_page != PAGE_SIZE) {
+ /*
+			 * the above is not optimal for a partial write to the
+			 * last page of the file when the file size is not at a
+			 * page boundary
+ */
+ lock_page(page);
+ if (!PageUptodate(page)) {
+ ret = readpage_stripe(NULL, page);
+ assert("edward-2364", ret == 0);
+ BUG_ON(ret != 0);
+ /* wait for read completion */
+ lock_page(page);
+ BUG_ON(!PageUptodate(page));
+ } else
+ ret = 0;
+ unlock_page(page);
+ }
+
+ BUG_ON(get_current_context()->trans->atom != NULL);
+ fault_in_pages_readable(buf, to_page);
+ BUG_ON(get_current_context()->trans->atom != NULL);
+
+ lock_page(page);
+ if (!PageUptodate(page) && to_page != PAGE_SIZE)
+ zero_user_segments(page, 0, page_off,
+ page_off + to_page,
+ PAGE_SIZE);
+
+ written = filemap_copy_from_user(page, page_off, buf, to_page);
+ if (unlikely(written != to_page)) {
+ unlock_page(page);
+ ret = RETERR(-EFAULT);
+ break;
+ }
+
+ flush_dcache_page(page);
+ set_page_dirty_notag(page);
+ unlock_page(page);
+ nr_dirty ++;
+
+ mark_page_accessed(page);
+ SetPageUptodate(page);
+
+ page_off = 0;
+ buf += to_page;
+ left -= to_page;
+ BUG_ON(get_current_context()->trans->atom != NULL);
+ }
+
+ left = count;
+ page_off = (*pos & (PAGE_SIZE - 1));
+
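+	/*
+	 * for each dirtied page insert or update the corresponding
+	 * block pointer in the storage tree
+	 */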
+ for (i = 0; i < nr_dirty; i ++) {
+ to_page = PAGE_SIZE - page_off;
+ if (to_page > left)
+ to_page = left;
+
+ assert("edward-2063", reiser4_lock_counters()->d_refs == 0);
+
+ ret = update_extent_stripe(&hint, inode, jnodes[i],
+ NULL, flags);
+
+ assert("edward-2065", reiser4_lock_counters()->d_refs == 0);
+ assert("edward-2365",
+ ret == -ENOSPC || ret == -EBUSY || ret >= 0);
+
+ if (ret)
+ break;
+ page_off = 0;
+ left -= to_page;
+ }
+ out:
+ for (i = 0; i < nr_pages; i ++) {
+ put_page(jnode_page(jnodes[i]));
+ JF_CLR(jnodes[i], JNODE_WRITE_PREPARED);
+ jput(jnodes[i]);
+ }
+ if (!ret)
+ save_file_hint(file, &hint);
+ /*
+	 * the only errors handled so far are ENOMEM and
+ * EFAULT on copy_from_user
+ */
+ return (count - left) ? (count - left) : ret;
+}
+
+/*
+ * Local variables:
+ * c-indentation-style: "K&R"
+ * mode-name: "LC"
+ * c-basic-offset: 8
+ * tab-width: 8
+ * fill-column: 79
+ * scroll-step: 1
+ * End:
+ */
diff -urN --no-dereference linux-5.10.2.orig/fs/reiser4/plugin/item/extent_volume_ops.c linux-5.10.2/fs/reiser4/plugin/item/extent_volume_ops.c
--- linux-5.10.2.orig/fs/reiser4/plugin/item/extent_volume_ops.c 1970-01-01 01:00:00.000000000 +0100
+++ linux-5.10.2/fs/reiser4/plugin/item/extent_volume_ops.c 2020-12-23 16:07:46.128813275 +0100
@@ -0,0 +1,788 @@
+/*
+ Copyright (c) 2017-2020 Eduard O. Shishkin
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+#include "item.h"
+#include "../../inode.h"
+#include "../../page_cache.h"
+#include "../object.h"
+#include "../volume/volume.h"
+
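+/*
+ * maximal number of blocks (pages) of an item migrated in one
+ * iteration, see what_to_do() and do_migrate_extent()
+ */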
+#define MIGRATION_GRANULARITY (8192)
+
+int try_merge_with_right_item(coord_t *left);
+int try_merge_with_left_item(coord_t *right);
+int split_extent_unit(coord_t *coord, reiser4_block_nr pos, int adv_to_right);
+int update_item_key(coord_t *target, const reiser4_key *key);
+
+/*
+ * primitive migration operations over item
+ */
+enum migration_primitive_id {
+ INVALID_ACTION = 0,
+ MIGRATE_EXTENT = 1,
+ SPLIT_EXTENT = 2,
+ SKIP_EXTENT = 3
+};
+
+struct extent_migrate_context {
+ enum migration_primitive_id act;
+ struct page **pages;
+ int nr_pages;
+ coord_t *coord;
+ reiser4_key *key; /* key of extent item to be migrated */
+ struct inode *inode;
+ u32 new_loc;
+ loff_t stop_off; /* offset of the leftmost byte to be migrated
+ in the iteration */
+ loff_t done_off; /* offset of the latest byte migrated in the
+ iteration */
+ reiser4_block_nr blocks_migrated;
+ reiser4_block_nr unit_split_pos; /* position in unit */
+ lock_handle *lh;
+ unsigned int migrate_whole_item:1;
+ unsigned int stop:1;
+};
+
+struct extent_ra_ctx {
+ const coord_t *coord;
+ reiser4_extent *ext;
+ reiser4_block_nr off;
+};
+
+/**
+ * read a page pointed to by an extent item
+ */
+static int filler(void *data, struct page *page)
+{
+ struct extent_ra_ctx *ra_ctx = data;
+
+ return __reiser4_readpage_extent(ra_ctx->coord, ra_ctx->ext,
+ ra_ctx->off, page);
+}
+
+/**
+ * read all pages pointed to by extent unit @ext starting from @off_in_unit
+ * @idx: index of the first page pointed to by the extent unit
+ */
+static int readpages_extent_unit(const coord_t *coord, reiser4_extent *ext,
+ reiser4_block_nr off_in_unit,
+ struct address_space *mapping, pgoff_t idx,
+ struct page **pages, int off_in_pages)
+{
+ int i;
+ int ret;
+ struct extent_ra_ctx ra_ctx;
+ int nr_pages = extent_get_width(ext) - off_in_unit;
+
+ ra_ctx.coord = coord;
+ ra_ctx.ext = ext;
+ ra_ctx.off = off_in_unit;
+
+ for (i = 0; i < nr_pages; i++) {
+ struct page *page;
+ page = read_cache_page(mapping, idx + off_in_unit + i,
+ filler, &ra_ctx);
+ if (IS_ERR(page)) {
+ ret = PTR_ERR(page);
+ nr_pages = i;
+ goto error;
+ }
+ pages[i + off_in_pages] = page;
+ ra_ctx.off ++;
+ }
+ return 0;
+ error:
+ for (i = 0; i < nr_pages; i++)
+ put_page(pages[i + off_in_pages]);
+ return ret;
+}
+
+/**
+ * read and pin pages pointed to by the extent item at @coord
+ * starting from offset @off
+ */
+static int readpages_extent_item(const coord_t *coord, loff_t off,
+ struct address_space *mapping,
+ struct extent_migrate_context *mctx)
+{
+ int i;
+ int ret;
+ int nr_pages = 0;
+ reiser4_key key; /* key of the first unit to read from */
+ coord_t iter_coord;
+ reiser4_block_nr pos_in_unit;
+ pgoff_t idx;
+
+ coord_dup(&iter_coord, coord);
+ unit_key_by_coord(coord, &key);
+
+ pos_in_unit = 0;
+ if (get_key_offset(&key) < off) {
+ /*
+ * read from the middle of unit
+ */
+ pos_in_unit = mctx->unit_split_pos;
+ assert("edward-2403",
+ pos_in_unit ==
+ (off - get_key_offset(&key)) >> PAGE_SHIFT);
+ }
+ idx = get_key_offset(&key) >> PAGE_SHIFT;
+
+ while (iter_coord.unit_pos <= coord_last_unit_pos(&iter_coord)) {
+ reiser4_extent *ext = extent_by_coord(&iter_coord);
+
+ assert("edward-2404", pos_in_unit < extent_get_width(ext));
+
+ ret = readpages_extent_unit(coord, ext, pos_in_unit, mapping,
+ idx, mctx->pages, nr_pages);
+ if (ret)
+ goto error;
+
+ nr_pages += (extent_get_width(ext) - pos_in_unit);
+ idx += extent_get_width(ext);
+ iter_coord.unit_pos += 1;
+ pos_in_unit = 0;
+ }
+ assert("edward-2405", nr_pages == mctx->nr_pages);
+ return 0;
+ error:
+ for (i = 0; i < nr_pages; i++)
+ put_page(mctx->pages[i]);
+ return ret;
+}
+
+/**
+ * "cut off" a number of unformatted blocks at the end of extent item
+ * specified by @coord.
+ * @from_off: offset to cut from.
+ */
+static int cut_off_tail(coord_t *coord, struct inode *inode,
+ loff_t from_off)
+{
+ reiser4_key from, to;
+ coord_t from_coord;
+ coord_t to_coord;
+
+ coord_dup(&from_coord, coord);
+
+ coord_dup(&to_coord, coord);
+ to_coord.between = AT_UNIT;
+ to_coord.unit_pos = coord_last_unit_pos(coord);
+
+ item_key_by_coord(coord, &to);
+ set_key_offset(&to, get_key_offset(&to) +
+ reiser4_extent_size(coord) - 1);
+
+ from = to;
+ set_key_offset(&from, from_off);
+
+ return kill_node_content(&from_coord, &to_coord, &from, &to,
+ NULL, NULL, inode, 0);
+}
+
+static int migrate_blocks(struct extent_migrate_context *mctx)
+{
+ int i;
+ int ret;
+ reiser4_key key;
+ reiser4_extent new_ext;
+ reiser4_item_data idata;
+ reiser4_block_nr block;
+ int nr_jnodes = 0;
+ coord_t *coord = mctx->coord;
+ znode *loaded;
+ reiser4_subvol *new_subv = current_origin(mctx->new_loc);
+ struct atom_brick_info *abi;
+ ON_DEBUG(reiser4_key check_key);
+ ON_DEBUG(const char *error);
+
+ assert("edward-2406",
+ equi(mctx->migrate_whole_item,
+ keyeq(unit_key_by_coord(coord, &check_key), mctx->key) &&
+ mctx->stop_off == get_key_offset(mctx->key)));
+ /*
+ * Reserve space on the new data brick.
+ * Balancing procedure is allowed to fail with ENOSPC.
+ */
+ grab_space_enable();
+ ret = reiser4_grab_space(mctx->nr_pages, 0, new_subv);
+ if (ret)
+ return ret;
+ ret = readpages_extent_item(coord, mctx->stop_off,
+ mctx->inode->i_mapping, mctx);
+ if (ret)
+ return ret;
+
+ memcpy(&key, mctx->key, sizeof(key));
+ set_key_offset(&key, mctx->stop_off);
+ set_key_ordering(&key, mctx->new_loc);
+
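+	/*
+	 * pin a jnode for each page to be migrated and mark it
+	 * write-prepared
+	 */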
+ for (i = 0; i < mctx->nr_pages; i++) {
+ struct page *page = mctx->pages[i];
+ jnode *node;
+
+ assert("edward-2407", page != NULL);
+ assert("edward-2408",
+ page->index == (mctx->stop_off >> PAGE_SHIFT) + i);
+ lock_page(page);
+ node = jnode_of_page(page);
+ if (IS_ERR(node)) {
+ nr_jnodes = i;
+ unlock_page(page);
+ ret = PTR_ERR(node);
+ goto error;
+ }
+ JF_SET(node, JNODE_WRITE_PREPARED);
+ unlock_page(page);
+
+ }
+ nr_jnodes = mctx->nr_pages;
+
+ if (mctx->migrate_whole_item) {
+ reiser4_extent *ext;
+
+ assert("edward-2464", coord->unit_pos == 0);
+ assert("edward-2465", mctx->stop_off ==
+ get_key_offset(item_key_by_coord(coord, &check_key)));
+ /*
+ * cut all units except the first one;
+		 * deallocate all blocks pointed to by that first unit;
+		 * set that unit as an unallocated extent of proper width;
+		 * update the item's key to point to the new brick;
+		 * try to merge the resulting item with the items at the left
+		 * and right.
+ */
+ if (nr_units_extent(coord) > 1) {
+ ret = cut_off_tail(coord, mctx->inode, mctx->stop_off +
+ reiser4_extent_size_at(coord, 1));
+ if (ret)
+ goto error;
+ }
+ loaded = coord->node;
+ ret = zload(loaded);
+ if (ret)
+ goto error;
+
+ ext = extent_by_coord(coord);
+ if (state_of_extent(ext) == ALLOCATED_EXTENT) {
+ reiser4_block_nr start = extent_get_start(ext);
+ reiser4_block_nr len = extent_get_width(ext);
+
+ reiser4_dealloc_blocks(&start,
+ &len,
+ 0, BA_DEFER,
+ find_data_subvol(coord));
+ }
+ reiser4_set_extent(new_subv, ext,
+ UNALLOCATED_EXTENT_START, nr_jnodes);
+ ret = update_item_key(coord, &key);
+ if (ret) {
+ zrelse(loaded);
+ goto error;
+ }
+ try_merge_with_right_item(coord);
+ try_merge_with_left_item(coord);
+ assert("edward-2466",
+ check_node40(coord->node,
+ REISER4_NODE_TREE_STABLE, &error) == 0);
+ zrelse(loaded);
+ } else {
+ /*
+		 * cut off the tail, insert a new item at the end
+ */
+ ret = cut_off_tail(coord, mctx->inode, mctx->stop_off);
+ if (ret)
+ goto error;
+
+ coord_init_after_item(coord);
+
+ reiser4_set_extent(new_subv, &new_ext,
+ UNALLOCATED_EXTENT_START, mctx->nr_pages);
+ init_new_extent(EXTENT41_POINTER_ID, &idata, &new_ext, 1);
+ ret = insert_by_coord(coord, &idata, &key, mctx->lh, 0);
+ if (ret)
+ goto error;
+
+ loaded = coord->node;
+ ret = zload(loaded);
+ if (ret)
+ goto error;
+ assert("edward-2416",
+ keyeq(&key, item_key_by_coord(coord, &check_key)));
+ assert("edward-2424",
+ reiser4_extent_size(coord) == mctx->nr_pages << PAGE_SHIFT);
+
+ try_merge_with_right_item(coord);
+
+ assert("edward-2425",
+ check_node40(coord->node,
+ REISER4_NODE_TREE_STABLE, &error) == 0);
+ zrelse(loaded);
+ }
+ /*
+ * Capture jnodes, set new addresses for them,
+ * and make them dirty. At flush time all the
+	 * blocks will get a new location on the new brick.
+ */
+ ret = check_insert_atom_brick_info(new_subv->id, &abi);
+ if (ret)
+ goto error;
+ block = fake_blocknr_unformatted(mctx->nr_pages, new_subv);
+
+ for (i = 0; i < mctx->nr_pages; i++, block++) {
+ jnode *node = jprivate(mctx->pages[i]);
+
+ assert("edward-2417", node != NULL);
+
+ set_page_dirty_notag(mctx->pages[i]);
+
+ spin_lock_jnode(node);
+ JF_SET(node, JNODE_CREATED);
+ JF_CLR(node, JNODE_WRITE_PREPARED);
+
+ node->subvol = new_subv;
+ jnode_set_block(node, &block);
+
+ ret = reiser4_try_capture(node, ZNODE_WRITE_LOCK, 0);
+ BUG_ON(ret != 0);
+
+ jnode_make_dirty_locked(node);
+ spin_unlock_jnode(node);
+
+ jput(node);
+ put_page(mctx->pages[i]);
+ }
+ return 0;
+ error:
+ for (i = 0; i < mctx->nr_pages; i++) {
+ if (i < nr_jnodes)
+ jput(jprivate(mctx->pages[i]));
+ put_page(mctx->pages[i]);
+ }
+ return ret;
+}
+
+static int do_migrate_extent(struct extent_migrate_context *mctx)
+{
+ int ret = 0;
+ lock_handle *lh = mctx->lh;
+ coord_t *coord = mctx->coord;
+ znode *loaded;
+
+ assert("edward-2106", coord->node == lh->node);
+ assert("edward-2128",
+ get_key_ordering(mctx->key) != mctx->new_loc);
+
+ ret = zload(coord->node);
+ if (ret)
+ return ret;
+ loaded = coord->node;
+
+ mctx->nr_pages = (get_key_offset(mctx->key) +
+ reiser4_extent_size(coord) -
+ mctx->stop_off) >> PAGE_SHIFT;
+ mctx->pages = reiser4_vmalloc(sizeof(mctx->pages) * mctx->nr_pages);
+ if (!mctx->pages) {
+ zrelse(loaded);
+ return RETERR(-ENOMEM);
+ }
+ ret = migrate_blocks(mctx);
+
+ vfree(mctx->pages);
+ zrelse(loaded);
+ done_lh(lh);
+ if (ret)
+ return ret;
+ mctx->done_off = mctx->stop_off;
+ mctx->blocks_migrated += mctx->nr_pages;
+
+ reiser4_throttle_write(mctx->inode);
+ /*
+ * Release the rest of blocks we grabbed for the fulfilled
+	 * Release the rest of the blocks we grabbed for the completed
+ */
+ all_grabbed2free();
+ /*
+ * The next migrate-split iteration starts here.
+ * Grab disk space for meta-data blocks in this iteration. We grab
+ * from reserved area, as rebalancing can be launched on a volume
+ * with no free space.
+ */
+ ret = reserve_migration_iter();
+ if (ret)
+ return ret;
+ if (mctx->migrate_whole_item) {
+ /*
+ * no more blocks to be migrated in this item
+ */
+ mctx->stop = 1;
+ return 0;
+ }
+ /*
+ * go to the leftmost non-processed item
+ */
+ assert("edward-2418", mctx->done_off != 0);
+
+ set_key_offset(mctx->key, mctx->done_off - 1);
+ ret = find_file_item_nohint(coord, lh, mctx->key,
+ ZNODE_WRITE_LOCK, mctx->inode);
+ if (ret) {
+ /*
+ * item not found (killed by concurrent
+ * truncate, or error happened)
+ */
+ warning("edward-2318",
+ "Item not found after migration (%d)", ret);
+ done_lh(lh);
+ if (!IS_CBKERR(ret)) {
+ ret = 0;
+ mctx->stop = 1;
+ }
+ return ret;
+ }
+ /*
+	 * reset @mctx->key, as the item could have changed
+	 * while the lock was released.
+ */
+ ret = zload(coord->node);
+ if (ret)
+ return ret;
+ item_key_by_coord(coord, mctx->key);
+ zrelse(coord->node);
+ return 0;
+}
+
+/**
+ * Create a new extent item right after the item specified by @coord
+ * and move the tail part of the original item to that newly created item. It can
+ * involve carry, if there is no free space on the node. Subtle!
+ *
+ * @unit_split_pos: splitting position in the unit.
+ * The pair @coord and @unit_split_pos defines splitting position in the item.
+ * If @unit_split_pos != 0, then the unit at @coord will be split at
+ * @unit_split_pos offset and its right part will start the new item.
+ * Otherwise, we'll split at the unit boundary and the unit at @coord will be
+ * moved to the head of the new item.
+ *
+ * Upon successful completion:
+ * if @unit_split_pos != 0, then @coord points to the same unit, which
+ * became smaller after the split. Otherwise, @coord points to the preceding
+ * unit.
+ */
+static int split_extent_item(coord_t *coord, reiser4_block_nr unit_split_pos)
+{
+ int ret;
+ coord_t cut_from;
+ coord_t cut_to;
+ char *tail_copy;
+ char *tail_orig;
+ int tail_num_units;
+ int tail_len;
+ reiser4_item_data idata;
+ reiser4_key split_key;
+ reiser4_key item_key;
+ ON_DEBUG(reiser4_key check_key);
+
+ assert("edward-2109", znode_is_loaded(coord->node));
+ assert("edward-2143", ergo(unit_split_pos == 0, coord->unit_pos > 0));
+
+ memset(&idata, 0, sizeof(idata));
+ item_key_by_coord(coord, &item_key);
+ unit_key_by_coord(coord, &split_key);
+ set_key_offset(&split_key,
+ get_key_offset(&split_key) +
+ (unit_split_pos << current_blocksize_bits));
+
+ if (unit_split_pos != 0) {
+ /*
+		 * start by splitting the unit.
+		 * NOTE: it may change the item @coord (specifically, split
+		 * it and move its part to the right neighbor)
+ */
+ ret = split_extent_unit(coord, unit_split_pos,
+ 0 /* stay on the original position */);
+ if (ret)
+ return ret;
+ assert("edward-2110",
+ keyeq(&item_key, item_key_by_coord(coord, &check_key)));
+ /*
+ * check if it was the case of item splitting at desired offset
+ * (see the comment above).
+ */
+ if (reiser4_extent_size(coord) ==
+ get_key_offset(&split_key) - get_key_offset(&item_key))
+ /*
+ * item was split at specified offset - nothing to do
+ * any more
+ */
+ return 0;
+ assert("edward-2426", reiser4_extent_size(coord) >
+ get_key_offset(&split_key) - get_key_offset(&item_key));
+ /*
+ * unit at @coord decreased, number of units in the item
+ * got incremented
+ */
+ tail_orig =
+ node_plugin_by_node(coord->node)->item_by_coord(coord) +
+ (coord->unit_pos + 1) * sizeof(reiser4_extent);
+ tail_num_units = coord_num_units(coord) - coord->unit_pos - 1;
+ } else {
+ /*
+		 * none of the units is subject to splitting -
+		 * we'll split the item at a unit boundary.
+ */
+ tail_orig =
+ node_plugin_by_node(coord->node)->item_by_coord(coord) +
+ coord->unit_pos * sizeof(reiser4_extent);
+ tail_num_units = coord_num_units(coord) - coord->unit_pos;
+ }
+ assert("edward-2427", tail_num_units > 0);
+
+ tail_len = tail_num_units * sizeof(reiser4_extent);
+
+ tail_copy = kmalloc(tail_len, reiser4_ctx_gfp_mask_get());
+ if (!tail_copy)
+ return -ENOMEM;
+ memcpy(tail_copy, tail_orig, tail_len);
+ /*
+ * cut off the tail from the original item
+ */
+ coord_dup(&cut_from, coord);
+ if (unit_split_pos)
+ /* the original unit was split */
+ cut_from.unit_pos ++;
+ coord_dup(&cut_to, coord);
+ cut_to.unit_pos = coord_num_units(coord) - 1;
+ /*
+ * cut the original tail
+ */
+ cut_node_content(&cut_from, &cut_to, NULL, NULL, NULL);
+ /* make sure that @coord is valid after cut operation */
+ if (unit_split_pos == 0)
+ coord->unit_pos --;
+
+ assert("edward-2428",
+ get_key_offset(item_key_by_coord(coord, &check_key)) +
+ reiser4_extent_size(coord) == get_key_offset(&split_key));
+ /*
+ * finally, create a new item
+ */
+ init_new_extent(item_id_by_coord(&cut_from),
+ &idata, tail_copy, tail_num_units);
+ coord_init_after_item(&cut_from);
+
+ ret = insert_by_coord(&cut_from, &idata, &split_key,
+ 0 /* lh */, COPI_DONT_SHIFT_LEFT);
+ kfree(tail_copy);
+	return ret;
+}
+
+static int do_split_extent(struct extent_migrate_context *mctx)
+{
+ int ret;
+ znode *loaded;
+
+ loaded = mctx->coord->node;
+ ret = zload(loaded);
+ if (ret)
+ return ret;
+ ret = split_extent_item(mctx->coord, mctx->unit_split_pos);
+ zrelse(loaded);
+ return ret;
+}
+
+static void init_migration_context(struct extent_migrate_context *mctx,
+ struct inode *inode, coord_t *coord,
+ reiser4_key *key, lock_handle *lh)
+{
+ memset(mctx, 0, sizeof(*mctx));
+ mctx->coord = coord;
+ mctx->key = key;
+ mctx->inode = inode;
+ mctx->lh = lh;
+}
+
+static void reset_migration_context(struct extent_migrate_context *mctx)
+{
+ mctx->act = INVALID_ACTION;
+ mctx->nr_pages = 0;
+ mctx->stop = 0;
+ mctx->unit_split_pos = 0;
+ mctx->blocks_migrated = 0;
+ mctx->migrate_whole_item = 0;
+}
+
+/**
+ * Assign primitive migration operation over the given item
+ * specified by @mctx.coord
+ */
+static void what_to_do(struct extent_migrate_context *mctx, u64 *dst_id)
+{
+ loff_t off1, off2;
+ loff_t split_off;
+
+ coord_t *coord;
+ lookup_result ret;
+ struct inode *inode = mctx->inode;
+ reiser4_key split_key;
+ int skip;
+
+ coord = mctx->coord;
+ zload(coord->node);
+ coord_clear_iplug(coord);
+ reset_migration_context(mctx);
+ /*
+ * calculate offsets of leftmost and rightmost bytes
+	 * pointed to by the item
+ */
+ off1 = get_key_offset(mctx->key);
+ off2 = off1 + reiser4_extent_size(coord) - 1;
+ mctx->new_loc =
+ dst_id != NULL ? *dst_id : calc_data_subvol(inode, off2)->id;
+ skip = (mctx->new_loc == get_key_ordering(mctx->key));
+ /*
+	 * find the split offset in the item, i.e. the maximal offset such that
+ * data bytes at (offset - 1) and (offset) belong to different
+ * bricks in the logical volume with the new configuration.
+ */
+ split_off = off2 - (off2 & (current_stripe_size - 1));
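+	/*
+	 * scan stripe boundaries from right to left: stop at the first
+	 * stripe that maps to a different brick, or when the tail to be
+	 * migrated reaches MIGRATION_GRANULARITY pages
+	 */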
+ while (off1 < split_off) {
+ split_off -= current_stripe_size;
+ if (calc_data_subvol(inode, split_off)->id != mctx->new_loc) {
+ split_off += current_stripe_size;
+ goto split_off_found;
+ }
+ if (!skip && (off2 - split_off + 1 >=
+ MIGRATION_GRANULARITY << PAGE_SHIFT)) {
+ /*
+ * split offset is not found, but the extent
+ * is too large, so we have to migrate a part
+ * of the item
+ */
+ split_off += current_stripe_size;
+ goto split_off_found;
+ }
+ }
+ /*
+ * split offset not found. The whole item is either
+ * to be migrated, or to be skipped
+ */
+ coord->unit_pos = 0;
+ mctx->stop_off = off1;
+ if (skip) {
+ mctx->stop = 1;
+ mctx->act = SKIP_EXTENT;
+ } else {
+ mctx->migrate_whole_item = 1;
+ mctx->act = MIGRATE_EXTENT;
+ }
+ zrelse(coord->node);
+ return;
+ split_off_found:
+ /*
+ * set current position to the found split offset
+ */
+ assert("edward-2112", (off1 < split_off) &&
+ (split_off < off1 + reiser4_extent_size(coord)));
+
+ mctx->stop_off = split_off;
+
+ memcpy(&split_key, mctx->key, sizeof(split_key));
+ set_key_offset(&split_key, split_off);
+ ret = lookup_extent(&split_key, FIND_EXACT, coord);
+
+ assert("edward-2113", ret == CBK_COORD_FOUND);
+ assert("edward-2114", coord->between == AT_UNIT);
+
+ unit_key_by_coord(coord, &split_key);
+
+ assert("edward-2115", get_key_offset(&split_key) <= split_off);
+ mctx->unit_split_pos =
+ (split_off - get_key_offset(&split_key)) >> PAGE_SHIFT;
+
+ zrelse(coord->node);
+ if (skip) {
+ /*
+		 * The item is to be split, its right part is to be
+		 * skipped, and the left part is to be processed in
+ * the next iteration of migrate_extent().
+ */
+ mctx->act = SPLIT_EXTENT;
+ } else {
+ /*
+ * Only a part of item should be migrated.
+ * In this case the regular split operation is not
+ * needed.
+ */
+ mctx->migrate_whole_item = 0;
+ mctx->act = MIGRATE_EXTENT;
+ }
+ return;
+}
+
+int reiser4_migrate_extent(coord_t *coord, reiser4_key *key,
+ lock_handle *lh, struct inode *inode,
+ loff_t *done_off, u64 *dst_id)
+{
+ int ret = 0;
+ reiser4_block_nr blocks_migrated = 0;
+ struct extent_migrate_context mctx;
+
+ init_migration_context(&mctx, inode, coord, key, lh);
+
+ while (!mctx.stop) {
+ what_to_do(&mctx, dst_id);
+ switch(mctx.act) {
+ case SKIP_EXTENT:
+ ret = zload(mctx.coord->node);
+ if (ret)
+ goto out;
+ try_merge_with_right_item(mctx.coord);
+ zrelse(mctx.coord->node);
+ *done_off = mctx.stop_off;
+ goto out;
+ case SPLIT_EXTENT:
+ ret = do_split_extent(&mctx);
+ if (ret)
+ goto out;
+ continue;
+ case MIGRATE_EXTENT:
+ /*
+			 * Resource-intensive. The maximum number of
+ * blocks to migrate at once is determined by
+ * MIGRATION_GRANULARITY
+ */
+ ret = do_migrate_extent(&mctx);
+ if (ret)
+ goto out;
+ assert("edward-2351", mctx.blocks_migrated > 0);
+ *done_off = mctx.done_off;
+
+ blocks_migrated += mctx.blocks_migrated;
+ break;
+ default:
+ impossible("edward-2116",
+ "Bad migrate action id %d", mctx.act);
+ }
+ }
+ out:
+ done_lh(lh);
+ return ret;
+}
+
+/*
+ * Local variables:
+ * c-indentation-style: "K&R"
+ * mode-name: "LC"
+ * c-basic-offset: 8
+ * tab-width: 8
+ * fill-column: 80
+ * scroll-step: 1
+ * End:
+ */
diff -urN --no-dereference linux-5.10.2.orig/fs/reiser4/plugin/item/internal.c linux-5.10.2/fs/reiser4/plugin/item/internal.c
--- linux-5.10.2.orig/fs/reiser4/plugin/item/internal.c 1970-01-01 01:00:00.000000000 +0100
+++ linux-5.10.2/fs/reiser4/plugin/item/internal.c 2020-12-23 16:07:46.128813275 +0100
@@ -0,0 +1,406 @@
+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
+
+/* Implementation of internal-item plugin methods. */
+
+#include "../../forward.h"
+#include "../../debug.h"
+#include "../../dformat.h"
+#include "../../key.h"
+#include "../../coord.h"
+#include "internal.h"
+#include "item.h"
+#include "../node/node.h"
+#include "../plugin.h"
+#include "../../jnode.h"
+#include "../../znode.h"
+#include "../../tree_walk.h"
+#include "../../tree_mod.h"
+#include "../../tree.h"
+#include "../../super.h"
+#include "../../block_alloc.h"
+
+/* see internal.h for explanation */
+
+/* plugin->u.item.b.mergeable */
+int mergeable_internal(const coord_t * p1 UNUSED_ARG /* first item */ ,
+ const coord_t * p2 UNUSED_ARG /* second item */ )
+{
+ /* internal items are not mergeable */
+ return 0;
+}
+
+/* ->lookup() method for internal items */
+lookup_result lookup_internal(const reiser4_key * key /* key to look up */ ,
+ lookup_bias bias UNUSED_ARG /* lookup bias */ ,
+ coord_t * coord /* coord of item */ )
+{
+ reiser4_key ukey;
+
+ switch (keycmp(unit_key_by_coord(coord, &ukey), key)) {
+ default:
+ impossible("", "keycmp()?!");
+ case LESS_THAN:
+		/* FIXME-VS: AFTER_ITEM used to be here. But with the new coord
+		   the item plugin cannot be taken using a coord set this way */
+ assert("vs-681", coord->unit_pos == 0);
+ coord->between = AFTER_UNIT;
+ /* fall through */
+ case EQUAL_TO:
+ return CBK_COORD_FOUND;
+ case GREATER_THAN:
+ return CBK_COORD_NOTFOUND;
+ }
+}
+
+/* return body of internal item at @coord */
+static internal_item_layout *internal_at(const coord_t * coord /* coord of
+ * item */ )
+{
+ assert("nikita-607", coord != NULL);
+ assert("nikita-1650",
+ item_plugin_by_coord(coord) ==
+ item_plugin_by_id(NODE_POINTER_ID));
+ return (internal_item_layout *) item_body_by_coord(coord);
+}
+
+void reiser4_update_internal(const coord_t * coord,
+ const reiser4_block_nr * blocknr)
+{
+ internal_item_layout *item = internal_at(coord);
+ //assert("nikita-2959", reiser4_blocknr_is_sane(blocknr));
+
+ put_unaligned(cpu_to_le64(*blocknr), &item->pointer);
+}
+
+/* return child block number stored in the internal item at @coord */
+static reiser4_block_nr pointer_at(const coord_t * coord /* coord of item */ )
+{
+ assert("nikita-608", coord != NULL);
+ return le64_to_cpu(get_unaligned(&internal_at(coord)->pointer));
+}
+
+/* get znode pointed to by internal @item */
+static znode *znode_at(const coord_t * item /* coord of item */ ,
+ znode * parent /* parent node */ )
+{
+ return child_znode(item, parent, 1, 0);
+}
+
+/* store pointer from internal item into "block". Implementation of
+ ->down_link() method */
+void down_link_internal(const coord_t * coord /* coord of item */ ,
+ const reiser4_key * key UNUSED_ARG /* key to get
+ * pointer for */ ,
+ reiser4_block_nr * block /* resulting block number */ )
+{
+ ON_DEBUG(reiser4_key item_key);
+
+ assert("nikita-609", coord != NULL);
+ assert("nikita-611", block != NULL);
+ assert("nikita-612", (key == NULL) ||
+ /* twig horrors */
+ (znode_get_level(coord->node) == TWIG_LEVEL)
+ || keyle(item_key_by_coord(coord, &item_key), key));
+
+ *block = pointer_at(coord);
+ //assert("nikita-2960", reiser4_blocknr_is_sane(block));
+}
+
+/* Get the child's block number, or 0 if the block is unallocated. */
+int
+utmost_child_real_block_internal(const coord_t * coord, sideof side UNUSED_ARG,
+ reiser4_block_nr * block)
+{
+ assert("jmacd-2059", coord != NULL);
+
+ *block = pointer_at(coord);
+ //assert("nikita-2961", reiser4_blocknr_is_sane(block));
+
+ if (reiser4_blocknr_is_fake(block)) {
+ *block = 0;
+ }
+
+ return 0;
+}
+
+/* Return the child. */
+int
+utmost_child_internal(const coord_t * coord, sideof side UNUSED_ARG,
+ jnode ** childp)
+{
+ reiser4_block_nr block = pointer_at(coord);
+ znode *child;
+
+ assert("jmacd-2059", childp != NULL);
+ //assert("nikita-2962", reiser4_blocknr_is_sane(&block));
+
+ child = zlook(znode_get_tree(coord->node), &block);
+
+ if (IS_ERR(child)) {
+ return PTR_ERR(child);
+ }
+
+ *childp = ZJNODE(child);
+
+ return 0;
+}
+
+#if REISER4_DEBUG
+
+static void check_link(znode * left, znode * right)
+{
+ znode *scan;
+
+ for (scan = left; scan != right; scan = scan->right) {
+ if (ZF_ISSET(scan, JNODE_RIP))
+ break;
+ if (znode_is_right_connected(scan) && scan->right != NULL) {
+ if (ZF_ISSET(scan->right, JNODE_RIP))
+ break;
+ assert("nikita-3285",
+ znode_is_left_connected(scan->right));
+ assert("nikita-3265",
+ ergo(scan != left,
+ ZF_ISSET(scan, JNODE_HEARD_BANSHEE)));
+ assert("nikita-3284", scan->right->left == scan);
+ } else
+ break;
+ }
+}
+
+int check__internal(const coord_t * coord, const char **error)
+{
+ reiser4_block_nr blk;
+ znode *child;
+ coord_t cpy;
+
+ blk = pointer_at(coord);
+#if 0
+ if (!reiser4_blocknr_is_sane(&blk)) {
+ *error = "Invalid pointer";
+ return -1;
+ }
+#endif
+ coord_dup(&cpy, coord);
+ child = znode_at(&cpy, cpy.node);
+ if (child != NULL) {
+ znode *left_child;
+ znode *right_child;
+
+ left_child = right_child = NULL;
+
+ assert("nikita-3256", znode_invariant(child));
+ if (coord_prev_item(&cpy) == 0 && item_is_internal(&cpy)) {
+ left_child = znode_at(&cpy, cpy.node);
+ if (left_child != NULL) {
+ read_lock_tree();
+ check_link(left_child, child);
+ read_unlock_tree();
+ zput(left_child);
+ }
+ }
+ coord_dup(&cpy, coord);
+ if (coord_next_item(&cpy) == 0 && item_is_internal(&cpy)) {
+ right_child = znode_at(&cpy, cpy.node);
+ if (right_child != NULL) {
+ read_lock_tree();
+ check_link(child, right_child);
+ read_unlock_tree();
+ zput(right_child);
+ }
+ }
+ zput(child);
+ }
+ return 0;
+}
+
+#endif /* REISER4_DEBUG */
+
+/* return true only if this item really points to "block" */
+/* Audited by: green(2002.06.14) */
+int has_pointer_to_internal(const coord_t * coord /* coord of item */ ,
+ const reiser4_block_nr * block /* block number to
+ * check */ )
+{
+ assert("nikita-613", coord != NULL);
+ assert("nikita-614", block != NULL);
+
+ return pointer_at(coord) == *block;
+}
+
+/* hook called by ->create_item() method of node plugin after new internal
+ item was just created.
+
+   This is the point where a pointer to the new node is inserted into the tree.
+   Initialize the parent pointer in the child znode, insert the child into the sibling list and slum.
+
+*/
+int create_hook_internal(const coord_t * item /* coord of item */ ,
+ void *arg /* child's left neighbor, if any */ )
+{
+ znode *child;
+ __u64 child_ptr;
+
+ assert("nikita-1252", item != NULL);
+ assert("nikita-1253", item->node != NULL);
+ assert("nikita-1181", znode_get_level(item->node) > LEAF_LEVEL);
+ assert("nikita-1450", item->unit_pos == 0);
+
+ /*
+	 * when preparing the item insertion, build_child_ptr_data sets the
+	 * pointer to the data to be inserted to the jnode's blocknr, which is
+	 * in cpu byte order. The node's create_item simply copies those data.
+	 * As a result we have the child pointer in cpu byte order. Convert the
+	 * content of the internal item to little-endian byte order.
+ */
+ child_ptr = get_unaligned((__u64 *)item_body_by_coord(item));
+ reiser4_update_internal(item, &child_ptr);
+
+ child = znode_at(item, item->node);
+ if (child != NULL && !IS_ERR(child)) {
+ znode *left;
+ int result = 0;
+ reiser4_tree *tree;
+
+ left = arg;
+ tree = znode_get_tree(item->node);
+ write_lock_tree();
+ write_lock_dk(tree);
+ assert("nikita-1400", (child->in_parent.node == NULL)
+ || (znode_above_root(child->in_parent.node)));
+ ++item->node->c_count;
+ coord_to_parent_coord(item, &child->in_parent);
+ sibling_list_insert_nolock(child, left);
+
+ assert("nikita-3297", ZF_ISSET(child, JNODE_ORPHAN));
+ ZF_CLR(child, JNODE_ORPHAN);
+
+ if ((left != NULL) && !keyeq(znode_get_rd_key(left),
+ znode_get_rd_key(child))) {
+ znode_set_rd_key(child, znode_get_rd_key(left));
+ }
+ write_unlock_dk(tree);
+ write_unlock_tree();
+ zput(child);
+ return result;
+ } else {
+ if (child == NULL)
+ child = ERR_PTR(-EIO);
+ return PTR_ERR(child);
+ }
+}
+
+/* hook called by ->cut_and_kill() method of node plugin just before internal
+ item is removed.
+
+   This is the point where an empty node is removed from the tree. Clear the
+   parent pointer in the child, and mark the node for pending deletion.
+
+   The node will actually be deleted later, in several stages:
+
+   . when the last lock on this node is released, the node will be removed from
+   the sibling list and its lock will be invalidated
+
+   . when the last reference to this node is dropped, the bitmap will be updated
+   and the node will actually be removed from memory.
+
+*/
+int kill_hook_internal(const coord_t * item /* coord of item */ ,
+ pos_in_node_t from UNUSED_ARG /* start unit */ ,
+ pos_in_node_t count UNUSED_ARG /* stop unit */ ,
+ struct carry_kill_data *p UNUSED_ARG)
+{
+ znode *child;
+ int result = 0;
+
+ assert("nikita-1222", item != NULL);
+ assert("nikita-1224", from == 0);
+ assert("nikita-1225", count == 1);
+
+ child = znode_at(item, item->node);
+ if (child == NULL)
+ return 0;
+ if (IS_ERR(child))
+ return PTR_ERR(child);
+ result = zload(child);
+ if (result) {
+ zput(child);
+ return result;
+ }
+ if (node_is_empty(child)) {
+ reiser4_tree *tree;
+
+ assert("nikita-1397", znode_is_write_locked(child));
+ assert("nikita-1398", child->c_count == 0);
+ assert("nikita-2546", ZF_ISSET(child, JNODE_HEARD_BANSHEE));
+
+ tree = znode_get_tree(item->node);
+ write_lock_tree();
+ init_parent_coord(&child->in_parent, NULL);
+ --item->node->c_count;
+ write_unlock_tree();
+ } else {
+ warning("nikita-1223",
+ "Cowardly refuse to remove link to non-empty node");
+ result = RETERR(-EIO);
+ }
+ zrelse(child);
+ zput(child);
+ return result;
+}
+
+/* hook called by ->shift() node plugin method when an internal item was just
+ moved from one node to another.
+
+ Update parent pointer in child and c_counts in old and new parent
+
+*/
+int shift_hook_internal(const coord_t * item /* coord of item */ ,
+ unsigned from UNUSED_ARG /* start unit */ ,
+ unsigned count UNUSED_ARG /* stop unit */ ,
+ znode * old_node /* old parent */ )
+{
+ znode *child;
+ znode *new_node;
+ reiser4_tree *tree;
+
+ assert("nikita-1276", item != NULL);
+ assert("nikita-1277", from == 0);
+ assert("nikita-1278", count == 1);
+ assert("nikita-1451", item->unit_pos == 0);
+
+ new_node = item->node;
+ assert("nikita-2132", new_node != old_node);
+ tree = znode_get_tree(item->node);
+ child = child_znode(item, old_node, 1, 0);
+ if (child == NULL)
+ return 0;
+ if (!IS_ERR(child)) {
+ write_lock_tree();
+ ++new_node->c_count;
+ assert("nikita-1395", znode_parent(child) == old_node);
+ assert("nikita-1396", old_node->c_count > 0);
+ coord_to_parent_coord(item, &child->in_parent);
+ assert("nikita-1781", znode_parent(child) == new_node);
+ assert("nikita-1782",
+ check_tree_pointer(item, child) == NS_FOUND);
+ --old_node->c_count;
+ write_unlock_tree();
+ zput(child);
+ return 0;
+ } else
+ return PTR_ERR(child);
+}
+
+/* plugin->u.item.b.max_key_inside - not defined */
+/* plugin->u.item.b.nr_units - item.c:single_unit */
+
+/* Make Linus happy.
+ Local variables:
+ c-indentation-style: "K&R"
+ mode-name: "LC"
+ c-basic-offset: 8
+ tab-width: 8
+ fill-column: 120
+ End:
+*/
diff -urN --no-dereference linux-5.10.2.orig/fs/reiser4/plugin/item/internal.h linux-5.10.2/fs/reiser4/plugin/item/internal.h
--- linux-5.10.2.orig/fs/reiser4/plugin/item/internal.h 1970-01-01 01:00:00.000000000 +0100
+++ linux-5.10.2/fs/reiser4/plugin/item/internal.h 2020-12-23 16:07:46.128813275 +0100
@@ -0,0 +1,57 @@
+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
+/* Internal item contains down-link to the child of the internal/twig
+ node in a tree. It is internal items that are actually used during
+ tree traversal. */
+
+#if !defined( __FS_REISER4_PLUGIN_ITEM_INTERNAL_H__ )
+#define __FS_REISER4_PLUGIN_ITEM_INTERNAL_H__
+
+#include "../../forward.h"
+#include "../../dformat.h"
+
+/* on-disk layout of internal item */
+typedef struct internal_item_layout {
+ /* 0 */ reiser4_dblock_nr pointer;
+ /* 4 */
+} internal_item_layout;
+
+struct cut_list;
+
+int mergeable_internal(const coord_t * p1, const coord_t * p2);
+lookup_result lookup_internal(const reiser4_key * key, lookup_bias bias,
+ coord_t * coord);
+/* store pointer from internal item into "block". Implementation of
+ ->down_link() method */
+extern void down_link_internal(const coord_t * coord, const reiser4_key * key,
+ reiser4_block_nr * block);
+extern int has_pointer_to_internal(const coord_t * coord,
+ const reiser4_block_nr * block);
+extern int create_hook_internal(const coord_t * item, void *arg);
+extern int kill_hook_internal(const coord_t * item, pos_in_node_t from,
+ pos_in_node_t count, struct carry_kill_data *);
+extern int shift_hook_internal(const coord_t * item, unsigned from,
+ unsigned count, znode * old_node);
+extern void reiser4_print_internal(const char *prefix, coord_t * coord);
+
+extern int utmost_child_internal(const coord_t * coord, sideof side,
+ jnode ** child);
+int utmost_child_real_block_internal(const coord_t * coord, sideof side,
+ reiser4_block_nr * block);
+
+extern void reiser4_update_internal(const coord_t * coord,
+ const reiser4_block_nr * blocknr);
+/* FIXME: reiserfs has check_internal */
+extern int check__internal(const coord_t * coord, const char **error);
+
+/* __FS_REISER4_PLUGIN_ITEM_INTERNAL_H__ */
+#endif
+
+/* Make Linus happy.
+ Local variables:
+ c-indentation-style: "K&R"
+ mode-name: "LC"
+ c-basic-offset: 8
+ tab-width: 8
+ fill-column: 120
+ End:
+*/
diff -urN --no-dereference linux-5.10.2.orig/fs/reiser4/plugin/item/item.c linux-5.10.2/fs/reiser4/plugin/item/item.c
--- linux-5.10.2.orig/fs/reiser4/plugin/item/item.c 1970-01-01 01:00:00.000000000 +0100
+++ linux-5.10.2/fs/reiser4/plugin/item/item.c 2020-12-23 16:07:46.128813275 +0100
@@ -0,0 +1,804 @@
+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
+
+/* definition of item plugins. */
+
+#include "../../forward.h"
+#include "../../debug.h"
+#include "../../key.h"
+#include "../../coord.h"
+#include "../plugin_header.h"
+#include "sde.h"
+#include "internal.h"
+#include "item.h"
+#include "static_stat.h"
+#include "../plugin.h"
+#include "../../znode.h"
+#include "../../tree.h"
+#include "../../context.h"
+#include "ctail.h"
+
+/* find the offset of the item body within the node and cache it in the coord
+   (slow path of item_body_by_coord()) */
+void item_body_by_coord_hard(coord_t * coord /* coord to query */ )
+{
+ assert("nikita-324", coord != NULL);
+ assert("nikita-325", coord->node != NULL);
+ assert("nikita-326", znode_is_loaded(coord->node));
+ assert("nikita-3200", coord->offset == INVALID_OFFSET);
+
+ coord->offset =
+ node_plugin_by_node(coord->node)->item_by_coord(coord) -
+ zdata(coord->node);
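+	/* in debug builds remember how many times the node was locked, so
+	   that item_body_by_coord() can assert that the cached offset is
+	   not stale */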
+ ON_DEBUG(coord->body_v = coord->node->times_locked);
+}
+
+void *item_body_by_coord_easy(const coord_t * coord /* coord to query */ )
+{
+ return zdata(coord->node) + coord->offset;
+}
+
+#if REISER4_DEBUG
+
+int item_body_is_valid(const coord_t * coord)
+{
+ return
+ coord->offset ==
+ node_plugin_by_node(coord->node)->item_by_coord(coord) -
+ zdata(coord->node);
+}
+
+#endif
+
+/* return length of item at @coord */
+pos_in_node_t item_length_by_coord(const coord_t * coord /* coord to query */ )
+{
+ int len;
+
+ assert("nikita-327", coord != NULL);
+ assert("nikita-328", coord->node != NULL);
+ assert("nikita-329", znode_is_loaded(coord->node));
+
+ len = node_plugin_by_node(coord->node)->length_by_coord(coord);
+ return len;
+}
+
+void obtain_item_plugin(const coord_t * coord)
+{
+ assert("nikita-330", coord != NULL);
+ assert("nikita-331", coord->node != NULL);
+ assert("nikita-332", znode_is_loaded(coord->node));
+
+ coord_set_iplug((coord_t *) coord,
+ node_plugin_by_node(coord->node)->
+ plugin_by_coord(coord));
+ assert("nikita-2479",
+ coord_iplug(coord) ==
+ node_plugin_by_node(coord->node)->plugin_by_coord(coord));
+}
+
+/* return id of item */
+/* Audited by: green(2002.06.15) */
+item_id item_id_by_coord(const coord_t * coord /* coord to query */ )
+{
+ assert("vs-539", coord != NULL);
+ assert("vs-538", coord->node != NULL);
+ assert("vs-537", znode_is_loaded(coord->node));
+ assert("vs-536", item_plugin_by_coord(coord) != NULL);
+ assert("vs-540",
+ item_id_by_plugin(item_plugin_by_coord(coord)) < LAST_ITEM_ID);
+
+ return item_id_by_plugin(item_plugin_by_coord(coord));
+}
+
+/* return key of item at @coord */
+/* Audited by: green(2002.06.15) */
+reiser4_key *item_key_by_coord(const coord_t * coord /* coord to query */ ,
+ reiser4_key * key /* result */ )
+{
+ assert("nikita-338", coord != NULL);
+ assert("nikita-339", coord->node != NULL);
+ assert("nikita-340", znode_is_loaded(coord->node));
+
+ return node_plugin_by_node(coord->node)->key_at(coord, key);
+}
+
+/* this returns max key in the item */
+reiser4_key *max_item_key_by_coord(const coord_t * coord /* coord to query */ ,
+ reiser4_key * key /* result */ )
+{
+ coord_t last;
+
+ assert("nikita-338", coord != NULL);
+ assert("nikita-339", coord->node != NULL);
+ assert("nikita-340", znode_is_loaded(coord->node));
+
+ /* make coord pointing to last item's unit */
+ coord_dup(&last, coord);
+ last.unit_pos = coord_num_units(&last) - 1;
+ assert("vs-1560", coord_is_existing_unit(&last));
+
+ max_unit_key_by_coord(&last, key);
+ return key;
+}
+
+/* return key of unit at @coord */
+reiser4_key *unit_key_by_coord(const coord_t * coord /* coord to query */ ,
+ reiser4_key * key /* result */ )
+{
+ assert("nikita-772", coord != NULL);
+ assert("nikita-774", coord->node != NULL);
+ assert("nikita-775", znode_is_loaded(coord->node));
+
+ if (item_plugin_by_coord(coord)->b.unit_key != NULL)
+ return item_plugin_by_coord(coord)->b.unit_key(coord, key);
+ else
+ return item_key_by_coord(coord, key);
+}
+
+/* return the biggest key contained in the unit @coord */
+reiser4_key *max_unit_key_by_coord(const coord_t * coord /* coord to query */ ,
+ reiser4_key * key /* result */ )
+{
+ assert("nikita-772", coord != NULL);
+ assert("nikita-774", coord->node != NULL);
+ assert("nikita-775", znode_is_loaded(coord->node));
+
+ if (item_plugin_by_coord(coord)->b.max_unit_key != NULL)
+ return item_plugin_by_coord(coord)->b.max_unit_key(coord, key);
+ else
+ return unit_key_by_coord(coord, key);
+}
+
+/* ->max_key_inside() method for items consisting of exactly one key (like
+ stat-data) */
+static reiser4_key *max_key_inside_single_key(const coord_t *
+ coord /* coord of item */ ,
+ reiser4_key *
+ result /* resulting key */ )
+{
+ assert("nikita-604", coord != NULL);
+
+ /* coord -> key is starting key of this item and it has to be already
+ filled in */
+ return unit_key_by_coord(coord, result);
+}
+
+/* ->nr_units() method for items consisting of exactly one unit always */
+pos_in_node_t
+nr_units_single_unit(const coord_t * coord UNUSED_ARG /* coord of item */ )
+{
+ return 1;
+}
+
+static int
+paste_no_paste(coord_t * coord UNUSED_ARG,
+ reiser4_item_data * data UNUSED_ARG,
+ carry_plugin_info * info UNUSED_ARG)
+{
+ return 0;
+}
+
+/* default ->fast_paste() method */
+static int
+agree_to_fast_op(const coord_t * coord UNUSED_ARG /* coord of item */ )
+{
+ return 1;
+}
+
+int item_can_contain_key(const coord_t * item /* coord of item */ ,
+ const reiser4_key * key /* key to check */ ,
+ const reiser4_item_data * data /* parameters of item
+ * being created */ )
+{
+ item_plugin *iplug;
+ reiser4_key min_key_in_item;
+ reiser4_key max_key_in_item;
+
+ assert("nikita-1658", item != NULL);
+ assert("nikita-1659", key != NULL);
+
+ iplug = item_plugin_by_coord(item);
+ if (iplug->b.can_contain_key != NULL)
+ return iplug->b.can_contain_key(item, key, data);
+ else {
+ assert("nikita-1681", iplug->b.max_key_inside != NULL);
+ item_key_by_coord(item, &min_key_in_item);
+ iplug->b.max_key_inside(item, &max_key_in_item);
+
+ /* can contain key if
+ min_key_in_item <= key &&
+ key <= max_key_in_item
+ */
+ return keyle(&min_key_in_item, key)
+ && keyle(key, &max_key_in_item);
+ }
+}
+
+/* mergeable method for non-mergeable items */
+static int
+not_mergeable(const coord_t * i1 UNUSED_ARG, const coord_t * i2 UNUSED_ARG)
+{
+ return 0;
+}
+
+/* return 0 if @item1 and @item2 are not mergeable, !0 - otherwise */
+int are_items_mergeable(const coord_t * i1 /* coord of first item */ ,
+ const coord_t * i2 /* coord of second item */ )
+{
+ item_plugin *iplug;
+ reiser4_key k1;
+ reiser4_key k2;
+
+ assert("nikita-1336", i1 != NULL);
+ assert("nikita-1337", i2 != NULL);
+
+ iplug = item_plugin_by_coord(i1);
+ assert("nikita-1338", iplug != NULL);
+
+ /* NOTE-NIKITA are_items_mergeable() is also called by assertions in
+ shifting code when nodes are in "suspended" state. */
+ assert("nikita-1663",
+ keyle(item_key_by_coord(i1, &k1), item_key_by_coord(i2, &k2)));
+
+ if (iplug->b.mergeable != NULL) {
+ return iplug->b.mergeable(i1, i2);
+ } else if (iplug->b.max_key_inside != NULL) {
+ iplug->b.max_key_inside(i1, &k1);
+ item_key_by_coord(i2, &k2);
+
+ /* mergeable if ->max_key_inside() >= key of i2; */
+ return keyge(iplug->b.max_key_inside(i1, &k1),
+ item_key_by_coord(i2, &k2));
+ } else {
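+		/* neither ->mergeable() nor ->max_key_inside() is defined:
+		   fall back to comparing locality, object id and item plugin */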
+ item_key_by_coord(i1, &k1);
+ item_key_by_coord(i2, &k2);
+
+ return
+ (get_key_locality(&k1) == get_key_locality(&k2)) &&
+ (get_key_objectid(&k1) == get_key_objectid(&k2))
+ && (iplug == item_plugin_by_coord(i2));
+ }
+}
+
+int item_is_extent(const coord_t *item)
+{
+ assert("vs-482", coord_is_existing_item(item));
+ return
+ item_id_by_coord(item) == EXTENT40_POINTER_ID ||
+ item_id_by_coord(item) == EXTENT41_POINTER_ID;
+}
+
+int item_is_tail(const coord_t * item)
+{
+ assert("vs-482", coord_is_existing_item(item));
+ return item_id_by_coord(item) == FORMATTING_ID;
+}
+
+#if REISER4_DEBUG
+
+int item_is_statdata(const coord_t * item)
+{
+ assert("vs-516", coord_is_existing_item(item));
+ return plugin_of_group(item_plugin_by_coord(item), STAT_DATA_ITEM_TYPE);
+}
+
+int item_is_ctail(const coord_t * item)
+{
+ assert("edward-1816", coord_is_existing_item(item));
+ return item_id_by_coord(item) == CTAIL_ID;
+}
+
+#endif /* REISER4_DEBUG */
+
+static int change_item(struct inode *inode,
+ reiser4_plugin * plugin,
+ pset_member memb)
+{
+ /* cannot change constituent item (sd, or dir_item) */
+ return RETERR(-EINVAL);
+}
+
+static reiser4_plugin_ops item_plugin_ops = {
+ .init = NULL,
+ .load = NULL,
+ .save_len = NULL,
+ .save = NULL,
+ .change = change_item
+};
+
+item_plugin item_plugins[LAST_ITEM_ID] = {
+ [STATIC_STAT_DATA_ID] = {
+ .h = {
+ .type_id = REISER4_ITEM_PLUGIN_TYPE,
+ .id = STATIC_STAT_DATA_ID,
+ .groups = (1 << STAT_DATA_ITEM_TYPE),
+ .pops = &item_plugin_ops,
+ .label = "sd",
+ .desc = "stat-data",
+ .linkage = {NULL, NULL}
+ },
+ .b = {
+ .max_key_inside = max_key_inside_single_key,
+ .can_contain_key = NULL,
+ .mergeable = not_mergeable,
+ .nr_units = nr_units_single_unit,
+ .lookup = NULL,
+ .init = NULL,
+ .paste = paste_no_paste,
+ .fast_paste = NULL,
+ .can_shift = NULL,
+ .copy_units = NULL,
+ .create_hook = NULL,
+ .kill_hook = NULL,
+ .shift_hook = NULL,
+ .cut_units = NULL,
+ .kill_units = NULL,
+ .unit_key = NULL,
+ .max_unit_key = NULL,
+ .estimate = NULL,
+ .item_data_by_flow = NULL,
+#if REISER4_DEBUG
+ .check = NULL
+#endif
+ },
+ .f = {
+ .utmost_child = NULL,
+ .utmost_child_real_block = NULL,
+ .update = NULL,
+ .scan = NULL,
+ .convert = NULL
+ },
+ .s = {
+ .sd = {
+ .init_inode = init_inode_static_sd,
+ .save_len = save_len_static_sd,
+ .save = save_static_sd
+ }
+ }
+ },
+ [SIMPLE_DIR_ENTRY_ID] = {
+ .h = {
+ .type_id = REISER4_ITEM_PLUGIN_TYPE,
+ .id = SIMPLE_DIR_ENTRY_ID,
+ .groups = (1 << DIR_ENTRY_ITEM_TYPE),
+ .pops = &item_plugin_ops,
+ .label = "de",
+ .desc = "directory entry",
+ .linkage = {NULL, NULL}
+ },
+ .b = {
+ .max_key_inside = max_key_inside_single_key,
+ .can_contain_key = NULL,
+ .mergeable = NULL,
+ .nr_units = nr_units_single_unit,
+ .lookup = NULL,
+ .init = NULL,
+ .paste = NULL,
+ .fast_paste = NULL,
+ .can_shift = NULL,
+ .copy_units = NULL,
+ .create_hook = NULL,
+ .kill_hook = NULL,
+ .shift_hook = NULL,
+ .cut_units = NULL,
+ .kill_units = NULL,
+ .unit_key = NULL,
+ .max_unit_key = NULL,
+ .estimate = NULL,
+ .item_data_by_flow = NULL,
+#if REISER4_DEBUG
+ .check = NULL
+#endif
+ },
+ .f = {
+ .utmost_child = NULL,
+ .utmost_child_real_block = NULL,
+ .update = NULL,
+ .scan = NULL,
+ .convert = NULL
+ },
+ .s = {
+ .dir = {
+ .extract_key = extract_key_de,
+ .update_key = update_key_de,
+ .extract_name = extract_name_de,
+ .extract_file_type = extract_file_type_de,
+ .add_entry = add_entry_de,
+ .rem_entry = rem_entry_de,
+ .max_name_len = max_name_len_de
+ }
+ }
+ },
+ [COMPOUND_DIR_ID] = {
+ .h = {
+ .type_id = REISER4_ITEM_PLUGIN_TYPE,
+ .id = COMPOUND_DIR_ID,
+ .groups = (1 << DIR_ENTRY_ITEM_TYPE),
+ .pops = &item_plugin_ops,
+ .label = "cde",
+ .desc = "compressed directory entry",
+ .linkage = {NULL, NULL}
+ },
+ .b = {
+ .max_key_inside = max_key_inside_cde,
+ .can_contain_key = can_contain_key_cde,
+ .mergeable = mergeable_cde,
+ .nr_units = nr_units_cde,
+ .lookup = lookup_cde,
+ .init = init_cde,
+ .paste = paste_cde,
+ .fast_paste = agree_to_fast_op,
+ .can_shift = can_shift_cde,
+ .copy_units = copy_units_cde,
+ .create_hook = NULL,
+ .kill_hook = NULL,
+ .shift_hook = NULL,
+ .cut_units = cut_units_cde,
+ .kill_units = kill_units_cde,
+ .unit_key = unit_key_cde,
+ .max_unit_key = unit_key_cde,
+ .estimate = estimate_cde,
+ .item_data_by_flow = NULL,
+#if REISER4_DEBUG
+ .check = reiser4_check_cde
+#endif
+ },
+ .f = {
+ .utmost_child = NULL,
+ .utmost_child_real_block = NULL,
+ .update = NULL,
+ .scan = NULL,
+ .convert = NULL
+ },
+ .s = {
+ .dir = {
+ .extract_key = extract_key_cde,
+ .update_key = update_key_cde,
+ .extract_name = extract_name_cde,
+ .extract_file_type = extract_file_type_de,
+ .add_entry = add_entry_cde,
+ .rem_entry = rem_entry_cde,
+ .max_name_len = max_name_len_cde
+ }
+ }
+ },
+ [NODE_POINTER_ID] = {
+ .h = {
+ .type_id = REISER4_ITEM_PLUGIN_TYPE,
+ .id = NODE_POINTER_ID,
+ .groups = (1 << INTERNAL_ITEM_TYPE),
+ .pops = NULL,
+ .label = "internal",
+ .desc = "internal item",
+ .linkage = {NULL, NULL}
+ },
+ .b = {
+ .max_key_inside = NULL,
+ .can_contain_key = NULL,
+ .mergeable = mergeable_internal,
+ .nr_units = nr_units_single_unit,
+ .lookup = lookup_internal,
+ .init = NULL,
+ .paste = NULL,
+ .fast_paste = NULL,
+ .can_shift = NULL,
+ .copy_units = NULL,
+ .create_hook = create_hook_internal,
+ .kill_hook = kill_hook_internal,
+ .shift_hook = shift_hook_internal,
+ .cut_units = NULL,
+ .kill_units = NULL,
+ .unit_key = NULL,
+ .max_unit_key = NULL,
+ .estimate = NULL,
+ .item_data_by_flow = NULL,
+#if REISER4_DEBUG
+ .check = check__internal
+#endif
+ },
+ .f = {
+ .utmost_child = utmost_child_internal,
+ .utmost_child_real_block =
+ utmost_child_real_block_internal,
+ .update = reiser4_update_internal,
+ .scan = NULL,
+ .convert = NULL
+ },
+ .s = {
+ .internal = {
+ .down_link = down_link_internal,
+ .has_pointer_to = has_pointer_to_internal
+ }
+ }
+ },
+ [EXTENT40_POINTER_ID] = {
+ .h = {
+ .type_id = REISER4_ITEM_PLUGIN_TYPE,
+ .id = EXTENT40_POINTER_ID,
+ .groups = (1 << FILE_BODY_ITEM_TYPE),
+ .pops = NULL,
+ .label = "extent40",
+ .desc = "simple extent pointer",
+ .linkage = {NULL, NULL}
+ },
+ .b = {
+ .max_key_inside = max_key_inside_extent,
+ .can_contain_key = can_contain_key_extent,
+ .mergeable = mergeable_extent40,
+ .nr_units = nr_units_extent,
+ .lookup = lookup_extent,
+ .init = NULL,
+ .paste = paste_extent,
+ .fast_paste = agree_to_fast_op,
+ .can_shift = can_shift_extent,
+ .create_hook = create_hook_extent,
+ .copy_units = copy_units_extent,
+ .kill_hook = kill_hook_extent,
+ .shift_hook = NULL,
+ .cut_units = cut_units_extent,
+ .kill_units = kill_units_extent,
+ .unit_key = unit_key_extent,
+ .max_unit_key = max_unit_key_extent,
+ .estimate = NULL,
+ .item_data_by_flow = NULL,
+#if REISER4_DEBUG
+ .check = reiser4_check_extent
+#endif
+ },
+ .f = {
+ .utmost_child = utmost_child_extent,
+ .utmost_child_real_block =
+ utmost_child_real_block_extent,
+ .update = NULL,
+ .scan = reiser4_scan_extent,
+ .convert = NULL,
+ },
+ .v = {
+ .migrate = NULL
+ },
+ .s = {
+ .file = {
+ .get_block = get_block_address_extent,
+ .append_key = append_key_extent,
+ .init_coord_extension =
+ init_coord_extension_extent
+ }
+ }
+ },
+ [EXTENT41_POINTER_ID] = {
+ .h = {
+ .type_id = REISER4_ITEM_PLUGIN_TYPE,
+ .id = EXTENT41_POINTER_ID,
+ .groups = (1 << FILE_BODY_ITEM_TYPE),
+ .pops = NULL,
+ .label = "extent41",
+ .desc = "distributed extent pointer",
+ .linkage = {NULL, NULL}
+ },
+ .b = {
+ .max_key_inside = max_key_inside_extent,
+ .can_contain_key = can_contain_key_extent,
+ .mergeable = mergeable_extent41, /* differs */
+ .nr_units = nr_units_extent,
+ .lookup = lookup_extent,
+ .init = NULL,
+ .paste = paste_extent,
+ .fast_paste = agree_to_fast_op,
+ .can_shift = can_shift_extent,
+ .create_hook = create_hook_extent,
+ .copy_units = copy_units_extent,
+ .merge_units = merge_units_extent,
+ .kill_hook = kill_hook_extent,
+ .shift_hook = NULL,
+ .cut_units = cut_units_extent,
+ .kill_units = kill_units_extent,
+ .unit_key = unit_key_extent,
+ .max_unit_key = max_unit_key_extent,
+ .estimate = NULL,
+ .item_data_by_flow = NULL,
+#if REISER4_DEBUG
+ .check = reiser4_check_extent
+#endif
+ },
+ .f = {
+ .utmost_child = utmost_child_extent,
+ .utmost_child_real_block =
+ utmost_child_real_block_extent,
+ .update = NULL,
+ .scan = reiser4_scan_extent,
+ .convert = NULL,
+ },
+ .v = {
+ .migrate = reiser4_migrate_extent
+ },
+ .s = {
+ .file = {
+ .get_block = get_block_address_extent,
+ .append_key = append_key_extent,
+ .init_coord_extension =
+ init_coord_extension_extent
+ }
+ }
+ },
+ [FORMATTING_ID] = {
+ .h = {
+ .type_id = REISER4_ITEM_PLUGIN_TYPE,
+ .id = FORMATTING_ID,
+ .groups = (1 << FILE_BODY_ITEM_TYPE),
+ .pops = NULL,
+ .label = "body",
+ .desc = "body (or tail?) item",
+ .linkage = {NULL, NULL}
+ },
+ .b = {
+ .max_key_inside = max_key_inside_tail,
+ .can_contain_key = can_contain_key_tail,
+ .mergeable = mergeable_tail,
+ .nr_units = nr_units_tail,
+ .lookup = lookup_tail,
+ .init = NULL,
+ .paste = paste_tail,
+ .fast_paste = agree_to_fast_op,
+ .can_shift = can_shift_tail,
+ .create_hook = NULL,
+ .copy_units = copy_units_tail,
+ .kill_hook = kill_hook_tail,
+ .shift_hook = NULL,
+ .cut_units = cut_units_tail,
+ .kill_units = kill_units_tail,
+ .unit_key = unit_key_tail,
+ .max_unit_key = unit_key_tail,
+ .estimate = NULL,
+ .item_data_by_flow = NULL,
+#if REISER4_DEBUG
+ .check = NULL
+#endif
+ },
+ .f = {
+ .utmost_child = NULL,
+ .utmost_child_real_block = NULL,
+ .update = NULL,
+ .scan = NULL,
+ .convert = NULL
+ },
+ .s = {
+ .file = {
+ .get_block = get_block_address_tail,
+ .append_key = append_key_tail,
+ .init_coord_extension =
+ init_coord_extension_tail
+ }
+ }
+ },
+ [CTAIL_ID] = {
+ .h = {
+ .type_id = REISER4_ITEM_PLUGIN_TYPE,
+ .id = CTAIL_ID,
+ .groups = (1 << FILE_BODY_ITEM_TYPE),
+ .pops = NULL,
+ .label = "ctail",
+ .desc = "cryptcompress tail item",
+ .linkage = {NULL, NULL}
+ },
+ .b = {
+ .max_key_inside = max_key_inside_tail,
+ .can_contain_key = can_contain_key_ctail,
+ .mergeable = mergeable_ctail,
+ .nr_units = nr_units_ctail,
+ .lookup = NULL,
+ .init = init_ctail,
+ .paste = paste_ctail,
+ .fast_paste = agree_to_fast_op,
+ .can_shift = can_shift_ctail,
+ .create_hook = create_hook_ctail,
+ .copy_units = copy_units_ctail,
+ .kill_hook = kill_hook_ctail,
+ .shift_hook = shift_hook_ctail,
+ .cut_units = cut_units_ctail,
+ .kill_units = kill_units_ctail,
+ .unit_key = unit_key_tail,
+ .max_unit_key = unit_key_tail,
+ .estimate = estimate_ctail,
+ .item_data_by_flow = NULL,
+#if REISER4_DEBUG
+ .check = check_ctail
+#endif
+ },
+ .f = {
+ .utmost_child = utmost_child_ctail,
+ /* FIXME-EDWARD: write this */
+ .utmost_child_real_block = NULL,
+ .update = NULL,
+ .scan = scan_ctail,
+ .convert = convert_ctail
+ },
+ .s = {
+ .file = {
+ .get_block = get_block_address_tail,
+ .append_key = append_key_ctail,
+ .init_coord_extension =
+ init_coord_extension_tail
+ }
+ }
+ },
+ [BLACK_BOX_ID] = {
+ .h = {
+ .type_id = REISER4_ITEM_PLUGIN_TYPE,
+ .id = BLACK_BOX_ID,
+ .groups = (1 << OTHER_ITEM_TYPE),
+ .pops = NULL,
+ .label = "blackbox",
+ .desc = "black box item",
+ .linkage = {NULL, NULL}
+ },
+ .b = {
+ .max_key_inside = NULL,
+ .can_contain_key = NULL,
+ .mergeable = not_mergeable,
+ .nr_units = nr_units_single_unit,
+			/* no need for ->lookup method */
+ .lookup = NULL,
+ .init = NULL,
+ .paste = NULL,
+ .fast_paste = NULL,
+ .can_shift = NULL,
+ .copy_units = NULL,
+ .create_hook = NULL,
+ .kill_hook = NULL,
+ .shift_hook = NULL,
+ .cut_units = NULL,
+ .kill_units = NULL,
+ .unit_key = NULL,
+ .max_unit_key = NULL,
+ .estimate = NULL,
+ .item_data_by_flow = NULL,
+#if REISER4_DEBUG
+ .check = NULL
+#endif
+ }
+ },
+ [BRICK_SYMBOL_ID] = {
+ .h = {
+ .type_id = REISER4_ITEM_PLUGIN_TYPE,
+ .id = BRICK_SYMBOL_ID,
+ .groups = (1 << OTHER_ITEM_TYPE),
+ .pops = NULL,
+ .label = "brick symbol",
+ .desc = "brick identifier",
+ .linkage = {NULL, NULL}
+ },
+ .b = {
+ .max_key_inside = NULL,
+ .can_contain_key = NULL,
+ .mergeable = not_mergeable,
+ .nr_units = nr_units_single_unit,
+ .lookup = NULL,
+ .init = NULL,
+ .paste = NULL,
+ .fast_paste = NULL,
+ .can_shift = NULL,
+ .copy_units = NULL,
+ .create_hook = NULL,
+ .kill_hook = NULL,
+ .shift_hook = NULL,
+ .cut_units = NULL,
+ .kill_units = NULL,
+ .unit_key = NULL,
+ .max_unit_key = NULL,
+ .estimate = NULL,
+ .item_data_by_flow = NULL,
+#if REISER4_DEBUG
+ .check = NULL
+#endif
+ }
+ }
+};
+
+/* Make Linus happy.
+ Local variables:
+ c-indentation-style: "K&R"
+ mode-name: "LC"
+ c-basic-offset: 8
+ tab-width: 8
+ fill-column: 120
+ End:
+*/
diff -urN --no-dereference linux-5.10.2.orig/fs/reiser4/plugin/item/item.h linux-5.10.2/fs/reiser4/plugin/item/item.h
--- linux-5.10.2.orig/fs/reiser4/plugin/item/item.h 1970-01-01 01:00:00.000000000 +0100
+++ linux-5.10.2/fs/reiser4/plugin/item/item.h 2020-12-23 16:07:46.128813275 +0100
@@ -0,0 +1,414 @@
+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
+
+/* first read balance.c comments before reading this */
+
+/* An item_plugin implements all of the operations required for
+ balancing that are item specific. */
+
+/* an item plugin also implements other operations that are specific to that
+ item. These go into the item specific operations portion of the item
+ handler, and all of the item specific portions of the item handler are put
+ into a union. */
+
+#if !defined( __REISER4_ITEM_H__ )
+#define __REISER4_ITEM_H__
+
+#include "../../forward.h"
+#include "../plugin_header.h"
+#include "../../dformat.h"
+#include "../../seal.h"
+#include "../../plugin/file/file.h"
+
+#include <linux/fs.h> /* for struct file, struct inode */
+#include <linux/mm.h> /* for struct page */
+#include <linux/dcache.h> /* for struct dentry */
+
+typedef enum {
+ STAT_DATA_ITEM_TYPE,
+ DIR_ENTRY_ITEM_TYPE,
+ INTERNAL_ITEM_TYPE,
+ FILE_BODY_ITEM_TYPE,
+ OTHER_ITEM_TYPE
+} item_type_id;
+
+/* this is the part of each item plugin that all items are expected to
+ support or at least explicitly fail to support by setting the
+ pointer to null. */
+struct balance_ops {
+ /* operations called by balancing
+
+ It is interesting to consider that some of these item
+ operations could be given sources or targets that are not
+ really items in nodes. This could be ok/useful.
+
+ */
+ /* maximal key that can _possibly_ be occupied by this item
+
+ When inserting, and node ->lookup() method (called by
+ coord_by_key()) reaches an item after binary search,
+ the ->max_key_inside() item plugin method is used to determine
+	   whether the new data should be pasted into the existing item
+	   (new_key<=max_key_inside()) or a new item has to be created
+ (new_key>max_key_inside()).
+
+ For items that occupy exactly one key (like stat-data)
+ this method should return this key. For items that can
+ grow indefinitely (extent, directory item) this should
+ return reiser4_max_key().
+
+ For example extent with the key
+
+ (LOCALITY,4,OBJID,STARTING-OFFSET), and length BLK blocks,
+
+	   ->max_key_inside is (LOCALITY,4,OBJID,0xffffffffffffffff).
+ */
+ reiser4_key *(*max_key_inside) (const coord_t *, reiser4_key *);
+
+ /* true if item @coord can merge data at @key. */
+ int (*can_contain_key) (const coord_t *, const reiser4_key *,
+ const reiser4_item_data *);
+ /* mergeable() - check items for mergeability
+
+ Optional method. Returns true if two items can be merged.
+
+ */
+ int (*mergeable) (const coord_t *, const coord_t *);
+
+ /* number of atomic things in an item.
+ NOTE FOR CONTRIBUTORS: use a generic method
+ nr_units_single_unit() for solid (atomic) items, as
+ tree operations use it as a criterion of solidness
+ (see is_solid_item macro) */
+ pos_in_node_t(*nr_units) (const coord_t *);
+
+	/* search within the item for a unit with the given key, and
+	   point the coord at it. This can be used to calculate how many
+	   bytes to shrink an item: use pointer arithmetic and compare
+	   to the start of the item body, provided the item's data are
+	   contiguous in the node; if the item's data are not contiguous
+	   in the node, all sorts of other things are probably going to
+	   break as well. */
+ lookup_result(*lookup) (const reiser4_key *, lookup_bias, coord_t *);
+	/* method called by node_plugin->create_item() to initialise a new
+	   item */
+ int (*init) (coord_t * target, coord_t * from,
+ reiser4_item_data * data);
+ /* method called (e.g., by reiser4_resize_item()) to place new data
+ into item when it grows */
+ int (*paste) (coord_t *, reiser4_item_data *, carry_plugin_info *);
+	/* return true if paste into @coord is allowed to skip
+	   carry. That is, if such a paste would not require any changes
+	   at the parent level
+ */
+ int (*fast_paste) (const coord_t *);
+	/* how many units of @source (but not more than @want) can be
+	   shifted into the @target node. If pend == append - we try to
+	   append the last item of @target with the first units of @source.
+	   If pend == prepend - we try to "prepend" the first item in
+	   @target with the last units of @source. The @target node has
+	   @free_space bytes of free space. The total size of those units
+	   is returned via @size.
+
+	   @target is not NULL if shifting to a mergeable item and
+	   NULL if a new item will be created during shifting.
+	*/
+ int (*can_shift) (unsigned free_space, coord_t *,
+ znode *, shift_direction, unsigned *size,
+ unsigned want);
+
+	/* starting from the @from-th unit of item @source, append or
+	   prepend @count units to @target. @target has already been
+	   expanded by @free_space bytes. That must be exactly what is
+	   needed for those items in @target. If @where_is_free_space
+	   == SHIFT_LEFT - free space is at the end of the @target item,
+	   otherwise - it is at the beginning of it. */
+ void (*copy_units) (coord_t *, coord_t *,
+ unsigned from, unsigned count,
+ shift_direction where_is_free_space,
+ unsigned free_space);
+ /*
+ * try to merge rightmost unit of item @left with the leftmost
+ * unit of neighboring item @right located on the same node.
+ * Return freed space. Must not fail. For performance reasons
+ * this method leaves the node in inconsistent state and should
+ * be called only in the context of ->merge_items() method of
+ * node plugin. Must not be called in other ones.*/
+ size_t (*merge_units) (coord_t *left, coord_t *right);
+
+ int (*create_hook) (const coord_t *, void *);
+ /* do whatever is necessary to do when @count units starting
+ from @from-th one are removed from the tree */
+	/* FIXME-VS: this is here for, in particular,
+	   extents and items of internal type to free blocks they point
+	   to at the same time as removing items from a
+ tree. Problems start, however, when dealloc_block fails due
+ to some reason. Item gets removed, but blocks it pointed to
+ are not freed. It is not clear how to fix this for items of
+ internal type because a need to remove internal item may
+ appear in the middle of balancing, and there is no way to
+ undo changes made. OTOH, if space allocator involves
+ balancing to perform dealloc_block - this will probably
+ break balancing due to deadlock issues
+ */
+ int (*kill_hook) (const coord_t *, pos_in_node_t from,
+ pos_in_node_t count, struct carry_kill_data *);
+ int (*shift_hook) (const coord_t *, unsigned from, unsigned count,
+ znode * _node);
+
+ /* unit @*from contains @from_key. unit @*to contains @to_key. Cut all keys between @from_key and @to_key
+ including boundaries. When units are cut from item beginning - move space which gets freed to head of
+ item. When units are cut from item end - move freed space to item end. When units are cut from the middle of
+ item - move freed space to item head. Return amount of space which got freed. Save smallest removed key in
+ @smallest_removed if it is not 0. Save new first item key in @new_first_key if it is not 0
+ */
+ int (*cut_units) (coord_t *, pos_in_node_t from, pos_in_node_t to,
+ struct carry_cut_data *,
+ reiser4_key * smallest_removed,
+ reiser4_key * new_first_key);
+
+ /* like cut_units, except that these units are removed from the
+ tree, not only from a node */
+ int (*kill_units) (coord_t *, pos_in_node_t from, pos_in_node_t to,
+ struct carry_kill_data *,
+ reiser4_key * smallest_removed,
+ reiser4_key * new_first);
+
+	/* if @key_of_coord == 1 - the key of the coord is returned,
+	   otherwise - the key of the unit is returned. If @coord is not
+	   set to a certain unit - ERR_PTR(-ENOENT) is returned */
+ reiser4_key *(*unit_key) (const coord_t *, reiser4_key *);
+ reiser4_key *(*max_unit_key) (const coord_t *, reiser4_key *);
+	/* estimate how much space is needed to paste @data into the item
+	   at @coord. If @coord==0 - estimate insertion, otherwise -
+	   estimate pasting
+ */
+ int (*estimate) (const coord_t *, const reiser4_item_data *);
+
+ /* converts flow @f to item data. @coord == 0 on insert */
+ int (*item_data_by_flow) (const coord_t *, const flow_t *,
+ reiser4_item_data *);
+
+ /*void (*show) (struct seq_file *, coord_t *); */
+
+#if REISER4_DEBUG
+ /* used for debugging, every item should have here the most
+ complete possible check of the consistency of the item that
+ the inventor can construct */
+ int (*check) (const coord_t *, const char **error);
+#endif
+
+};
+
+struct flush_ops {
+ /* return the right or left child of @coord, only if it is in memory */
+ int (*utmost_child) (const coord_t *, sideof side, jnode ** child);
+
+ /* return whether the right or left child of @coord has a non-fake
+ block number. */
+ int (*utmost_child_real_block) (const coord_t *, sideof side,
+ reiser4_block_nr *);
+ /* relocate child at @coord to the @block */
+ void (*update) (const coord_t *, const reiser4_block_nr *);
+	/* count unformatted nodes per item for leaf relocation policy, etc. */
+ int (*scan) (flush_scan * scan);
+ /* convert item by flush */
+ int (*convert) (flush_pos_t * pos);
+};
+
+struct volume_ops{
+ /*
+	 * migrate unformatted blocks pointed to by the item at @coord, starting
+	 * from right to left. Upon successful completion @done_off contains the
+	 * offset of the leftmost processed byte. @key is the key of the item
+ */
+ int (*migrate)(coord_t *coord, reiser4_key *key,
+ lock_handle *lh, struct inode *inode, loff_t *done_off,
+ u64 *dst_id);
+};
+
+/* operations specific to the directory item */
+struct dir_entry_iops {
+ /* extract stat-data key from directory entry at @coord and place it
+ into @key. */
+ int (*extract_key) (const coord_t *, reiser4_key * key);
+ /* update object key in item. */
+ int (*update_key) (const coord_t *, const reiser4_key *, lock_handle *);
+ /* extract name from directory entry at @coord and return it */
+ char *(*extract_name) (const coord_t *, char *buf);
+ /* extract file type (DT_* stuff) from directory entry at @coord and
+ return it */
+ unsigned (*extract_file_type) (const coord_t *);
+ int (*add_entry) (struct inode * dir,
+ coord_t *, lock_handle *,
+ const struct dentry * name,
+ reiser4_dir_entry_desc * entry);
+ int (*rem_entry) (struct inode * dir, const struct qstr * name,
+ coord_t *, lock_handle *,
+ reiser4_dir_entry_desc * entry);
+ int (*max_name_len) (const struct inode * dir);
+};
+
+/*
+ * item specific methods called by regular file plugins
+ */
+struct file_iops{
+ int (*get_block) (const coord_t *, sector_t, sector_t *);
+ /*
+ * key of first byte which is not addressed by the item
+ * @coord is set to, but is mergeable with that item.
+ *
+ * For example, for extent item with the key
+ *
+ * (LOCALITY,4,OBJID,STARTING-OFFSET), and length BLK blocks,
+ *
+ * ->append_key is
+ *
+ * (LOCALITY,4,OBJID,STARTING-OFFSET + BLK * block_size)
+ */
+ reiser4_key *(*append_key) (const coord_t *, reiser4_key *);
+ void (*init_coord_extension) (uf_coord_t *, loff_t);
+};
+
+/* operations specific to items of stat data type */
+struct sd_iops {
+ int (*init_inode) (struct inode * inode, char *sd, int len);
+ int (*save_len) (struct inode * inode);
+ int (*save) (struct inode * inode, char **area);
+};
+
+/* operations specific to internal item */
+struct internal_iops{
+	/* all that tree traversal wants to know from an internal item is
+	   where to go next. */
+ void (*down_link) (const coord_t * coord,
+ const reiser4_key * key, reiser4_block_nr * block);
+ /* check that given internal item contains given pointer. */
+ int (*has_pointer_to) (const coord_t * coord,
+ const reiser4_block_nr * block);
+};
+
+struct item_plugin {
+ /* generic fields */
+ plugin_header h;
+ /* methods common for all item types */
+ struct balance_ops b; /* balance operations */
+	struct flush_ops f;	/* flush operates on items via these methods */
+ struct volume_ops v; /* volume operations */
+
+ /* methods specific to particular type of item */
+ union {
+ struct dir_entry_iops dir;
+ struct file_iops file;
+ struct sd_iops sd;
+ struct internal_iops internal;
+ } s;
+};
+
+#define is_solid_item(iplug) ((iplug)->b.nr_units == nr_units_single_unit)
+
+static inline item_id item_id_by_plugin(item_plugin * plugin)
+{
+ return plugin->h.id;
+}
+
+static inline char get_iplugid(item_plugin * iplug)
+{
+ assert("nikita-2838", iplug != NULL);
+ assert("nikita-2839", iplug->h.id < 0xff);
+ return (char)item_id_by_plugin(iplug);
+}
+
+extern unsigned long znode_times_locked(const znode * z);
+
+static inline void coord_set_iplug(coord_t * coord, item_plugin * iplug)
+{
+ assert("nikita-2837", coord != NULL);
+ assert("nikita-2838", iplug != NULL);
+ coord->iplugid = get_iplugid(iplug);
+ ON_DEBUG(coord->plug_v = znode_times_locked(coord->node));
+}
+
+static inline item_plugin *coord_iplug(const coord_t * coord)
+{
+ assert("nikita-2833", coord != NULL);
+ assert("nikita-2834", coord->iplugid != INVALID_PLUGID);
+ assert("nikita-3549", coord->plug_v == znode_times_locked(coord->node));
+ return (item_plugin *) plugin_by_id(REISER4_ITEM_PLUGIN_TYPE,
+ coord->iplugid);
+}
+
+extern int item_can_contain_key(const coord_t * item, const reiser4_key * key,
+ const reiser4_item_data *);
+extern int are_items_mergeable(const coord_t * i1, const coord_t * i2);
+extern int item_is_extent(const coord_t *);
+extern int item_is_tail(const coord_t *);
+extern int item_is_statdata(const coord_t * item);
+extern int item_is_ctail(const coord_t *);
+
+extern pos_in_node_t item_length_by_coord(const coord_t * coord);
+extern pos_in_node_t nr_units_single_unit(const coord_t * coord);
+extern item_id item_id_by_coord(const coord_t * coord /* coord to query */ );
+extern reiser4_key *item_key_by_coord(const coord_t * coord, reiser4_key * key);
+extern reiser4_key *max_item_key_by_coord(const coord_t *, reiser4_key *);
+extern reiser4_key *unit_key_by_coord(const coord_t * coord, reiser4_key * key);
+extern reiser4_key *max_unit_key_by_coord(const coord_t * coord,
+ reiser4_key * key);
+extern void obtain_item_plugin(const coord_t * coord);
+
+#if defined(REISER4_DEBUG)
+extern int znode_is_loaded(const znode * node);
+#endif
+
+/* return plugin of item at @coord */
+static inline item_plugin *item_plugin_by_coord(const coord_t *
+ coord /* coord to query */ )
+{
+ assert("nikita-330", coord != NULL);
+ assert("nikita-331", coord->node != NULL);
+ assert("nikita-332", znode_is_loaded(coord->node));
+
+ if (unlikely(!coord_is_iplug_set(coord)))
+ obtain_item_plugin(coord);
+ return coord_iplug(coord);
+}
+
+/* this returns true if item is of internal type */
+static inline int item_is_internal(const coord_t * item)
+{
+ assert("vs-483", coord_is_existing_item(item));
+ return plugin_of_group(item_plugin_by_coord(item), INTERNAL_ITEM_TYPE);
+}
+
+extern void item_body_by_coord_hard(coord_t * coord);
+extern void *item_body_by_coord_easy(const coord_t * coord);
+#if REISER4_DEBUG
+extern int item_body_is_valid(const coord_t * coord);
+#endif
+
+/* return pointer to item body */
+static inline void *item_body_by_coord(const coord_t *
+ coord /* coord to query */ )
+{
+ assert("nikita-324", coord != NULL);
+ assert("nikita-325", coord->node != NULL);
+ assert("nikita-326", znode_is_loaded(coord->node));
+
+ if (coord->offset == INVALID_OFFSET)
+ item_body_by_coord_hard((coord_t *) coord);
+ assert("nikita-3201", item_body_is_valid(coord));
+ assert("nikita-3550", coord->body_v == znode_times_locked(coord->node));
+ return item_body_by_coord_easy(coord);
+}
+
+/* __REISER4_ITEM_H__ */
+#endif
+/* Make Linus happy.
+ Local variables:
+ c-indentation-style: "K&R"
+ mode-name: "LC"
+ c-basic-offset: 8
+ tab-width: 8
+ fill-column: 120
+ scroll-step: 1
+ End:
+*/
diff -urN --no-dereference linux-5.10.2.orig/fs/reiser4/plugin/item/Makefile linux-5.10.2/fs/reiser4/plugin/item/Makefile
--- linux-5.10.2.orig/fs/reiser4/plugin/item/Makefile 1970-01-01 01:00:00.000000000 +0100
+++ linux-5.10.2/fs/reiser4/plugin/item/Makefile 2020-12-23 16:07:46.128813275 +0100
@@ -0,0 +1,17 @@
+obj-$(CONFIG_REISER4_FS) += item_plugins.o
+
+item_plugins-objs := \
+ item.o \
+ static_stat.o \
+ sde.o \
+ cde.o \
+ blackbox.o \
+ brick_symbol.o \
+ internal.o \
+ tail.o \
+ ctail.o \
+ extent.o \
+ extent_item_ops.o \
+ extent_file_ops.o \
+ extent_stripe_ops.o \
+ extent_flush_ops.o
diff -urN --no-dereference linux-5.10.2.orig/fs/reiser4/plugin/item/sde.c linux-5.10.2/fs/reiser4/plugin/item/sde.c
--- linux-5.10.2.orig/fs/reiser4/plugin/item/sde.c 1970-01-01 01:00:00.000000000 +0100
+++ linux-5.10.2/fs/reiser4/plugin/item/sde.c 2020-12-23 16:07:46.129813290 +0100
@@ -0,0 +1,186 @@
+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
+
+/* Directory entry implementation */
+#include "../../forward.h"
+#include "../../debug.h"
+#include "../../dformat.h"
+#include "../../kassign.h"
+#include "../../coord.h"
+#include "sde.h"
+#include "item.h"
+#include "../plugin.h"
+#include "../../znode.h"
+#include "../../carry.h"
+#include "../../tree.h"
+#include "../../inode.h"
+
+#include <linux/fs.h> /* for struct inode */
+#include <linux/dcache.h> /* for struct dentry */
+
+/* ->extract_key() method of simple directory item plugin. */
+int extract_key_de(const coord_t * coord /* coord of item */ ,
+ reiser4_key * key /* resulting key */ )
+{
+ directory_entry_format *dent;
+
+ assert("nikita-1458", coord != NULL);
+ assert("nikita-1459", key != NULL);
+
+ dent = (directory_entry_format *) item_body_by_coord(coord);
+ assert("nikita-1158", item_length_by_coord(coord) >= (int)sizeof *dent);
+ return extract_key_from_id(&dent->id, key);
+}
+
+int
+update_key_de(const coord_t * coord, const reiser4_key * key,
+ lock_handle * lh UNUSED_ARG)
+{
+ directory_entry_format *dent;
+ obj_key_id obj_id;
+ int result;
+
+ assert("nikita-2342", coord != NULL);
+ assert("nikita-2343", key != NULL);
+
+ dent = (directory_entry_format *) item_body_by_coord(coord);
+ result = build_obj_key_id(key, &obj_id);
+ if (result == 0) {
+ dent->id = obj_id;
+ znode_make_dirty(coord->node);
+ }
+ return 0;
+}
+
+char *extract_dent_name(const coord_t * coord, directory_entry_format * dent,
+ char *buf)
+{
+ reiser4_key key;
+
+ unit_key_by_coord(coord, &key);
+ if (get_key_type(&key) != KEY_FILE_NAME_MINOR)
+ reiser4_print_address("oops", znode_get_block(coord->node));
+ if (!is_longname_key(&key)) {
+ if (is_dot_key(&key))
+ return (char *)".";
+ else
+ return extract_name_from_key(&key, buf);
+ } else
+ return (char *)dent->name;
+}
+
+/* ->extract_name() method of simple directory item plugin. */
+char *extract_name_de(const coord_t * coord /* coord of item */ , char *buf)
+{
+ directory_entry_format *dent;
+
+ assert("nikita-1460", coord != NULL);
+
+ dent = (directory_entry_format *) item_body_by_coord(coord);
+ return extract_dent_name(coord, dent, buf);
+}
+
+/* ->extract_file_type() method of simple directory item plugin. */
+unsigned extract_file_type_de(const coord_t * coord UNUSED_ARG /* coord of
+ * item */ )
+{
+ assert("nikita-1764", coord != NULL);
+ /* we don't store file type in the directory entry yet.
+
+ But see comments at kassign.h:obj_key_id
+ */
+ return DT_UNKNOWN;
+}
+
+int add_entry_de(struct inode *dir /* directory of item */ ,
+ coord_t * coord /* coord of item */ ,
+ lock_handle * lh /* insertion lock handle */ ,
+ const struct dentry *de /* name to add */ ,
+ reiser4_dir_entry_desc * entry /* parameters of new directory
+ * entry */ )
+{
+ reiser4_item_data data;
+ directory_entry_format *dent;
+ int result;
+ const char *name;
+ int len;
+ int longname;
+
+ name = de->d_name.name;
+ len = de->d_name.len;
+ assert("nikita-1163", strlen(name) == len);
+
+ longname = is_longname(name, len);
+
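+	/* short names are encoded entirely in the directory entry key (see
+	   extract_dent_name()); only a long name needs extra room in the
+	   entry body for the name and its terminating zero */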
+ data.length = sizeof *dent;
+ if (longname)
+ data.length += len + 1;
+ data.data = NULL;
+ data.user = 0;
+ data.iplug = item_plugin_by_id(SIMPLE_DIR_ENTRY_ID);
+
+ inode_add_bytes(dir, data.length);
+
+ result = insert_by_coord(coord, &data, &entry->key, lh, 0 /*flags */ );
+ if (result != 0)
+ return result;
+
+ dent = (directory_entry_format *) item_body_by_coord(coord);
+ build_inode_key_id(entry->obj, &dent->id);
+ if (longname) {
+ memcpy(dent->name, name, len);
+ put_unaligned(0, &dent->name[len]);
+ }
+ return 0;
+}
+
+int rem_entry_de(struct inode *dir /* directory of item */ ,
+ const struct qstr *name UNUSED_ARG,
+ coord_t * coord /* coord of item */ ,
+ lock_handle * lh UNUSED_ARG /* lock handle for
+ * removal */ ,
+ reiser4_dir_entry_desc * entry UNUSED_ARG /* parameters of
+ * directory entry
+ * being removed */ )
+{
+ coord_t shadow;
+ int result;
+ int length;
+
+ length = item_length_by_coord(coord);
+ if (inode_get_bytes(dir) < length) {
+ warning("nikita-2627", "Dir is broke: %llu: %llu",
+ (unsigned long long)get_inode_oid(dir),
+ inode_get_bytes(dir));
+
+ return RETERR(-EIO);
+ }
+
+ /* cut_node() is supposed to take pointers to _different_
+ coords, because it will modify them without respect to
+ possible aliasing. To work around this, create temporary copy
+ of @coord.
+ */
+ coord_dup(&shadow, coord);
+ result =
+ kill_node_content(coord, &shadow, NULL, NULL, NULL, NULL, NULL, 0);
+ if (result == 0) {
+ inode_sub_bytes(dir, length);
+ }
+ return result;
+}
+
+int max_name_len_de(const struct inode *dir)
+{
+ return meta_subvol_tree()->nplug->max_item_size() -
+ sizeof(directory_entry_format) - 2;
+}
+
+/* Make Linus happy.
+ Local variables:
+ c-indentation-style: "K&R"
+ mode-name: "LC"
+ c-basic-offset: 8
+ tab-width: 8
+ fill-column: 120
+ End:
+*/
diff -urN --no-dereference linux-5.10.2.orig/fs/reiser4/plugin/item/sde.h linux-5.10.2/fs/reiser4/plugin/item/sde.h
--- linux-5.10.2.orig/fs/reiser4/plugin/item/sde.h 1970-01-01 01:00:00.000000000 +0100
+++ linux-5.10.2/fs/reiser4/plugin/item/sde.h 2020-12-23 16:07:46.129813290 +0100
@@ -0,0 +1,66 @@
+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
+
+/* Directory entry. */
+
+#if !defined( __FS_REISER4_PLUGIN_DIRECTORY_ENTRY_H__ )
+#define __FS_REISER4_PLUGIN_DIRECTORY_ENTRY_H__
+
+#include "../../forward.h"
+#include "../../dformat.h"
+#include "../../kassign.h"
+#include "../../key.h"
+
+#include <linux/fs.h>
+#include <linux/dcache.h> /* for struct dentry */
+
+typedef struct directory_entry_format {
+ /* key of object stat-data. It's not necessary to store whole
+ key here, because it's always key of stat-data, so minor
+ packing locality and offset can be omitted here. But this
+	   relies on a particular key allocation scheme for stat-data, so,
+	   for extensibility's sake, the whole key can be stored here.
+
+ We store key as array of bytes, because we don't want 8-byte
+ alignment of dir entries.
+ */
+ obj_key_id id;
+ /* file name. Null terminated string. */
+ d8 name[0];
+} directory_entry_format;
+
+void print_de(const char *prefix, coord_t * coord);
+int extract_key_de(const coord_t * coord, reiser4_key * key);
+int update_key_de(const coord_t * coord, const reiser4_key * key,
+ lock_handle * lh);
+char *extract_name_de(const coord_t * coord, char *buf);
+unsigned extract_file_type_de(const coord_t * coord);
+int add_entry_de(struct inode *dir, coord_t * coord,
+ lock_handle * lh, const struct dentry *name,
+ reiser4_dir_entry_desc * entry);
+int rem_entry_de(struct inode *dir, const struct qstr *name, coord_t * coord,
+ lock_handle * lh, reiser4_dir_entry_desc * entry);
+int max_name_len_de(const struct inode *dir);
+
+int de_rem_and_shrink(struct inode *dir, coord_t * coord, int length);
+
+char *extract_dent_name(const coord_t * coord,
+ directory_entry_format * dent, char *buf);
+
+#if REISER4_LARGE_KEY
+#define DE_NAME_BUF_LEN (24)
+#else
+#define DE_NAME_BUF_LEN (16)
+#endif
+
+/* __FS_REISER4_PLUGIN_DIRECTORY_ENTRY_H__ */
+#endif
+
+/* Make Linus happy.
+ Local variables:
+ c-indentation-style: "K&R"
+ mode-name: "LC"
+ c-basic-offset: 8
+ tab-width: 8
+ fill-column: 120
+ End:
+*/
diff -urN --no-dereference linux-5.10.2.orig/fs/reiser4/plugin/item/static_stat.c linux-5.10.2/fs/reiser4/plugin/item/static_stat.c
--- linux-5.10.2.orig/fs/reiser4/plugin/item/static_stat.c 1970-01-01 01:00:00.000000000 +0100
+++ linux-5.10.2/fs/reiser4/plugin/item/static_stat.c 2020-12-23 16:07:46.129813290 +0100
@@ -0,0 +1,1113 @@
+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
+
+/* stat data manipulation. */
+
+#include "../../forward.h"
+#include "../../super.h"
+#include "../../vfs_ops.h"
+#include "../../inode.h"
+#include "../../debug.h"
+#include "../../dformat.h"
+#include "../object.h"
+#include "../plugin.h"
+#include "../plugin_header.h"
+#include "static_stat.h"
+#include "item.h"
+
+#include <linux/types.h>
+#include <linux/fs.h>
+
+/* see static_stat.h for explanation */
+
+/* helper function used while we are dumping/loading inode/plugin state
+ to/from the stat-data. */
+
+static void move_on(int *length /* space remaining in stat-data */ ,
+ char **area /* current coord in stat data */ ,
+ int size_of /* how many bytes to move forward */ )
+{
+ assert("nikita-615", length != NULL);
+ assert("nikita-616", area != NULL);
+
+ *length -= size_of;
+ *area += size_of;
+
+ assert("nikita-617", *length >= 0);
+}
+
+/* helper function used while loading inode/plugin state from stat-data.
+ Complain if there is less space in stat-data than was expected.
+ Can only happen on disk corruption. */
+static int not_enough_space(struct inode *inode /* object being processed */ ,
+ const char *where /* error message */ )
+{
+ assert("nikita-618", inode != NULL);
+
+ warning("nikita-619", "Not enough space in %llu while loading %s",
+ (unsigned long long)get_inode_oid(inode), where);
+
+ return RETERR(-EINVAL);
+}
+
+/* helper function used while loading inode/plugin state from
+ stat-data. Call it if invalid plugin id was found. */
+static int unknown_plugin(reiser4_plugin_id id /* invalid id */ ,
+ struct inode *inode /* object being processed */ )
+{
+ warning("nikita-620", "Unknown plugin %i in %llu",
+ id, (unsigned long long)get_inode_oid(inode));
+
+ return RETERR(-EINVAL);
+}
+
+/* this is installed as ->init_inode() method of
+   item_plugins[ STATIC_STAT_DATA_ID ] (fs/reiser4/plugin/item/item.c).
+ Copies data from on-disk stat-data format into inode.
+ Handles stat-data extensions. */
+/* was sd_load */
+int init_inode_static_sd(struct inode *inode /* object being processed */ ,
+ char *sd /* stat-data body */ ,
+ int len /* length of stat-data */ )
+{
+ int result;
+ int bit;
+ int chunk;
+ __u16 mask;
+ __u64 bigmask;
+ reiser4_stat_data_base *sd_base;
+ reiser4_inode *state;
+
+ assert("nikita-625", inode != NULL);
+ assert("nikita-626", sd != NULL);
+
+ result = 0;
+ sd_base = (reiser4_stat_data_base *) sd;
+ state = reiser4_inode_data(inode);
+ mask = le16_to_cpu(get_unaligned(&sd_base->extmask));
+ bigmask = mask;
+ reiser4_inode_set_flag(inode, REISER4_SDLEN_KNOWN);
+
+ move_on(&len, &sd, sizeof *sd_base);
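+	/*
+	 * walk the extension bitmask: bits 0-14 of each 16-bit chunk select
+	 * stat-data extension plugins, and the last bit of a chunk, when set,
+	 * means that one more 16-bit chunk of the mask follows in the
+	 * stat-data body (at most 64 bits of mask in total)
+	 */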
+ for (bit = 0, chunk = 0;
+ mask != 0 || bit <= LAST_IMPORTANT_SD_EXTENSION;
+ ++bit, mask >>= 1) {
+ if (((bit + 1) % 16) != 0) {
+ /* handle extension */
+ sd_ext_plugin *sdplug;
+
+ if (bit >= LAST_SD_EXTENSION) {
+ warning("vpf-1904",
+ "No such extension %i in inode %llu",
+ bit,
+ (unsigned long long)
+ get_inode_oid(inode));
+
+ result = RETERR(-EINVAL);
+ break;
+ }
+
+ sdplug = sd_ext_plugin_by_id(bit);
+ if (sdplug == NULL) {
+ warning("nikita-627",
+ "No such extension %i in inode %llu",
+ bit,
+ (unsigned long long)
+ get_inode_oid(inode));
+
+ result = RETERR(-EINVAL);
+ break;
+ }
+ if (mask & 1) {
+ assert("nikita-628", sdplug->present);
+ /* alignment is not supported in node layout
+ plugin yet.
+ result = align( inode, &len, &sd,
+ sdplug -> alignment );
+ if( result != 0 )
+ return result; */
+ result = sdplug->present(inode, &sd, &len);
+ } else if (sdplug->absent != NULL)
+ result = sdplug->absent(inode);
+ if (result)
+ break;
+ /* else, we are looking at the last bit in 16-bit
+ portion of bitmask */
+ } else if (mask & 1) {
+ /* next portion of bitmask */
+ if (len < (int)sizeof(d16)) {
+ warning("nikita-629",
+ "No space for bitmap in inode %llu",
+ (unsigned long long)
+ get_inode_oid(inode));
+
+ result = RETERR(-EINVAL);
+ break;
+ }
+ mask = le16_to_cpu(get_unaligned((d16 *)sd));
+ bigmask <<= 16;
+ bigmask |= mask;
+ move_on(&len, &sd, sizeof(d16));
+ ++chunk;
+ if (chunk == 3) {
+ if (!(mask & 0x8000)) {
+ /* clear last bit */
+ mask &= ~0x8000;
+ continue;
+ }
+ /* too much */
+ warning("nikita-630",
+ "Too many extensions in %llu",
+ (unsigned long long)
+ get_inode_oid(inode));
+
+ result = RETERR(-EINVAL);
+ break;
+ }
+ } else
+ /* bitmask exhausted */
+ break;
+ }
+ state->extmask = bigmask;
+ /* common initialisations */
+ if (len - (bit / 16 * sizeof(d16)) > 0) {
+ /* alignment in save_len_static_sd() is taken into account
+ -edward */
+ warning("nikita-631", "unused space in inode %llu",
+ (unsigned long long)get_inode_oid(inode));
+ }
+
+ return result;
+}
+
+/* estimates size of stat-data required to store inode.
+ Installed as ->save_len() method of
+   item_plugins[ STATIC_STAT_DATA_ID ] (fs/reiser4/plugin/item/item.c). */
+/* was sd_len */
+int save_len_static_sd(struct inode *inode /* object being processed */ )
+{
+ unsigned int result;
+ __u64 mask;
+ int bit;
+
+ assert("nikita-632", inode != NULL);
+
+ result = sizeof(reiser4_stat_data_base);
+ mask = reiser4_inode_data(inode)->extmask;
+ for (bit = 0; mask != 0; ++bit, mask >>= 1) {
+ if (mask & 1) {
+ sd_ext_plugin *sdplug;
+
+ sdplug = sd_ext_plugin_by_id(bit);
+ assert("nikita-633", sdplug != NULL);
+ /*
+			no alignment support
+ result +=
+ reiser4_round_up(result, sdplug -> alignment) -
+ result;
+ */
+ result += sdplug->save_len(inode);
+ }
+ }
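+	/* each additional 16 bits of the extension mask take one more d16
+	   word in the stat-data body */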
+ result += bit / 16 * sizeof(d16);
+ return result;
+}
+
+/* saves inode into stat-data.
+ Installed as ->save() method of
+   item_plugins[ STATIC_STAT_DATA_ID ] (fs/reiser4/plugin/item/item.c). */
+/* was sd_save */
+int save_static_sd(struct inode *inode /* object being processed */ ,
+ char **area /* where to save stat-data */ )
+{
+ int result;
+ __u64 emask;
+ int bit;
+ unsigned int len;
+ reiser4_stat_data_base *sd_base;
+
+ assert("nikita-634", inode != NULL);
+ assert("nikita-635", area != NULL);
+
+ result = 0;
+ emask = reiser4_inode_data(inode)->extmask;
+ sd_base = (reiser4_stat_data_base *) * area;
+ put_unaligned(cpu_to_le16((__u16)(emask & 0xffff)), &sd_base->extmask);
+ /*cputod16((unsigned)(emask & 0xffff), &sd_base->extmask);*/
+
+ *area += sizeof *sd_base;
+ len = 0xffffffffu;
+ for (bit = 0; emask != 0; ++bit, emask >>= 1) {
+ if (emask & 1) {
+ if ((bit + 1) % 16 != 0) {
+ sd_ext_plugin *sdplug;
+ sdplug = sd_ext_plugin_by_id(bit);
+ assert("nikita-636", sdplug != NULL);
+ /* no alignment support yet
+ align( inode, &len, area,
+ sdplug -> alignment ); */
+ result = sdplug->save(inode, area);
+ if (result)
+ break;
+ } else {
+ put_unaligned(cpu_to_le16((__u16)(emask & 0xffff)),
+ (d16 *)(*area));
+ /*cputod16((unsigned)(emask & 0xffff),
+ (d16 *) * area);*/
+ *area += sizeof(d16);
+ }
+ }
+ }
+ return result;
+}
+
+/* stat-data extension handling functions. */
+
+static int present_lw_sd(struct inode *inode /* object being processed */ ,
+ char **area /* position in stat-data */ ,
+ int *len /* remaining length */ )
+{
+ if (*len >= (int)sizeof(reiser4_light_weight_stat)) {
+ reiser4_light_weight_stat *sd_lw;
+
+ sd_lw = (reiser4_light_weight_stat *) * area;
+
+ inode->i_mode = le16_to_cpu(get_unaligned(&sd_lw->mode));
+ set_nlink(inode, le32_to_cpu(get_unaligned(&sd_lw->nlink)));
+ inode->i_size = le64_to_cpu(get_unaligned(&sd_lw->size));
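+		/* the on-disk mode of a regular file may have S_IFIFO or'ed
+		   in; strip it from the in-memory mode and remember this via
+		   an inode flag, so that save_lw_sd() can restore it */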
+ if ((inode->i_mode & S_IFMT) == (S_IFREG | S_IFIFO)) {
+ inode->i_mode &= ~S_IFIFO;
+ reiser4_inode_set_flag(inode,
+ REISER4_FAKE_IMODE_ONDISK);
+ }
+ move_on(len, area, sizeof *sd_lw);
+ return 0;
+ } else
+ return not_enough_space(inode, "lw sd");
+}
+
+static int save_len_lw_sd(struct inode *inode UNUSED_ARG /* object being
+ * processed */ )
+{
+ return sizeof(reiser4_light_weight_stat);
+}
+
+static int save_lw_sd(struct inode *inode /* object being processed */ ,
+ char **area /* position in stat-data */ )
+{
+ reiser4_light_weight_stat *sd;
+ mode_t delta;
+
+ assert("nikita-2705", inode != NULL);
+ assert("nikita-2706", area != NULL);
+ assert("nikita-2707", *area != NULL);
+
+ sd = (reiser4_light_weight_stat *) * area;
+
+ delta = (reiser4_inode_get_flag(inode,
+ REISER4_FAKE_IMODE_ONDISK) ? S_IFIFO : 0);
+ put_unaligned(cpu_to_le16(inode->i_mode | delta), &sd->mode);
+ put_unaligned(cpu_to_le32(inode->i_nlink), &sd->nlink);
+ put_unaligned(cpu_to_le64((__u64) inode->i_size), &sd->size);
+ *area += sizeof *sd;
+ return 0;
+}
+
+static int present_unix_sd(struct inode *inode /* object being processed */ ,
+ char **area /* position in stat-data */ ,
+ int *len /* remaining length */ )
+{
+ assert("nikita-637", inode != NULL);
+ assert("nikita-638", area != NULL);
+ assert("nikita-639", *area != NULL);
+ assert("nikita-640", len != NULL);
+ assert("nikita-641", *len > 0);
+
+ if (*len >= (int)sizeof(reiser4_unix_stat)) {
+ reiser4_unix_stat *sd;
+
+ sd = (reiser4_unix_stat *) * area;
+
+ i_uid_write(inode, le32_to_cpu(get_unaligned(&sd->uid)));
+ i_gid_write(inode, le32_to_cpu(get_unaligned(&sd->gid)));
+ inode->i_atime.tv_sec = le32_to_cpu(get_unaligned(&sd->atime));
+ inode->i_mtime.tv_sec = le32_to_cpu(get_unaligned(&sd->mtime));
+ inode->i_ctime.tv_sec = le32_to_cpu(get_unaligned(&sd->ctime));
+ if (S_ISBLK(inode->i_mode) || S_ISCHR(inode->i_mode))
+ inode->i_rdev = le64_to_cpu(get_unaligned(&sd->u.rdev));
+ else
+ inode_set_bytes(inode, (loff_t) le64_to_cpu(get_unaligned(&sd->u.bytes)));
+ move_on(len, area, sizeof *sd);
+ return 0;
+ } else
+ return not_enough_space(inode, "unix sd");
+}
+
+static int absent_unix_sd(struct inode *inode /* object being processed */ )
+{
+ i_uid_write(inode, get_super_private(inode->i_sb)->default_uid);
+ i_gid_write(inode, get_super_private(inode->i_sb)->default_gid);
+ inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode);
+ inode_set_bytes(inode, inode->i_size);
+ /* mark inode as lightweight, so that caller (lookup_common) will
+ complete initialisation by copying [ug]id from a parent. */
+ reiser4_inode_set_flag(inode, REISER4_LIGHT_WEIGHT);
+ return 0;
+}
+
+/* Audited by: green(2002.06.14) */
+static int save_len_unix_sd(struct inode *inode UNUSED_ARG /* object being
+ * processed */ )
+{
+ return sizeof(reiser4_unix_stat);
+}
+
+static int save_unix_sd(struct inode *inode /* object being processed */ ,
+ char **area /* position in stat-data */ )
+{
+ reiser4_unix_stat *sd;
+
+ assert("nikita-642", inode != NULL);
+ assert("nikita-643", area != NULL);
+ assert("nikita-644", *area != NULL);
+
+ sd = (reiser4_unix_stat *) * area;
+ put_unaligned(cpu_to_le32(i_uid_read(inode)), &sd->uid);
+ put_unaligned(cpu_to_le32(i_gid_read(inode)), &sd->gid);
+ put_unaligned(cpu_to_le32((__u32) inode->i_atime.tv_sec), &sd->atime);
+ put_unaligned(cpu_to_le32((__u32) inode->i_ctime.tv_sec), &sd->ctime);
+ put_unaligned(cpu_to_le32((__u32) inode->i_mtime.tv_sec), &sd->mtime);
+ if (S_ISBLK(inode->i_mode) || S_ISCHR(inode->i_mode))
+ put_unaligned(cpu_to_le64(inode->i_rdev), &sd->u.rdev);
+ else
+ put_unaligned(cpu_to_le64((__u64) inode_get_bytes(inode)), &sd->u.bytes);
+ *area += sizeof *sd;
+ return 0;
+}
+
+static int
+present_large_times_sd(struct inode *inode /* object being processed */ ,
+ char **area /* position in stat-data */ ,
+ int *len /* remaining length */ )
+{
+ if (*len >= (int)sizeof(reiser4_large_times_stat)) {
+ reiser4_large_times_stat *sd_lt;
+
+ sd_lt = (reiser4_large_times_stat *) * area;
+
+ inode->i_atime.tv_nsec = le32_to_cpu(get_unaligned(&sd_lt->atime));
+ inode->i_mtime.tv_nsec = le32_to_cpu(get_unaligned(&sd_lt->mtime));
+ inode->i_ctime.tv_nsec = le32_to_cpu(get_unaligned(&sd_lt->ctime));
+
+ move_on(len, area, sizeof *sd_lt);
+ return 0;
+ } else
+ return not_enough_space(inode, "large times sd");
+}
+
+static int
+save_len_large_times_sd(struct inode *inode UNUSED_ARG
+ /* object being processed */ )
+{
+ return sizeof(reiser4_large_times_stat);
+}
+
+static int
+save_large_times_sd(struct inode *inode /* object being processed */ ,
+ char **area /* position in stat-data */ )
+{
+ reiser4_large_times_stat *sd;
+
+ assert("nikita-2817", inode != NULL);
+ assert("nikita-2818", area != NULL);
+ assert("nikita-2819", *area != NULL);
+
+ sd = (reiser4_large_times_stat *) * area;
+
+ put_unaligned(cpu_to_le32((__u32) inode->i_atime.tv_nsec), &sd->atime);
+ put_unaligned(cpu_to_le32((__u32) inode->i_ctime.tv_nsec), &sd->ctime);
+ put_unaligned(cpu_to_le32((__u32) inode->i_mtime.tv_nsec), &sd->mtime);
+
+ *area += sizeof *sd;
+ return 0;
+}
+
+/* symlink stat data extension */
+
+/* allocate memory for symlink target and attach it to inode->i_private */
+static int
+symlink_target_to_inode(struct inode *inode, const char *target, int len)
+{
+ assert("vs-845", inode->i_private == NULL);
+ assert("vs-846", !reiser4_inode_get_flag(inode,
+ REISER4_GENERIC_PTR_USED));
+ /* FIXME-VS: this is prone to deadlock. Not more than other similar
+ places, though */
+ inode->i_private = kmalloc((size_t) len + 1,
+ reiser4_ctx_gfp_mask_get());
+ if (!inode->i_private)
+ return RETERR(-ENOMEM);
+
+ memcpy((char *)(inode->i_private), target, (size_t) len);
+ ((char *)(inode->i_private))[len] = 0;
+ reiser4_inode_set_flag(inode, REISER4_GENERIC_PTR_USED);
+ return 0;
+}
+
+/* this is called on read_inode. There is nothing to do actually, but some
+ sanity checks */
+static int present_symlink_sd(struct inode *inode, char **area, int *len)
+{
+ int result;
+ int length;
+ reiser4_symlink_stat *sd;
+
+ length = (int)inode->i_size;
+ /*
+ * *len is number of bytes in stat data item from *area to the end of
+	 * item. It must be at least the size of the symlink plus 1 for the terminating 0
+ */
+ if (length > *len)
+ return not_enough_space(inode, "symlink");
+
+ if (*(*area + length) != 0) {
+ warning("vs-840", "Symlink is not zero terminated");
+ return RETERR(-EIO);
+ }
+
+ sd = (reiser4_symlink_stat *) * area;
+ result = symlink_target_to_inode(inode, sd->body, length);
+
+ move_on(len, area, length + 1);
+ return result;
+}
+
+static int save_len_symlink_sd(struct inode *inode)
+{
+ return inode->i_size + 1;
+}
+
+/* this is called on create and on stat data update. On update there is
+   nothing to do except advance @area */
+static int save_symlink_sd(struct inode *inode, char **area)
+{
+ int result;
+ int length;
+ reiser4_symlink_stat *sd;
+
+ length = (int)inode->i_size;
+ /* inode->i_size must be set already */
+ assert("vs-841", length);
+
+ result = 0;
+ sd = (reiser4_symlink_stat *) * area;
+ if (!reiser4_inode_get_flag(inode, REISER4_GENERIC_PTR_USED)) {
+ const char *target;
+
+ target = (const char *)(inode->i_private);
+ inode->i_private = NULL;
+
+ result = symlink_target_to_inode(inode, target, length);
+
+ /* copy symlink to stat data */
+ memcpy(sd->body, target, (size_t) length);
+ (*area)[length] = 0;
+ } else {
+ /* there is nothing to do in update but move area */
+ assert("vs-844",
+ !memcmp(inode->i_private, sd->body,
+ (size_t) length + 1));
+ }
+
+ *area += (length + 1);
+ return result;
+}
+
+static int present_flags_sd(struct inode *inode /* object being processed */ ,
+ char **area /* position in stat-data */ ,
+ int *len /* remaining length */ )
+{
+ assert("nikita-645", inode != NULL);
+ assert("nikita-646", area != NULL);
+ assert("nikita-647", *area != NULL);
+ assert("nikita-648", len != NULL);
+ assert("nikita-649", *len > 0);
+
+ if (*len >= (int)sizeof(reiser4_flags_stat)) {
+ reiser4_flags_stat *sd;
+
+ sd = (reiser4_flags_stat *) * area;
+ inode->i_flags = le32_to_cpu(get_unaligned(&sd->flags));
+ move_on(len, area, sizeof *sd);
+ return 0;
+ } else
+ return not_enough_space(inode, "generation and attrs");
+}
+
+/* Audited by: green(2002.06.14) */
+static int save_len_flags_sd(struct inode *inode UNUSED_ARG /* object being
+ * processed */ )
+{
+ return sizeof(reiser4_flags_stat);
+}
+
+static int save_flags_sd(struct inode *inode /* object being processed */ ,
+ char **area /* position in stat-data */ )
+{
+ reiser4_flags_stat *sd;
+
+ assert("nikita-650", inode != NULL);
+ assert("nikita-651", area != NULL);
+ assert("nikita-652", *area != NULL);
+
+ sd = (reiser4_flags_stat *) * area;
+ put_unaligned(cpu_to_le32(inode->i_flags), &sd->flags);
+ *area += sizeof *sd;
+ return 0;
+}
+
+static int absent_plugin_sd(struct inode *inode);
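+/*
+ * Parse the PLUGIN_STAT (or HEIR_STAT) extension: a reiser4_plugin_stat
+ * header holding the number of slots, followed by reiser4_plugin_slot
+ * entries that name a pset/hset member and a plugin id, optionally followed
+ * by plugin-private data loaded via ->load(). Recognized plugins are
+ * installed into the inode's pset or hset and recorded in
+ * plugin_mask/heir_mask respectively.
+ */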
+static int present_plugin_sd(struct inode *inode /* object being processed */ ,
+ char **area /* position in stat-data */ ,
+ int *len /* remaining length */,
+ int is_pset /* 1 if plugin set, 0 if heir set. */)
+{
+ reiser4_plugin_stat *sd;
+ reiser4_plugin *plugin;
+ reiser4_inode *info;
+ int i;
+ __u16 mask;
+ int result;
+ int num_of_plugins;
+
+ assert("nikita-653", inode != NULL);
+ assert("nikita-654", area != NULL);
+ assert("nikita-655", *area != NULL);
+ assert("nikita-656", len != NULL);
+ assert("nikita-657", *len > 0);
+
+ if (*len < (int)sizeof(reiser4_plugin_stat))
+ return not_enough_space(inode, "plugin");
+
+ sd = (reiser4_plugin_stat *) * area;
+ info = reiser4_inode_data(inode);
+
+ mask = 0;
+ num_of_plugins = le16_to_cpu(get_unaligned(&sd->plugins_no));
+ move_on(len, area, sizeof *sd);
+ result = 0;
+ for (i = 0; i < num_of_plugins; ++i) {
+ reiser4_plugin_slot *slot;
+ reiser4_plugin_type type;
+ pset_member memb;
+
+ slot = (reiser4_plugin_slot *) * area;
+ if (*len < (int)sizeof *slot)
+ return not_enough_space(inode, "additional plugin");
+
+ memb = le16_to_cpu(get_unaligned(&slot->pset_memb));
+ type = aset_member_to_type_unsafe(memb);
+
+ if (type == REISER4_PLUGIN_TYPES) {
+ warning("nikita-3502",
+ "wrong %s member (%i) for %llu", is_pset ?
+ "pset" : "hset", memb,
+ (unsigned long long)get_inode_oid(inode));
+ return RETERR(-EINVAL);
+ }
+ plugin = plugin_by_disk_id(type, &slot->id);
+ if (plugin == NULL)
+ return unknown_plugin(le16_to_cpu(get_unaligned(&slot->id)), inode);
+
+		/* plugin is loaded into the inode; mark this in the inode's
+		   bitmask of loaded non-standard plugins */
+ if (!(mask & (1 << memb))) {
+ mask |= (1 << memb);
+ } else {
+ warning("nikita-658", "duplicate plugin for %llu",
+ (unsigned long long)get_inode_oid(inode));
+ return RETERR(-EINVAL);
+ }
+ move_on(len, area, sizeof *slot);
+ /* load plugin data, if any */
+ if (plugin->h.pops != NULL && plugin->h.pops->load)
+ result = plugin->h.pops->load(inode, plugin, area, len);
+ else
+ result = aset_set_unsafe(is_pset ? &info->pset :
+ &info->hset, memb, plugin);
+ if (result)
+ return result;
+ }
+ if (is_pset) {
+ /* if object plugin wasn't loaded from stat-data, guess it by
+ mode bits */
+ plugin = file_plugin_to_plugin(inode_file_plugin(inode));
+ if (plugin == NULL)
+ result = absent_plugin_sd(inode);
+ info->plugin_mask = mask;
+ } else
+ info->heir_mask = mask;
+
+ return result;
+}
+
+static int present_pset_sd(struct inode *inode, char **area, int *len) {
+ return present_plugin_sd(inode, area, len, 1 /* pset */);
+}
+
+/* Determine object plugin for @inode based on i_mode.
+
+ Many objects in reiser4 file system are controlled by standard object
+ plugins that emulate traditional unix objects: unix file, directory, symlink, fifo, and so on.
+
+   For such files we don't explicitly store the plugin id in the object's stat
+   data. Rather, the required plugin is guessed from the mode bits, where the
+   file "type" is encoded (see stat(2)).
+*/
+static int
+guess_plugin_by_mode(struct inode *inode /* object to guess plugins for */ )
+{
+ int fplug_id;
+ int dplug_id;
+ reiser4_inode *info;
+
+ assert("nikita-736", inode != NULL);
+
+ dplug_id = fplug_id = -1;
+
+ switch (inode->i_mode & S_IFMT) {
+ case S_IFSOCK:
+ case S_IFBLK:
+ case S_IFCHR:
+ case S_IFIFO:
+ fplug_id = SPECIAL_FILE_PLUGIN_ID;
+ break;
+ case S_IFLNK:
+ fplug_id = SYMLINK_FILE_PLUGIN_ID;
+ break;
+ case S_IFDIR:
+ fplug_id = DIRECTORY_FILE_PLUGIN_ID;
+ dplug_id = HASHED_DIR_PLUGIN_ID;
+ break;
+ default:
+ warning("nikita-737", "wrong file mode: %o", inode->i_mode);
+ return RETERR(-EIO);
+ case S_IFREG:
+ fplug_id = UNIX_FILE_PLUGIN_ID;
+ break;
+ }
+ info = reiser4_inode_data(inode);
+ set_plugin(&info->pset, PSET_FILE, (fplug_id >= 0) ?
+ plugin_by_id(REISER4_FILE_PLUGIN_TYPE, fplug_id) : NULL);
+ set_plugin(&info->pset, PSET_DIR, (dplug_id >= 0) ?
+ plugin_by_id(REISER4_DIR_PLUGIN_TYPE, dplug_id) : NULL);
+ return 0;
+}
+
+/* Audited by: green(2002.06.14) */
+static int absent_plugin_sd(struct inode *inode /* object being processed */ )
+{
+ int result;
+
+ assert("nikita-659", inode != NULL);
+
+ result = guess_plugin_by_mode(inode);
+ /* if mode was wrong, guess_plugin_by_mode() returns "regular file",
+ but setup_inode_ops() will call make_bad_inode().
+	   Another, more logical but a bit more complex solution is to add
+ "bad-file plugin". */
+ /* FIXME-VS: activate was called here */
+ return result;
+}
+
+/* helper function for plugin_sd_save_len(): calculate how much space
+ required to save state of given plugin */
+/* Audited by: green(2002.06.14) */
+static int len_for(reiser4_plugin * plugin /* plugin to save */ ,
+ struct inode *inode /* object being processed */ ,
+ pset_member memb,
+ int len, int is_pset)
+{
+ reiser4_inode *info;
+ assert("nikita-661", inode != NULL);
+
+ if (plugin == NULL)
+ return len;
+
+ info = reiser4_inode_data(inode);
+ if (is_pset ?
+ info->plugin_mask & (1 << memb) :
+ info->heir_mask & (1 << memb)) {
+ len += sizeof(reiser4_plugin_slot);
+ if (plugin->h.pops && plugin->h.pops->save_len != NULL) {
+ /*
+ * non-standard plugin, call method
+ * commented as it is incompatible with alignment
+ * policy in save_plug() -edward
+ *
+ * len = reiser4_round_up(len,
+ * plugin->h.pops->alignment);
+ */
+ len += plugin->h.pops->save_len(inode, plugin);
+ }
+ }
+ return len;
+}
+
+/* calculate how much space is required to save state of all plugins,
+ associated with inode */
+static int save_len_plugin_sd(struct inode *inode /* object being processed */,
+ int is_pset)
+{
+ int len;
+ int last;
+ reiser4_inode *state;
+ pset_member memb;
+
+ assert("nikita-663", inode != NULL);
+
+ state = reiser4_inode_data(inode);
+
+ /* common case: no non-standard plugins */
+ if (is_pset ? state->plugin_mask == 0 : state->heir_mask == 0)
+ return 0;
+ len = sizeof(reiser4_plugin_stat);
+ last = PSET_LAST;
+
+ for (memb = 0; memb < last; ++memb) {
+ len = len_for(aset_get(is_pset ? state->pset : state->hset, memb),
+ inode, memb, len, is_pset);
+ }
+ assert("nikita-664", len > (int)sizeof(reiser4_plugin_stat));
+ return len;
+}
+
+static int save_len_pset_sd(struct inode *inode) {
+ return save_len_plugin_sd(inode, 1 /* pset */);
+}
+
+/* helper function for plugin_sd_save(): save plugin, associated with
+ inode. */
+static int save_plug(reiser4_plugin * plugin /* plugin to save */ ,
+ struct inode *inode /* object being processed */ ,
+ int memb /* what element of pset is saved */ ,
+ char **area /* position in stat-data */ ,
+		     int *count /* incremented if plugin was actually saved. */,
+ int is_pset /* 1 for plugin set, 0 for heir set */)
+{
+ reiser4_plugin_slot *slot;
+ int fake_len;
+ int result;
+
+ assert("nikita-665", inode != NULL);
+ assert("nikita-666", area != NULL);
+ assert("nikita-667", *area != NULL);
+
+ if (plugin == NULL)
+ return 0;
+
+ if (is_pset ?
+ !(reiser4_inode_data(inode)->plugin_mask & (1 << memb)) :
+ !(reiser4_inode_data(inode)->heir_mask & (1 << memb)))
+ return 0;
+ slot = (reiser4_plugin_slot *) * area;
+ put_unaligned(cpu_to_le16(memb), &slot->pset_memb);
+ put_unaligned(cpu_to_le16(plugin->h.id), &slot->id);
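+	/* move_on() expects a remaining-length counter to decrement; the
+	 * length is not tracked while saving, so pass a throwaway value */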
+ fake_len = (int)0xffff;
+ move_on(&fake_len, area, sizeof *slot);
+ ++*count;
+ result = 0;
+ if (plugin->h.pops != NULL) {
+ if (plugin->h.pops->save != NULL)
+ result = plugin->h.pops->save(inode, plugin, area);
+ }
+ return result;
+}
+
+/* save state of all non-standard plugins associated with inode */
+static int save_plugin_sd(struct inode *inode /* object being processed */ ,
+ char **area /* position in stat-data */,
+ int is_pset /* 1 for pset, 0 for hset */)
+{
+ int fake_len;
+ int result = 0;
+ int num_of_plugins;
+ reiser4_plugin_stat *sd;
+ reiser4_inode *state;
+ pset_member memb;
+
+ assert("nikita-669", inode != NULL);
+ assert("nikita-670", area != NULL);
+ assert("nikita-671", *area != NULL);
+
+ state = reiser4_inode_data(inode);
+ if (is_pset ? state->plugin_mask == 0 : state->heir_mask == 0)
+ return 0;
+ sd = (reiser4_plugin_stat *) * area;
+ fake_len = (int)0xffff;
+ move_on(&fake_len, area, sizeof *sd);
+
+ num_of_plugins = 0;
+ for (memb = 0; memb < PSET_LAST; ++memb) {
+ result = save_plug(aset_get(is_pset ? state->pset : state->hset,
+ memb),
+ inode, memb, area, &num_of_plugins, is_pset);
+ if (result != 0)
+ break;
+ }
+
+ put_unaligned(cpu_to_le16((__u16)num_of_plugins), &sd->plugins_no);
+ return result;
+}
+
+static int save_pset_sd(struct inode *inode, char **area) {
+ return save_plugin_sd(inode, area, 1 /* pset */);
+}
+
+static int present_hset_sd(struct inode *inode, char **area, int *len) {
+ return present_plugin_sd(inode, area, len, 0 /* hset */);
+}
+
+static int save_len_hset_sd(struct inode *inode) {
+	return save_len_plugin_sd(inode, 0 /* hset */);
+}
+
+static int save_hset_sd(struct inode *inode, char **area) {
+ return save_plugin_sd(inode, area, 0 /* hset */);
+}
+
+/* helper function for crypto_sd_present(), crypto_sd_save.
+ Extract crypto info from stat-data and attach it to inode */
+static int extract_crypto_info (struct inode * inode,
+ reiser4_crypto_stat * sd)
+{
+ struct reiser4_crypto_info * info;
+ assert("edward-11", !inode_crypto_info(inode));
+ assert("edward-1413",
+ !reiser4_inode_get_flag(inode, REISER4_CRYPTO_STAT_LOADED));
+ /* create and attach a crypto-stat without secret key loaded */
+ info = reiser4_alloc_crypto_info(inode);
+ if (IS_ERR(info))
+ return PTR_ERR(info);
+ info->keysize = le16_to_cpu(get_unaligned(&sd->keysize));
+ memcpy(info->keyid, sd->keyid, inode_digest_plugin(inode)->fipsize);
+ reiser4_attach_crypto_info(inode, info);
+ reiser4_inode_set_flag(inode, REISER4_CRYPTO_STAT_LOADED);
+ return 0;
+}
+
+/* crypto stat-data extension */
+
+static int present_crypto_sd(struct inode *inode, char **area, int *len)
+{
+ int result;
+ reiser4_crypto_stat *sd;
+ digest_plugin *dplug = inode_digest_plugin(inode);
+
+ assert("edward-06", dplug != NULL);
+ assert("edward-684", dplug->fipsize);
+ assert("edward-07", area != NULL);
+ assert("edward-08", *area != NULL);
+ assert("edward-09", len != NULL);
+ assert("edward-10", *len > 0);
+
+ if (*len < (int)sizeof(reiser4_crypto_stat)) {
+ return not_enough_space(inode, "crypto-sd");
+ }
+ /* *len is number of bytes in stat data item from *area to the end of
+ item. It must be not less than size of this extension */
+ assert("edward-75", sizeof(*sd) + dplug->fipsize <= *len);
+
+ sd = (reiser4_crypto_stat *) * area;
+ result = extract_crypto_info(inode, sd);
+ move_on(len, area, sizeof(*sd) + dplug->fipsize);
+
+ return result;
+}
+
+static int save_len_crypto_sd(struct inode *inode)
+{
+ return sizeof(reiser4_crypto_stat) +
+ inode_digest_plugin(inode)->fipsize;
+}
+
+static int save_crypto_sd(struct inode *inode, char **area)
+{
+ int result = 0;
+ reiser4_crypto_stat *sd;
+ struct reiser4_crypto_info * info = inode_crypto_info(inode);
+ digest_plugin *dplug = inode_digest_plugin(inode);
+
+ assert("edward-12", dplug != NULL);
+ assert("edward-13", area != NULL);
+ assert("edward-14", *area != NULL);
+ assert("edward-15", info != NULL);
+ assert("edward-1414", info->keyid != NULL);
+ assert("edward-1415", info->keysize != 0);
+ assert("edward-76", reiser4_inode_data(inode) != NULL);
+
+ if (!reiser4_inode_get_flag(inode, REISER4_CRYPTO_STAT_LOADED)) {
+ /* file is just created */
+ sd = (reiser4_crypto_stat *) *area;
+ /* copy everything but private key to the disk stat-data */
+ put_unaligned(cpu_to_le16(info->keysize), &sd->keysize);
+ memcpy(sd->keyid, info->keyid, (size_t) dplug->fipsize);
+ reiser4_inode_set_flag(inode, REISER4_CRYPTO_STAT_LOADED);
+ }
+ *area += (sizeof(*sd) + dplug->fipsize);
+ return result;
+}
+
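+/* stub ->present() method for extensions that cannot be read; used for
+   CAPABILITIES_STAT below */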
+static int eio(struct inode *inode, char **area, int *len)
+{
+ return RETERR(-EIO);
+}
+
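+/*
+ * Dispatch table indexed by sd_ext_bits. init_inode_static_sd() and
+ * save_static_sd() walk the extension mask and call the ->present(),
+ * ->absent() and ->save() methods of the entries below, while
+ * save_len_static_sd() uses their ->save_len() methods to size the
+ * stat-data.
+ */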
+sd_ext_plugin sd_ext_plugins[LAST_SD_EXTENSION] = {
+ [LIGHT_WEIGHT_STAT] = {
+ .h = {
+ .type_id = REISER4_SD_EXT_PLUGIN_TYPE,
+ .id = LIGHT_WEIGHT_STAT,
+ .pops = NULL,
+ .label = "light-weight sd",
+ .desc = "sd for light-weight files",
+ .linkage = {NULL,NULL}
+ },
+ .present = present_lw_sd,
+ .absent = NULL,
+ .save_len = save_len_lw_sd,
+ .save = save_lw_sd,
+ .alignment = 8
+ },
+ [UNIX_STAT] = {
+ .h = {
+ .type_id = REISER4_SD_EXT_PLUGIN_TYPE,
+ .id = UNIX_STAT,
+ .pops = NULL,
+ .label = "unix-sd",
+ .desc = "unix stat-data fields",
+ .linkage = {NULL,NULL}
+ },
+ .present = present_unix_sd,
+ .absent = absent_unix_sd,
+ .save_len = save_len_unix_sd,
+ .save = save_unix_sd,
+ .alignment = 8
+ },
+ [LARGE_TIMES_STAT] = {
+ .h = {
+ .type_id = REISER4_SD_EXT_PLUGIN_TYPE,
+ .id = LARGE_TIMES_STAT,
+ .pops = NULL,
+ .label = "64time-sd",
+ .desc = "nanosecond resolution for times",
+ .linkage = {NULL,NULL}
+ },
+ .present = present_large_times_sd,
+ .absent = NULL,
+ .save_len = save_len_large_times_sd,
+ .save = save_large_times_sd,
+ .alignment = 8
+ },
+ [SYMLINK_STAT] = {
+ /* stat data of symlink has this extension */
+ .h = {
+ .type_id = REISER4_SD_EXT_PLUGIN_TYPE,
+ .id = SYMLINK_STAT,
+ .pops = NULL,
+ .label = "symlink-sd",
+			.desc = "stat data is appended with symlink name",
+ .linkage = {NULL,NULL}
+ },
+ .present = present_symlink_sd,
+ .absent = NULL,
+ .save_len = save_len_symlink_sd,
+ .save = save_symlink_sd,
+ .alignment = 8
+ },
+ [PLUGIN_STAT] = {
+ .h = {
+ .type_id = REISER4_SD_EXT_PLUGIN_TYPE,
+ .id = PLUGIN_STAT,
+ .pops = NULL,
+ .label = "plugin-sd",
+ .desc = "plugin stat-data fields",
+ .linkage = {NULL,NULL}
+ },
+ .present = present_pset_sd,
+ .absent = absent_plugin_sd,
+ .save_len = save_len_pset_sd,
+ .save = save_pset_sd,
+ .alignment = 8
+ },
+ [HEIR_STAT] = {
+ .h = {
+ .type_id = REISER4_SD_EXT_PLUGIN_TYPE,
+ .id = HEIR_STAT,
+ .pops = NULL,
+ .label = "heir-plugin-sd",
+ .desc = "heir plugin stat-data fields",
+ .linkage = {NULL,NULL}
+ },
+ .present = present_hset_sd,
+ .absent = NULL,
+ .save_len = save_len_hset_sd,
+ .save = save_hset_sd,
+ .alignment = 8
+ },
+ [FLAGS_STAT] = {
+ .h = {
+ .type_id = REISER4_SD_EXT_PLUGIN_TYPE,
+ .id = FLAGS_STAT,
+ .pops = NULL,
+ .label = "flags-sd",
+ .desc = "inode bit flags",
+ .linkage = {NULL, NULL}
+ },
+ .present = present_flags_sd,
+ .absent = NULL,
+ .save_len = save_len_flags_sd,
+ .save = save_flags_sd,
+ .alignment = 8
+ },
+ [CAPABILITIES_STAT] = {
+ .h = {
+ .type_id = REISER4_SD_EXT_PLUGIN_TYPE,
+ .id = CAPABILITIES_STAT,
+ .pops = NULL,
+ .label = "capabilities-sd",
+ .desc = "capabilities",
+ .linkage = {NULL, NULL}
+ },
+ .present = eio,
+ .absent = NULL,
+ .save_len = save_len_flags_sd,
+ .save = save_flags_sd,
+ .alignment = 8
+ },
+ [CRYPTO_STAT] = {
+ .h = {
+ .type_id = REISER4_SD_EXT_PLUGIN_TYPE,
+ .id = CRYPTO_STAT,
+ .pops = NULL,
+ .label = "crypto-sd",
+ .desc = "secret key size and id",
+ .linkage = {NULL, NULL}
+ },
+ .present = present_crypto_sd,
+ .absent = NULL,
+ .save_len = save_len_crypto_sd,
+ .save = save_crypto_sd,
+ .alignment = 8
+ }
+};
+
+/* Make Linus happy.
+ Local variables:
+ c-indentation-style: "K&R"
+ mode-name: "LC"
+ c-basic-offset: 8
+ tab-width: 8
+ fill-column: 120
+ End:
+*/
diff -urN --no-dereference linux-5.10.2.orig/fs/reiser4/plugin/item/static_stat.h linux-5.10.2/fs/reiser4/plugin/item/static_stat.h
--- linux-5.10.2.orig/fs/reiser4/plugin/item/static_stat.h 1970-01-01 01:00:00.000000000 +0100
+++ linux-5.10.2/fs/reiser4/plugin/item/static_stat.h 2020-12-23 16:07:46.129813290 +0100
@@ -0,0 +1,224 @@
+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
+
+/* This describes the static_stat item, used to hold all information needed by the stat() syscall.
+
+In the case where every file has at least the fields needed by the
+stat() syscall, it is more compact to store those fields in this
+struct.
+
+If this item does not exist, then all stats are dynamically resolved.
+At the moment, we either resolve all stats dynamically or all of them
+statically. If you think this is not fully optimal, and the rest of
+reiser4 is working, then fix it...:-)
+
+*/
+
+#if !defined( __FS_REISER4_PLUGIN_ITEM_STATIC_STAT_H__ )
+#define __FS_REISER4_PLUGIN_ITEM_STATIC_STAT_H__
+
+#include "../../forward.h"
+#include "../../dformat.h"
+
+#include <linux/fs.h> /* for struct inode */
+
+/* Stat data layout: goals and implementation.
+
+ We want to be able to have lightweight files which have complete flexibility in what semantic metadata is attached to
+ them, including not having semantic metadata attached to them.
+
+ There is one problem with doing that, which is that if in fact you have exactly the same metadata for most files you
+ want to store, then it takes more space to store that metadata in a dynamically sized structure than in a statically
+ sized structure because the statically sized structure knows without recording it what the names and lengths of the
+ attributes are.
+
+ This leads to a natural compromise, which is to special case those files which have simply the standard unix file
+ attributes, and only employ the full dynamic stat data mechanism for those files that differ from the standard unix
+ file in their use of file attributes.
+
+ Yet this compromise deserves to be compromised a little.
+
+ We accommodate the case where you have no more than the standard unix file attributes by using an "extension
+   bitmask": each bit in it indicates presence or absence of a particular stat data extension (see sd_ext_bits enum).
+
+   If the first bit of the extension bitmask is 0, we have a light-weight file whose attributes are either inherited
+   from the parent directory (such as uid and gid) or initialised to some sane values.
+
+ To capitalize on existing code infrastructure, extensions are
+ implemented as plugins of type REISER4_SD_EXT_PLUGIN_TYPE.
+ Each stat-data extension plugin implements four methods:
+
+ ->present() called by sd_load() when this extension is found in stat-data
+ ->absent() called by sd_load() when this extension is not found in stat-data
+ ->save_len() called by sd_len() to calculate total length of stat-data
+ ->save() called by sd_save() to store extension data into stat-data
+
+ Implementation is in fs/reiser4/plugin/item/static_stat.c
+*/
+
+/* stat-data extension. Please order this by presumed frequency of use */
+typedef enum {
+ /* support for light-weight files */
+ LIGHT_WEIGHT_STAT,
+ /* data required to implement unix stat(2) call. Layout is in
+ reiser4_unix_stat. If this is not present, file is light-weight */
+ UNIX_STAT,
+	/* this contains an additional set of 32bit [amc]time fields to
+	   implement nanosecond resolution. Layout is in
+	   reiser4_large_times_stat. Usage of this extension is governed by
+	   the 32bittimes mount option. */
+ LARGE_TIMES_STAT,
+ /* stat data has link name included */
+ SYMLINK_STAT,
+	/* on-disk slots of non-standard plugins for the main plugin table
+	   (@reiser4_inode->pset), that is, plugins that cannot be deduced
+	   from file mode bits; for example, aggregation, interpolation, etc. */
+ PLUGIN_STAT,
+ /* this extension contains persistent inode flags. These flags are
+	   single bits: immutable, append-only, etc. Layout is in
+ reiser4_flags_stat. */
+ FLAGS_STAT,
+ /* this extension contains capabilities sets, associated with this
+ file. Layout is in reiser4_capabilities_stat */
+ CAPABILITIES_STAT,
+ /* this extension contains size and public id of the secret key.
+ Layout is in reiser4_crypto_stat */
+ CRYPTO_STAT,
+ /* on-disk slots of non-default plugins for inheritance, which
+ are extracted to special plugin table (@reiser4_inode->hset).
+ By default, children of the object will inherit plugins from
+ its main plugin table (pset). */
+ HEIR_STAT,
+ LAST_SD_EXTENSION,
+ /*
+	 * init_inode_static_sd() iterates over the extension mask until all
+	 * non-zero bits are processed. This means that neither the
+	 * ->present() nor the ->absent() method will be called for stat-data
+	 * extensions that go after the last present extension. But for some
+	 * basic extensions we want either the ->absent() or the ->present()
+	 * method to be called, because these extensions set up something in
+	 * the inode even when they are not present. This is what
+	 * LAST_IMPORTANT_SD_EXTENSION is for: for all extensions up to and
+	 * including LAST_IMPORTANT_SD_EXTENSION, either the ->present() or
+	 * the ->absent() method will be called, independently of what other
+	 * extensions are present.
+ */
+ LAST_IMPORTANT_SD_EXTENSION = PLUGIN_STAT
+} sd_ext_bits;
+
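+/*
+ * Illustrative example (not normative): a plain regular file carrying only
+ * the standard unix attributes would typically have the LIGHT_WEIGHT_STAT
+ * and UNIX_STAT bits set in its extension mask (plus LARGE_TIMES_STAT unless
+ * the 32bittimes mount option is in effect), so its stat-data body would be
+ * reiser4_stat_data_base followed by reiser4_light_weight_stat and
+ * reiser4_unix_stat.
+ */
+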
+/* minimal stat-data. This allows to support light-weight files. */
+typedef struct reiser4_stat_data_base {
+ /* 0 */ __le16 extmask;
+ /* 2 */
+} PACKED reiser4_stat_data_base;
+
+typedef struct reiser4_light_weight_stat {
+ /* 0 */ __le16 mode;
+ /* 2 */ __le32 nlink;
+ /* 6 */ __le64 size;
+ /* size in bytes */
+ /* 14 */
+} PACKED reiser4_light_weight_stat;
+
+typedef struct reiser4_unix_stat {
+ /* owner id */
+ /* 0 */ __le32 uid;
+ /* group id */
+ /* 4 */ __le32 gid;
+ /* access time */
+ /* 8 */ __le32 atime;
+ /* modification time */
+ /* 12 */ __le32 mtime;
+ /* change time */
+ /* 16 */ __le32 ctime;
+ union {
+ /* minor:major for device files */
+ /* 20 */ __le64 rdev;
+ /* bytes used by file */
+ /* 20 */ __le64 bytes;
+ } u;
+ /* 28 */
+} PACKED reiser4_unix_stat;
+
+/* symlink stored as part of inode */
+typedef struct reiser4_symlink_stat {
+ char body[0];
+} PACKED reiser4_symlink_stat;
+
+typedef struct reiser4_plugin_slot {
+ /* 0 */ __le16 pset_memb;
+ /* 2 */ __le16 id;
+ /* 4 *//* here plugin stores its persistent state */
+} PACKED reiser4_plugin_slot;
+
+/* stat-data extension for files with non-standard plugin. */
+typedef struct reiser4_plugin_stat {
+ /* number of additional plugins, associated with this object */
+ /* 0 */ __le16 plugins_no;
+ /* 2 */ reiser4_plugin_slot slot[0];
+ /* 2 */
+} PACKED reiser4_plugin_stat;
+
+/* stat-data extension for inode flags. Currently it is just fixed-width 32
+ * bit mask. If need arise, this can be replaced with variable width
+ * bitmask. */
+typedef struct reiser4_flags_stat {
+ /* 0 */ __le32 flags;
+ /* 4 */
+} PACKED reiser4_flags_stat;
+
+typedef struct reiser4_capabilities_stat {
+ /* 0 */ __le32 effective;
+	/* 4 */ __le32 permitted;
+	/* 8 */
+} PACKED reiser4_capabilities_stat;
+
+typedef struct reiser4_cluster_stat {
+/* this defines cluster size (an attribute of cryptcompress objects) as PAGE_SIZE << cluster shift */
+ /* 0 */ d8 cluster_shift;
+ /* 1 */
+} PACKED reiser4_cluster_stat;
+
+typedef struct reiser4_crypto_stat {
+ /* secret key size, bits */
+ /* 0 */ d16 keysize;
+ /* secret key id */
+ /* 2 */ d8 keyid[0];
+ /* 2 */
+} PACKED reiser4_crypto_stat;
+
+typedef struct reiser4_large_times_stat {
+ /* access time */
+ /* 0 */ d32 atime;
+ /* modification time */
+ /* 4 */ d32 mtime;
+ /* change time */
+ /* 8 */ d32 ctime;
+ /* 12 */
+} PACKED reiser4_large_times_stat;
+
+/* this structure is filled by sd_item_stat */
+typedef struct sd_stat {
+ int dirs;
+ int files;
+ int others;
+} sd_stat;
+
+/* plugin->item.common.* */
+extern void print_sd(const char *prefix, coord_t * coord);
+extern void item_stat_static_sd(const coord_t * coord, void *vp);
+
+/* plugin->item.s.sd.* */
+extern int init_inode_static_sd(struct inode *inode, char *sd, int len);
+extern int save_len_static_sd(struct inode *inode);
+extern int save_static_sd(struct inode *inode, char **area);
+
+/* __FS_REISER4_PLUGIN_ITEM_STATIC_STAT_H__ */
+#endif
+
+/* Make Linus happy.
+ Local variables:
+ c-indentation-style: "K&R"
+ mode-name: "LC"
+ c-basic-offset: 8
+ tab-width: 8
+ fill-column: 120
+ End:
+*/
diff -urN --no-dereference linux-5.10.2.orig/fs/reiser4/plugin/item/tail.c linux-5.10.2/fs/reiser4/plugin/item/tail.c
--- linux-5.10.2.orig/fs/reiser4/plugin/item/tail.c 1970-01-01 01:00:00.000000000 +0100
+++ linux-5.10.2/fs/reiser4/plugin/item/tail.c 2020-12-23 16:07:46.129813290 +0100
@@ -0,0 +1,797 @@
+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
+
+#include "item.h"
+#include "../../inode.h"
+#include "../../page_cache.h"
+#include "../../carry.h"
+#include "../../vfs_ops.h"
+
+#include <asm/uaccess.h>
+#include <linux/swap.h>
+#include <linux/writeback.h>
+
+/* plugin->u.item.b.max_key_inside */
+reiser4_key *max_key_inside_tail(const coord_t *coord, reiser4_key *key)
+{
+ item_key_by_coord(coord, key);
+ set_key_offset(key, get_key_offset(reiser4_max_key()));
+ return key;
+}
+
+/* plugin->u.item.b.can_contain_key */
+int can_contain_key_tail(const coord_t *coord, const reiser4_key *key,
+ const reiser4_item_data *data)
+{
+ reiser4_key item_key;
+
+ if (item_plugin_by_coord(coord) != data->iplug)
+ return 0;
+
+ item_key_by_coord(coord, &item_key);
+ if (get_key_locality(key) != get_key_locality(&item_key) ||
+ get_key_objectid(key) != get_key_objectid(&item_key))
+ return 0;
+
+ return 1;
+}
+
+/* plugin->u.item.b.mergeable
+ first item is of tail type */
+/* Audited by: green(2002.06.14) */
+int mergeable_tail(const coord_t *p1, const coord_t *p2)
+{
+ reiser4_key key1, key2;
+
+ assert("vs-535", plugin_of_group(item_plugin_by_coord(p1),
+ FILE_BODY_ITEM_TYPE));
+ assert("vs-365", item_id_by_coord(p1) == FORMATTING_ID);
+
+ if (item_id_by_coord(p2) != FORMATTING_ID) {
+ /* second item is of another type */
+ return 0;
+ }
+
+ item_key_by_coord(p1, &key1);
+ item_key_by_coord(p2, &key2);
+ if (get_key_locality(&key1) != get_key_locality(&key2) ||
+ get_key_objectid(&key1) != get_key_objectid(&key2)
+ || get_key_type(&key1) != get_key_type(&key2)) {
+ /* items of different objects */
+ return 0;
+ }
+ if (get_key_offset(&key1) + nr_units_tail(p1) != get_key_offset(&key2)) {
+ /* not adjacent items */
+ return 0;
+ }
+ return 1;
+}
+
+/* plugin->u.item.b.print
+ plugin->u.item.b.check */
+
+/* plugin->u.item.b.nr_units */
+pos_in_node_t nr_units_tail(const coord_t * coord)
+{
+ return item_length_by_coord(coord);
+}
+
+/* plugin->u.item.b.lookup */
+lookup_result
+lookup_tail(const reiser4_key * key, lookup_bias bias, coord_t * coord)
+{
+ reiser4_key item_key;
+ __u64 lookuped, offset;
+ unsigned nr_units;
+
+ item_key_by_coord(coord, &item_key);
+ offset = get_key_offset(item_key_by_coord(coord, &item_key));
+ nr_units = nr_units_tail(coord);
+
+ /* key we are looking for must be greater than key of item @coord */
+ assert("vs-416", keygt(key, &item_key));
+
+ /* offset we are looking for */
+ lookuped = get_key_offset(key);
+
+ if (lookuped >= offset && lookuped < offset + nr_units) {
+ /* byte we are looking for is in this item */
+ coord->unit_pos = lookuped - offset;
+ coord->between = AT_UNIT;
+ return CBK_COORD_FOUND;
+ }
+
+ /* set coord after last unit */
+ coord->unit_pos = nr_units - 1;
+ coord->between = AFTER_UNIT;
+ return bias ==
+ FIND_MAX_NOT_MORE_THAN ? CBK_COORD_FOUND : CBK_COORD_NOTFOUND;
+}
+
+/* plugin->u.item.b.paste */
+int
+paste_tail(coord_t *coord, reiser4_item_data *data,
+ carry_plugin_info *info UNUSED_ARG)
+{
+ unsigned old_item_length;
+ char *item;
+
+ /* length the item had before resizing has been performed */
+ old_item_length = item_length_by_coord(coord) - data->length;
+
+ /* tail items never get pasted in the middle */
+ assert("vs-363",
+ (coord->unit_pos == 0 && coord->between == BEFORE_UNIT) ||
+ (coord->unit_pos == old_item_length - 1 &&
+ coord->between == AFTER_UNIT) ||
+ (coord->unit_pos == 0 && old_item_length == 0
+ && coord->between == AT_UNIT));
+
+ item = item_body_by_coord(coord);
+ if (coord->unit_pos == 0)
+ /* make space for pasted data when pasting at the beginning of
+ the item */
+ memmove(item + data->length, item, old_item_length);
+
+ if (coord->between == AFTER_UNIT)
+ coord->unit_pos++;
+
+ if (data->data) {
+ assert("vs-554", data->user == 0 || data->user == 1);
+ if (data->user) {
+ assert("nikita-3035", reiser4_schedulable());
+ /* copy from user space */
+ if (__copy_from_user(item + coord->unit_pos,
+ (const char __user *)data->data,
+ (unsigned)data->length))
+ return RETERR(-EFAULT);
+ } else
+ /* copy from kernel space */
+ memcpy(item + coord->unit_pos, data->data,
+ (unsigned)data->length);
+ } else {
+ memset(item + coord->unit_pos, 0, (unsigned)data->length);
+ }
+ return 0;
+}
+
+/* plugin->u.item.b.fast_paste */
+
+/* plugin->u.item.b.can_shift
+ number of units is returned via return value, number of bytes via @size. For
+ tail items they coincide */
+int
+can_shift_tail(unsigned free_space, coord_t * source UNUSED_ARG,
+ znode * target UNUSED_ARG, shift_direction direction UNUSED_ARG,
+ unsigned *size, unsigned want)
+{
+	/* make sure that we do not want to shift more than we have */
+ assert("vs-364", want > 0
+ && want <= (unsigned)item_length_by_coord(source));
+
+ *size = min(want, free_space);
+ return *size;
+}
+
+/* plugin->u.item.b.copy_units */
+void
+copy_units_tail(coord_t * target, coord_t * source,
+ unsigned from, unsigned count,
+ shift_direction where_is_free_space,
+ unsigned free_space UNUSED_ARG)
+{
+ /* make sure that item @target is expanded already */
+ assert("vs-366", (unsigned)item_length_by_coord(target) >= count);
+ assert("vs-370", free_space >= count);
+
+ if (where_is_free_space == SHIFT_LEFT) {
+ /* append item @target with @count first bytes of @source */
+ assert("vs-365", from == 0);
+
+ memcpy((char *)item_body_by_coord(target) +
+ item_length_by_coord(target) - count,
+ (char *)item_body_by_coord(source), count);
+ } else {
+ /* target item is moved to right already */
+ reiser4_key key;
+
+ assert("vs-367",
+ (unsigned)item_length_by_coord(source) == from + count);
+
+ memcpy((char *)item_body_by_coord(target),
+ (char *)item_body_by_coord(source) + from, count);
+
+ /* new units are inserted before first unit in an item,
+ therefore, we have to update item key */
+ item_key_by_coord(source, &key);
+ set_key_offset(&key, get_key_offset(&key) + from);
+
+ node_plugin_by_node(target->node)->update_item_key(target, &key,
+ NULL /*info */);
+ }
+}
+
+/* plugin->u.item.b.create_hook */
+
+/* item_plugin->b.kill_hook
+ this is called when @count units starting from @from-th one are going to be removed
+ */
+int
+kill_hook_tail(const coord_t * coord, pos_in_node_t from,
+ pos_in_node_t count, struct carry_kill_data *kdata)
+{
+ reiser4_key key;
+ loff_t start, end;
+
+ assert("vs-1577", kdata);
+ assert("vs-1579", kdata->inode);
+
+ item_key_by_coord(coord, &key);
+ start = get_key_offset(&key) + from;
+ end = start + count;
+ fake_kill_hook_tail(kdata->inode, start, end, kdata->params.truncate);
+ return 0;
+}
+
+/* plugin->u.item.b.shift_hook */
+
+/* helper for kill_units_tail and cut_units_tail */
+static int
+do_cut_or_kill(coord_t * coord, pos_in_node_t from, pos_in_node_t to,
+ reiser4_key * smallest_removed, reiser4_key * new_first)
+{
+ pos_in_node_t count;
+
+ /* this method is only called to remove part of item */
+ assert("vs-374", (to - from + 1) < item_length_by_coord(coord));
+	/* tail items are never cut from the middle of an item */
+ assert("vs-396", ergo(from != 0, to == coord_last_unit_pos(coord)));
+ assert("vs-1558", ergo(from == 0, to < coord_last_unit_pos(coord)));
+
+ count = to - from + 1;
+
+ if (smallest_removed) {
+ /* store smallest key removed */
+ item_key_by_coord(coord, smallest_removed);
+ set_key_offset(smallest_removed,
+ get_key_offset(smallest_removed) + from);
+ }
+ if (new_first) {
+ /* head of item is cut */
+ assert("vs-1529", from == 0);
+
+ item_key_by_coord(coord, new_first);
+ set_key_offset(new_first,
+ get_key_offset(new_first) + from + count);
+ }
+
+ if (REISER4_DEBUG)
+ memset((char *)item_body_by_coord(coord) + from, 0, count);
+ return count;
+}
+
+/* plugin->u.item.b.cut_units */
+int
+cut_units_tail(coord_t * coord, pos_in_node_t from, pos_in_node_t to,
+ struct carry_cut_data *cdata UNUSED_ARG,
+ reiser4_key * smallest_removed, reiser4_key * new_first)
+{
+ return do_cut_or_kill(coord, from, to, smallest_removed, new_first);
+}
+
+/* plugin->u.item.b.kill_units */
+int
+kill_units_tail(coord_t * coord, pos_in_node_t from, pos_in_node_t to,
+ struct carry_kill_data *kdata, reiser4_key * smallest_removed,
+ reiser4_key * new_first)
+{
+ kill_hook_tail(coord, from, to - from + 1, kdata);
+ return do_cut_or_kill(coord, from, to, smallest_removed, new_first);
+}
+
+/* plugin->u.item.b.unit_key */
+reiser4_key *unit_key_tail(const coord_t * coord, reiser4_key * key)
+{
+ assert("vs-375", coord_is_existing_unit(coord));
+
+ item_key_by_coord(coord, key);
+ set_key_offset(key, (get_key_offset(key) + coord->unit_pos));
+
+ return key;
+}
+
+/* plugin->u.item.b.estimate
+ plugin->u.item.b.item_data_by_flow */
+
+/* tail redpage function. It is called from readpage_tail(). */
+static int do_readpage_tail(uf_coord_t *uf_coord, struct page *page)
+{
+ tap_t tap;
+ int result;
+ coord_t coord;
+ lock_handle lh;
+ int count, mapped;
+ struct inode *inode;
+ char *pagedata;
+
+	/* save the passed coord so that it is not moved by the tap */
+ init_lh(&lh);
+ copy_lh(&lh, uf_coord->lh);
+ inode = page->mapping->host;
+ coord_dup(&coord, &uf_coord->coord);
+
+ reiser4_tap_init(&tap, &coord, &lh, ZNODE_READ_LOCK);
+
+ if ((result = reiser4_tap_load(&tap)))
+ goto out_tap_done;
+
+ /* lookup until page is filled up. */
+ for (mapped = 0; mapped < PAGE_SIZE; ) {
+ /* number of bytes to be copied to page */
+ count = item_length_by_coord(&coord) - coord.unit_pos;
+ if (count > PAGE_SIZE - mapped)
+ count = PAGE_SIZE - mapped;
+
+		/* map @page into kernel address space and get its data address */
+ pagedata = kmap_atomic(page);
+
+ /* copy tail item to page */
+ memcpy(pagedata + mapped,
+ ((char *)item_body_by_coord(&coord) + coord.unit_pos),
+ count);
+ mapped += count;
+
+ flush_dcache_page(page);
+
+		/* unmap the page from kernel address space */
+ kunmap_atomic(pagedata);
+
+ /* Getting next tail item. */
+ if (mapped < PAGE_SIZE) {
+ /*
+			 * unlock the page to avoid keeping it locked
+			 * during tree lookup, which takes long-term locks
+ */
+ unlock_page(page);
+
+ /* getting right neighbour. */
+ result = go_dir_el(&tap, RIGHT_SIDE, 0);
+
+ /* lock page back */
+ lock_page(page);
+ if (PageUptodate(page)) {
+ /*
+ * another thread read the page, we have
+ * nothing to do
+ */
+ result = 0;
+ goto out_unlock_page;
+ }
+
+ if (result) {
+ if (result == -E_NO_NEIGHBOR) {
+ /*
+					 * right neighbor is not a formatted
+ * node
+ */
+ result = 0;
+ goto done;
+ } else {
+ goto out_tap_relse;
+ }
+ } else {
+ if (!inode_file_plugin(inode)->
+ owns_item(inode, &coord)) {
+ /* item of another file is found */
+ result = 0;
+ goto done;
+ }
+ }
+ }
+ }
+
+ done:
+ if (mapped != PAGE_SIZE)
+ zero_user_segment(page, mapped, PAGE_SIZE);
+ SetPageUptodate(page);
+ out_unlock_page:
+ unlock_page(page);
+ out_tap_relse:
+ reiser4_tap_relse(&tap);
+ out_tap_done:
+ reiser4_tap_done(&tap);
+ return result;
+}
+
+/**
+ * reiser4_read_dispatch->read_unix_file->page_cache_readahead->
+ * ->reiser4_readpage_dispatch->readpage_unix_file->readpage_tail_unix_file
+ * or
+ * filemap_fault->reiser4_readpage_dispatch->readpage_unix_file->
+ * ->readpage_tail_unix_file
+ *
+ * At the beginning: coord->node is read locked, zloaded, page is locked,
+ * coord is set to existing unit inside of tail item.
+ */
+int readpage_tail_unix_file(void *vp, struct page *page)
+{
+ uf_coord_t *uf_coord = vp;
+ ON_DEBUG(coord_t * coord = &uf_coord->coord);
+ ON_DEBUG(reiser4_key key);
+
+ assert("umka-2515", PageLocked(page));
+ assert("umka-2516", !PageUptodate(page));
+ assert("umka-2517", !jprivate(page) && !PagePrivate(page));
+ assert("umka-2518", page->mapping && page->mapping->host);
+
+ assert("umka-2519", znode_is_loaded(coord->node));
+ assert("umka-2520", item_is_tail(coord));
+ assert("umka-2521", coord_is_existing_unit(coord));
+ assert("umka-2522", znode_is_rlocked(coord->node));
+ assert("umka-2523",
+ page->mapping->host->i_ino ==
+ get_key_objectid(item_key_by_coord(coord, &key)));
+
+ return do_readpage_tail(uf_coord, page);
+}
+
+/**
+ * overwrite_tail
+ * @flow:
+ * @coord:
+ *
+ * Overwrites tail item or its part by user data. Returns number of bytes
+ * written or error code.
+ */
+static int overwrite_tail(flow_t *flow, coord_t *coord)
+{
+ unsigned count;
+
+ assert("vs-570", flow->user == 1);
+ assert("vs-946", flow->data);
+ assert("vs-947", coord_is_existing_unit(coord));
+ assert("vs-948", znode_is_write_locked(coord->node));
+ assert("nikita-3036", reiser4_schedulable());
+
+ count = item_length_by_coord(coord) - coord->unit_pos;
+ if (count > flow->length)
+ count = flow->length;
+
+ if (__copy_from_user((char *)item_body_by_coord(coord) + coord->unit_pos,
+ (const char __user *)flow->data, count))
+ return RETERR(-EFAULT);
+
+ znode_make_dirty(coord->node);
+ return count;
+}
+
+/**
+ * insert_first_tail
+ * @inode:
+ * @flow:
+ * @coord:
+ * @lh:
+ *
+ * Returns number of bytes written or error code.
+ */
+static ssize_t insert_first_tail(struct inode *inode, flow_t *flow,
+ coord_t *coord, lock_handle *lh)
+{
+ int result;
+ loff_t to_write;
+ struct unix_file_info *uf_info;
+
+ if (get_key_offset(&flow->key) != 0) {
+ /*
+		 * the file is empty and we are not writing to its beginning.
+		 * Create a hole at the beginning of the file. On success
+		 * insert_flow returns 0 as the number of written bytes, which
+		 * is what we have to return when padding a file with holes
+ */
+ flow->data = NULL;
+ flow->length = get_key_offset(&flow->key);
+ set_key_offset(&flow->key, 0);
+ /*
+		 * holes in files built of tails are stored just as if they
+		 * were real data consisting of all zeros.
+ */
+ inode_add_bytes(inode, flow->length);
+ result = reiser4_insert_flow(coord, lh, flow);
+ if (flow->length)
+ inode_sub_bytes(inode, flow->length);
+
+ uf_info = unix_file_inode_data(inode);
+
+ /*
+		 * first item insertion is only possible when writing to an
+		 * empty file or performing tail conversion
+ */
+ assert("", (uf_info->container == UF_CONTAINER_EMPTY ||
+ (reiser4_inode_get_flag(inode,
+ REISER4_PART_MIXED) &&
+ reiser4_inode_get_flag(inode,
+ REISER4_PART_IN_CONV))));
+ /* if file was empty - update its state */
+ if (result == 0 && uf_info->container == UF_CONTAINER_EMPTY)
+ uf_info->container = UF_CONTAINER_TAILS;
+ return result;
+ }
+
+ inode_add_bytes(inode, flow->length);
+
+ to_write = flow->length;
+ result = reiser4_insert_flow(coord, lh, flow);
+ if (flow->length)
+ inode_sub_bytes(inode, flow->length);
+ return (to_write - flow->length) ? (to_write - flow->length) : result;
+}
+
+/**
+ * append_tail
+ * @inode:
+ * @flow:
+ * @coord:
+ * @lh:
+ *
+ * Returns number of bytes written or error code.
+ */
+static ssize_t append_tail(struct inode *inode,
+ flow_t *flow, coord_t *coord, lock_handle *lh)
+{
+ int result;
+ reiser4_key akey;
+ loff_t to_write;
+
+ if (!keyeq(&flow->key, append_key_tail(coord, &akey))) {
+ flow->data = NULL;
+ flow->length = get_key_offset(&flow->key) - get_key_offset(&akey);
+ set_key_offset(&flow->key, get_key_offset(&akey));
+ /*
+		 * holes in files built of tails are stored just as if they
+		 * were real data consisting of all zeros.
+ */
+ inode_add_bytes(inode, flow->length);
+ result = reiser4_insert_flow(coord, lh, flow);
+ if (flow->length)
+ inode_sub_bytes(inode, flow->length);
+ return result;
+ }
+
+ inode_add_bytes(inode, flow->length);
+
+ to_write = flow->length;
+ result = reiser4_insert_flow(coord, lh, flow);
+ if (flow->length)
+ inode_sub_bytes(inode, flow->length);
+ return (to_write - flow->length) ? (to_write - flow->length) : result;
+}
+
+/**
+ * write_tail_reserve_space - reserve space for tail write operation
+ * @inode:
+ *
+ * Estimates and reserves space which may be required for writing one flow to a
+ * file
+ */
+static int write_tail_reserve_space(struct inode *inode)
+{
+ __u64 count;
+ reiser4_tree *tree;
+
+ /*
+	 * to write one flow to a file by tails we have to reserve disk space for:
+	 *
+	 * 1. find_file_item may have to insert an empty node into the tree (empty
+ * leaf node between two extent items). This requires 1 block and
+ * number of blocks which are necessary to perform insertion of an
+ * internal item into twig level.
+ *
+ * 2. flow insertion
+ *
+ * 3. stat data update
+ */
+ tree = meta_subvol_tree();
+ count = estimate_one_insert_item(tree) +
+ estimate_insert_flow(tree->height) +
+ estimate_one_insert_item(tree);
+ grab_space_enable();
+ return reiser4_grab_space(count, 0 , get_meta_subvol());
+}
+
+#define PAGE_PER_FLOW 4
+
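+/*
+ * Pre-fault up to PAGE_PER_FLOW pages of the user buffer so that copying
+ * from it later does not trigger a major page fault while long-term tree
+ * locks are held. The returned number of bytes also caps how much of the
+ * flow is written in one pass of write_tail_noreserve().
+ */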
+static loff_t faultin_user_pages(const char __user *buf, size_t count)
+{
+ loff_t faulted;
+ int to_fault;
+
+ if (count > PAGE_PER_FLOW * PAGE_SIZE)
+ count = PAGE_PER_FLOW * PAGE_SIZE;
+ faulted = 0;
+ while (count > 0) {
+ to_fault = PAGE_SIZE;
+ if (count < to_fault)
+ to_fault = count;
+ fault_in_pages_readable(buf + faulted, to_fault);
+ count -= to_fault;
+ faulted += to_fault;
+ }
+ return faulted;
+}
+
+ssize_t write_tail_noreserve(struct file *file,
+ struct inode * inode,
+ const char __user *buf,
+ size_t count, loff_t *pos)
+{
+ struct hint hint;
+ int result;
+ flow_t flow;
+ coord_t *coord;
+ lock_handle *lh;
+ znode *loaded;
+
+ assert("edward-1548", inode != NULL);
+
+ result = load_file_hint(file, &hint);
+ BUG_ON(result != 0);
+
+ flow.length = faultin_user_pages(buf, count);
+ flow.user = 1;
+ memcpy(&flow.data, &buf, sizeof(buf));
+ flow.op = WRITE_OP;
+ inode_file_plugin(inode)->build_body_key(inode, *pos, &flow.key);
+ result = find_file_item(&hint, &flow.key, ZNODE_WRITE_LOCK, inode);
+ if (IS_CBKERR(result))
+ return result;
+
+ coord = &hint.ext_coord.coord;
+ lh = hint.ext_coord.lh;
+
+ result = zload(coord->node);
+ BUG_ON(result != 0);
+ loaded = coord->node;
+
+ if (coord->between == AFTER_UNIT) {
+ /* append with data or hole */
+ result = append_tail(inode, &flow, coord, lh);
+ } else if (coord->between == AT_UNIT) {
+ /* overwrite */
+ result = overwrite_tail(&flow, coord);
+ } else {
+ /* no items of this file yet. insert data or hole */
+ result = insert_first_tail(inode, &flow, coord, lh);
+ }
+ zrelse(loaded);
+ if (result < 0) {
+ done_lh(lh);
+ return result;
+ }
+
+ /* seal and unlock znode */
+ hint.ext_coord.valid = 0;
+ if (hint.ext_coord.valid)
+ reiser4_set_hint(&hint, &flow.key, ZNODE_WRITE_LOCK);
+ else
+ reiser4_unset_hint(&hint);
+
+ save_file_hint(file, &hint);
+ return result;
+}
+
+/**
+ * @file: file to write to
+ * @buf: address of user-space buffer
+ * @count: number of bytes to write
+ * @pos: position in file to write to
+ *
+ * Returns number of written bytes or error code.
+ */
+ssize_t write_tail_unix_file(struct file *file, struct inode * inode,
+ const char __user *buf, size_t count, loff_t *pos)
+{
+ if (write_tail_reserve_space(inode))
+ return RETERR(-ENOSPC);
+ return write_tail_noreserve(file, inode, buf, count, pos);
+}
+
+#if REISER4_DEBUG
+static int coord_matches_key_tail(struct inode *inode,
+ const coord_t *coord, const reiser4_key *key)
+{
+ reiser4_key item_key;
+
+ assert("vs-1356", coord_is_existing_unit(coord));
+ assert("vs-1354", keylt(key, append_key_tail(coord, &item_key)));
+ assert("vs-1355", keyge(key, item_key_by_coord(coord, &item_key)));
+ return get_key_offset(key) ==
+ get_key_offset(&item_key) + coord->unit_pos;
+}
+#endif
+
+int read_tail_unix_file(struct file *file, flow_t *f, hint_t *hint)
+{
+ unsigned count;
+ int item_length;
+ coord_t *coord;
+ uf_coord_t *uf_coord;
+
+ uf_coord = &hint->ext_coord;
+ coord = &uf_coord->coord;
+
+ assert("vs-571", f->user == 1);
+ assert("vs-571", f->data);
+ assert("vs-967", coord && coord->node);
+ assert("vs-1117", znode_is_rlocked(coord->node));
+ assert("vs-1118", znode_is_loaded(coord->node));
+
+ assert("nikita-3037", reiser4_schedulable());
+ assert("vs-1357", coord_matches_key_tail(file_inode(file),
+ coord, &f->key));
+ /* calculate number of bytes to read off the item */
+ item_length = item_length_by_coord(coord);
+ count = item_length_by_coord(coord) - coord->unit_pos;
+ if (count > f->length)
+ count = f->length;
+
+	/* the user page has to be brought in so that a major page fault does
+	 * not occur here while a long-term lock is held */
+ if (__copy_to_user((char __user *)f->data,
+ ((char *)item_body_by_coord(coord) + coord->unit_pos),
+ count))
+ return RETERR(-EFAULT);
+
+ /* probably mark_page_accessed() should only be called if
+ * coord->unit_pos is zero. */
+ mark_page_accessed(znode_page(coord->node));
+ move_flow_forward(f, count);
+
+ coord->unit_pos += count;
+ if (item_length == coord->unit_pos) {
+ coord->unit_pos--;
+ coord->between = AFTER_UNIT;
+ }
+ reiser4_set_hint(hint, &f->key, ZNODE_READ_LOCK);
+ return 0;
+}
+
+reiser4_key *append_key_tail(const coord_t *coord, reiser4_key *key)
+{
+ item_key_by_coord(coord, key);
+ set_key_offset(key, get_key_offset(key) + item_length_by_coord(coord));
+ return key;
+}
+
+/* plugin->u.item.s.file.init_coord_extension */
+void init_coord_extension_tail(uf_coord_t * uf_coord, loff_t lookuped)
+{
+ uf_coord->valid = 1;
+}
+
+/*
+ plugin->u.item.s.file.get_block
+*/
+int
+get_block_address_tail(const coord_t * coord, sector_t lblock, sector_t * block)
+{
+ assert("nikita-3252", znode_get_level(coord->node) == LEAF_LEVEL);
+
+ if (reiser4_blocknr_is_fake(znode_get_block(coord->node)))
+		/* if the node hasn't obtained its block number yet, return 0.
+		 * Let's avoid upsetting users with cosmic numbers beyond
+		 * the device capacity. */
+ *block = 0;
+ else
+ *block = *znode_get_block(coord->node);
+ return 0;
+}
+
+/*
+ * Local variables:
+ * c-indentation-style: "K&R"
+ * mode-name: "LC"
+ * c-basic-offset: 8
+ * tab-width: 8
+ * fill-column: 79
+ * scroll-step: 1
+ * End:
+ */
diff -urN --no-dereference linux-5.10.2.orig/fs/reiser4/plugin/item/tail.h linux-5.10.2/fs/reiser4/plugin/item/tail.h
--- linux-5.10.2.orig/fs/reiser4/plugin/item/tail.h 1970-01-01 01:00:00.000000000 +0100
+++ linux-5.10.2/fs/reiser4/plugin/item/tail.h 2020-12-23 16:07:46.129813290 +0100
@@ -0,0 +1,59 @@
+/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
+
+#if !defined( __REISER4_TAIL_H__ )
+#define __REISER4_TAIL_H__
+
+struct tail_coord_extension {
+ int not_used;
+};
+
+struct cut_list;
+
+/* plugin->u.item.b.* */
+reiser4_key *max_key_inside_tail(const coord_t *, reiser4_key *);
+int can_contain_key_tail(const coord_t * coord, const reiser4_key * key,
+ const reiser4_item_data *);
+int mergeable_tail(const coord_t * p1, const coord_t * p2);
+pos_in_node_t nr_units_tail(const coord_t *);
+lookup_result lookup_tail(const reiser4_key *, lookup_bias, coord_t *);
+int paste_tail(coord_t *, reiser4_item_data *, carry_plugin_info *);
+int can_shift_tail(unsigned free_space, coord_t * source,
+ znode * target, shift_direction, unsigned *size,
+ unsigned want);
+void copy_units_tail(coord_t * target, coord_t * source, unsigned from,
+ unsigned count, shift_direction, unsigned free_space);
+int kill_hook_tail(const coord_t *, pos_in_node_t from, pos_in_node_t count,
+ struct carry_kill_data *);
+int cut_units_tail(coord_t *, pos_in_node_t from, pos_in_node_t to,
+ struct carry_cut_data *, reiser4_key * smallest_removed,
+ reiser4_key * new_first);
+int kill_units_tail(coord_t *, pos_in_node_t from, pos_in_node_t to,
+ struct carry_kill_data *, reiser4_key * smallest_removed,
+ reiser4_key * new_first);
+reiser4_key *unit_key_tail(const coord_t *, reiser4_key *);
+
+/* plugin->u.item.s.* */
+ssize_t write_tail_noreserve(struct file *file, struct inode * inode,
+ const char __user *buf, size_t count,
+ loff_t *pos);
+ssize_t write_tail_unix_file(struct file *file, struct inode * inode,
+ const char __user *buf, size_t count, loff_t *pos);
+int read_tail_unix_file(struct file *, flow_t *, hint_t *);
+int readpage_tail_unix_file(void *vp, struct page *page);
+reiser4_key *append_key_tail(const coord_t *, reiser4_key *);
+void init_coord_extension_tail(uf_coord_t *, loff_t offset);
+int get_block_address_tail(const coord_t *, sector_t, sector_t *);
+
+/* __REISER4_TAIL_H__ */
+#endif
+
+/* Make Linus happy.
+ Local variables:
+ c-indentation-style: "K&R"
+ mode-name: "LC"
+ c-basic-offset: 8
+ tab-width: 8
+ fill-column: 120
+ scroll-step: 1
+ End:
+*/
diff -urN --no-dereference linux-5.10.2.orig/fs/reiser4/plugin/Makefile linux-5.10.2/fs/reiser4/plugin/Makefile
--- linux-5.10.2.orig/fs/reiser4/plugin/Makefile 1970-01-01 01:00:00.000000000 +0100
+++ linux-5.10.2/fs/reiser4/plugin/Makefile 2020-12-23 16:07:46.129813290 +0100
@@ -0,0 +1,28 @@
+obj-$(CONFIG_REISER4_FS) += plugins.o
+
+plugins-objs := \
+ plugin.o \
+ plugin_set.o \
+ object.o \
+ inode_ops.o \
+ inode_ops_rename.o \
+ file_ops.o \
+ file_ops_readdir.o \
+ file_plugin_common.o \
+ dir_plugin_common.o \
+ digest.o \
+ hash.o \
+ fibration.o \
+ tail_policy.o \
+ regular.o
+
+obj-$(CONFIG_REISER4_FS) += item/
+obj-$(CONFIG_REISER4_FS) += file/
+obj-$(CONFIG_REISER4_FS) += dir/
+obj-$(CONFIG_REISER4_FS) += node/
+obj-$(CONFIG_REISER4_FS) += compress/
+obj-$(CONFIG_REISER4_FS) += space/
+obj-$(CONFIG_REISER4_FS) += disk_format/
+obj-$(CONFIG_REISER4_FS) += security/
+obj-$(CONFIG_REISER4_FS) += volume/
+obj-$(CONFIG_REISER4_FS) += dst/
diff -urN --no-dereference linux-5.10.2.orig/fs/reiser4/plugin/node/Makefile linux-5.10.2/fs/reiser4/plugin/node/Makefile
--- linux-5.10.2.orig/fs/reiser4/plugin/node/Makefile 1970-01-01 01:00:00.000000000 +0100
+++ linux-5.10.2/fs/reiser4/plugin/node/Makefile 2020-12-23 16:07:46.129813290 +0100
@@ -0,0 +1,6 @@
+obj-$(CONFIG_REISER4_FS) += node_plugins.o
+
+node_plugins-objs := \
+ node.o \
+ node40.o \
+ node41.o
diff -urN --no-dereference linux-5.10.2.orig/fs/reiser4/plugin/node/node40.c linux-5.10.2/fs/reiser4/plugin/node/node40.c
--- linux-5.10.2.orig/fs/reiser4/plugin/node/node40.c 1970-01-01 01:00:00.000000000 +0100
+++ linux-5.10.2/fs/reiser4/plugin/node/node40.c 2020-12-23 16:07:46.130813304 +0100
@@ -0,0 +1,3142 @@
+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
+
+#include "../../debug.h"
+#include "../../key.h"
+#include "../../coord.h"
+#include "../plugin_header.h"
+#include "../item/item.h"
+#include "node.h"
+#include "node40.h"
+#include "../plugin.h"
+#include "../../jnode.h"
+#include "../../znode.h"
+#include "../../pool.h"
+#include "../../carry.h"
+#include "../../tap.h"
+#include "../../tree.h"
+#include "../../super.h"
+#include "../../reiser4.h"
+
+#include <asm/uaccess.h>
+#include <linux/types.h>
+#include <linux/prefetch.h>
+
+/* leaf 40 format:
+
+ [node header | item 0, item 1, .., item N-1 | free space | item_head N-1, .. item_head 1, item head 0 ]
+ plugin_id (16) key
+ free_space (16) pluginid (16)
+ free_space_start (16) offset (16)
+ level (8)
+ num_items (16)
+ magic (32)
+ flush_time (32)
+*/
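+/* A worked example of the layout above (illustrative only; the sizes are
+   hypothetical, the real ones depend on the key format): assume a 4096-byte
+   node, a 40-byte node header and 32-byte item headers, holding two items of
+   100 and 200 bytes.  Item bodies are packed left to right: item 0 starts at
+   offset 40, item 1 at offset 140, so free_space_start == 340.  Item headers
+   grow from the node end right to left: the header of item 0 occupies the
+   last 32 bytes, the header of item 1 the 32 bytes before it.  Hence
+   free_space == 4096 - 340 - 2 * 32 == 3692. */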
+/* NIKITA-FIXME-HANS: I told you guys not less than 10 times to not call it r4fs. Change to "ReIs". */
+/* magic number that is stored in ->magic field of node header */
+static const __u32 REISER4_NODE40_MAGIC = 0x52344653; /* (*(__u32 *)"R4FS"); */
+
+static int prepare_for_update(znode * left, znode * right,
+ carry_plugin_info * info);
+
+/* header of node of reiser40 format is at the beginning of node */
+static inline node40_header *node40_node_header(const znode * node /* node to
+ * query */ )
+{
+ assert("nikita-567", node != NULL);
+ assert("nikita-568", znode_page(node) != NULL);
+ assert("nikita-569", zdata(node) != NULL);
+ return (node40_header *) zdata(node);
+}
+
+/* functions to get/set fields of node40_header */
+#define nh40_get_magic(nh) le32_to_cpu(get_unaligned(&(nh)->magic))
+#define nh40_get_free_space(nh) le16_to_cpu(get_unaligned(&(nh)->free_space))
+#define nh40_get_free_space_start(nh) le16_to_cpu(get_unaligned(&(nh)->free_space_start))
+#define nh40_get_level(nh) get_unaligned(&(nh)->level)
+#define nh40_get_num_items(nh) le16_to_cpu(get_unaligned(&(nh)->nr_items))
+#define nh40_get_flush_id(nh) le64_to_cpu(get_unaligned(&(nh)->flush_id))
+
+#define nh40_set_magic(nh, value) put_unaligned(cpu_to_le32(value), &(nh)->magic)
+#define nh40_set_free_space(nh, value) put_unaligned(cpu_to_le16(value), &(nh)->free_space)
+#define nh40_set_free_space_start(nh, value) put_unaligned(cpu_to_le16(value), &(nh)->free_space_start)
+#define nh40_set_level(nh, value) put_unaligned(value, &(nh)->level)
+#define nh40_set_num_items(nh, value) put_unaligned(cpu_to_le16(value), &(nh)->nr_items)
+#define nh40_set_mkfs_id(nh, value) put_unaligned(cpu_to_le32(value), &(nh)->mkfs_id)
+
+/* plugin field of node header should be read/set by
+ plugin_by_disk_id/save_disk_plugin */
+
+/* array of item headers is at the end of node */
+static inline item_header40 *node40_ih_at(const znode * node, unsigned pos)
+{
+ return (item_header40 *) (zdata(node) + znode_size(node)) - pos - 1;
+}
+
+/* ( page_address( node -> pg ) + PAGE_CACHE_SIZE ) - pos - 1
+ */
+static inline item_header40 *node40_ih_at_coord(const coord_t * coord)
+{
+ return (item_header40 *) (zdata(coord->node) +
+ znode_size(coord->node)) - (coord->item_pos) -
+ 1;
+}
+
+/* functions to get/set fields of item_header40 */
+#define ih40_get_offset(ih) le16_to_cpu(get_unaligned(&(ih)->offset))
+
+#define ih40_set_offset(ih, value) put_unaligned(cpu_to_le16(value), &(ih)->offset)
+
+/* plugin field of item header should be read/set by
+ plugin_by_disk_id/save_disk_plugin */
+
+/* plugin methods */
+
+/* plugin->u.node.item_overhead
+ look for description of this method in plugin/node/node.h */
+size_t
+item_overhead_node40(const znode * node UNUSED_ARG, flow_t * f UNUSED_ARG)
+{
+ return sizeof(item_header40);
+}
+
+/* plugin->u.node.free_space
+ look for description of this method in plugin/node/node.h */
+size_t free_space_node40(znode * node)
+{
+ assert("nikita-577", node != NULL);
+ assert("nikita-578", znode_is_loaded(node));
+ assert("nikita-579", zdata(node) != NULL);
+
+ return nh40_get_free_space(node40_node_header(node));
+}
+
+/* private inline version of node40_num_of_items() for use in this file. This
+   is necessary because the address of node40_num_of_items() is taken, so it
+   is never inlined. */
+static inline short node40_num_of_items_internal(const znode * node)
+{
+ return nh40_get_num_items(node40_node_header(node));
+}
+
+#if REISER4_DEBUG
+static inline void check_num_items(const znode * node)
+{
+ assert("nikita-2749",
+ node40_num_of_items_internal(node) == node->nr_items);
+ assert("nikita-2746", znode_is_write_locked(node));
+}
+#else
+#define check_num_items(node) noop
+#endif
+
+/* plugin->u.node.num_of_items
+ look for description of this method in plugin/node/node.h */
+int num_of_items_node40(const znode * node)
+{
+ return node40_num_of_items_internal(node);
+}
+
+static void
+node40_set_num_items(znode * node, node40_header * nh, unsigned value)
+{
+ assert("nikita-2751", node != NULL);
+ assert("nikita-2750", nh == node40_node_header(node));
+
+ check_num_items(node);
+ nh40_set_num_items(nh, value);
+ node->nr_items = value;
+ check_num_items(node);
+}
+
+/* plugin->u.node.item_by_coord
+ look for description of this method in plugin/node/node.h */
+char *item_by_coord_node40(const coord_t * coord)
+{
+ item_header40 *ih;
+ char *p;
+
+ /* @coord is set to existing item */
+ assert("nikita-596", coord != NULL);
+ assert("vs-255", coord_is_existing_item(coord));
+
+ ih = node40_ih_at_coord(coord);
+ p = zdata(coord->node) + ih40_get_offset(ih);
+ return p;
+}
+
+/* plugin->u.node.length_by_coord
+ look for description of this method in plugin/node/node.h */
+int length_by_coord_node40(const coord_t * coord)
+{
+ item_header40 *ih;
+ int result;
+
+ /* @coord is set to existing item */
+ assert("vs-256", coord != NULL);
+ assert("vs-257", coord_is_existing_item(coord));
+
+ ih = node40_ih_at_coord(coord);
+ if ((int)coord->item_pos ==
+ node40_num_of_items_internal(coord->node) - 1)
+ result =
+ nh40_get_free_space_start(node40_node_header(coord->node)) -
+ ih40_get_offset(ih);
+ else
+ result = ih40_get_offset(ih - 1) - ih40_get_offset(ih);
+
+ return result;
+}
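+
+/* Because item headers are stored right to left, (ih - 1) is the header of
+   the next item.  Illustrative example with hypothetical offsets: if item 2
+   starts at offset 500 and item 3 starts at offset 620, the length of item 2
+   is 620 - 500 == 120; for the last item of the node the length is
+   free_space_start - offset instead. */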
+
+static pos_in_node_t
+node40_item_length(const znode * node, pos_in_node_t item_pos)
+{
+ item_header40 *ih;
+ pos_in_node_t result;
+
+ /* @coord is set to existing item */
+ assert("vs-256", node != NULL);
+ assert("vs-257", node40_num_of_items_internal(node) > item_pos);
+
+ ih = node40_ih_at(node, item_pos);
+ if (item_pos == node40_num_of_items_internal(node) - 1)
+ result =
+ nh40_get_free_space_start(node40_node_header(node)) -
+ ih40_get_offset(ih);
+ else
+ result = ih40_get_offset(ih - 1) - ih40_get_offset(ih);
+
+ return result;
+}
+
+/* plugin->u.node.plugin_by_coord
+ look for description of this method in plugin/node/node.h */
+item_plugin *plugin_by_coord_node40(const coord_t * coord)
+{
+ item_header40 *ih;
+ item_plugin *result;
+
+ /* @coord is set to existing item */
+ assert("vs-258", coord != NULL);
+ assert("vs-259", coord_is_existing_item(coord));
+
+ ih = node40_ih_at_coord(coord);
+	/* pass NULL instead of the current tree. This is a time-critical call. */
+ result = item_plugin_by_disk_id(&ih->plugin_id);
+ return result;
+}
+
+/* plugin->u.node.key_at
+ look for description of this method in plugin/node/node.h */
+reiser4_key *key_at_node40(const coord_t * coord, reiser4_key * key)
+{
+ item_header40 *ih;
+
+ assert("nikita-1765", coord_is_existing_item(coord));
+
+ /* @coord is set to existing item */
+ ih = node40_ih_at_coord(coord);
+ memcpy(key, &ih->key, sizeof(reiser4_key));
+ return key;
+}
+
+/* VS-FIXME-HANS: please review whether the below are properly disabled when debugging is disabled */
+
+#define NODE_INCSTAT(n, counter) \
+ reiser4_stat_inc_at_level(znode_get_level(n), node.lookup.counter)
+
+#define NODE_ADDSTAT(n, counter, val) \
+ reiser4_stat_add_at_level(znode_get_level(n), node.lookup.counter, val)
+
+/* plugin->u.node.lookup
+ look for description of this method in plugin/node/node.h */
+node_search_result lookup_node40(znode * node /* node to query */ ,
+ const reiser4_key * key /* key to look for */ ,
+ lookup_bias bias /* search bias */ ,
+ coord_t * coord /* resulting coord */ )
+{
+ int left;
+ int right;
+ int found;
+ int items;
+
+ item_header40 *lefth;
+ item_header40 *righth;
+
+ item_plugin *iplug;
+ item_header40 *bstop;
+ item_header40 *ih;
+ cmp_t order;
+
+ assert("nikita-583", node != NULL);
+ assert("nikita-584", key != NULL);
+ assert("nikita-585", coord != NULL);
+ assert("nikita-2693", znode_is_any_locked(node));
+
+ items = node_num_items(node);
+
+ if (unlikely(items == 0)) {
+ coord_init_first_unit(coord, node);
+ return NS_NOT_FOUND;
+ }
+
+ /* binary search for item that can contain given key */
+ left = 0;
+ right = items - 1;
+ coord->node = node;
+ coord_clear_iplug(coord);
+ found = 0;
+
+ lefth = node40_ih_at(node, left);
+ righth = node40_ih_at(node, right);
+
+	/* It is known that for small arrays sequential search is on average
+	   more efficient than binary search. This is because sequential
+	   search is coded as a tight loop that compilers can optimize better,
+	   and for small array sizes the gain from this optimization makes
+	   sequential search the winner. Another, maybe more important, reason
+	   is that sequential scanning is more CPU cache friendly, whereas
+	   binary search effectively defeats CPU caching.
+
+	   Critical here is the notion of "smallness". A reasonable value of
+	   REISER4_SEQ_SEARCH_BREAK can be found by playing with the code in
+	   fs/reiser4/ulevel/ulevel.c:test_search().
+
+	   Don't try to further optimize the sequential search by scanning
+	   from right to left in an attempt to use a more efficient loop
+	   termination condition (comparison with 0). This doesn't work.
+	*/
+
+ while (right - left >= REISER4_SEQ_SEARCH_BREAK) {
+ int median;
+ item_header40 *medianh;
+
+ median = (left + right) / 2;
+ medianh = node40_ih_at(node, median);
+
+ assert("nikita-1084", median >= 0);
+ assert("nikita-1085", median < items);
+ switch (keycmp(key, &medianh->key)) {
+ case LESS_THAN:
+ right = median;
+ righth = medianh;
+ break;
+ default:
+ wrong_return_value("nikita-586", "keycmp");
+ case GREATER_THAN:
+ left = median;
+ lefth = medianh;
+ break;
+ case EQUAL_TO:
+ do {
+ --median;
+ /* headers are ordered from right to left */
+ ++medianh;
+ } while (median >= 0 && keyeq(key, &medianh->key));
+ right = left = median + 1;
+ ih = lefth = righth = medianh - 1;
+ found = 1;
+ break;
+ }
+ }
+ /* sequential scan. Item headers, and, therefore, keys are stored at
+ the rightmost part of a node from right to left. We are trying to
+ access memory from left to right, and hence, scan in _descending_
+ order of item numbers.
+ */
+ if (!found) {
+ for (left = right, ih = righth; left >= 0; ++ih, --left) {
+ cmp_t comparison;
+
+ prefetchkey(&(ih + 1)->key);
+ comparison = keycmp(&ih->key, key);
+ if (comparison == GREATER_THAN)
+ continue;
+ if (comparison == EQUAL_TO) {
+ found = 1;
+ do {
+ --left;
+ ++ih;
+ } while (left >= 0 && keyeq(&ih->key, key));
+ ++left;
+ --ih;
+ } else {
+ assert("nikita-1256", comparison == LESS_THAN);
+ }
+ break;
+ }
+ if (unlikely(left < 0))
+ left = 0;
+ }
+
+ assert("nikita-3212", right >= left);
+ assert("nikita-3214",
+ equi(found, keyeq(&node40_ih_at(node, left)->key, key)));
+
+ coord_set_item_pos(coord, left);
+ coord->unit_pos = 0;
+ coord->between = AT_UNIT;
+
+	/* key < leftmost key in a node, or the node is corrupted and keys
+	   are not sorted */
+ bstop = node40_ih_at(node, (unsigned)left);
+ order = keycmp(&bstop->key, key);
+ if (unlikely(order == GREATER_THAN)) {
+ if (unlikely(left != 0)) {
+ /* screw up */
+ warning("nikita-587", "Key less than %i key in a node",
+ left);
+ reiser4_print_key("key", key);
+ reiser4_print_key("min", &bstop->key);
+ print_coord_content("coord", coord);
+ return RETERR(-EIO);
+ } else {
+ coord->between = BEFORE_UNIT;
+ return NS_NOT_FOUND;
+ }
+ }
+ /* left <= key, ok */
+ iplug = item_plugin_by_disk_id(&bstop->plugin_id);
+
+ if (unlikely(iplug == NULL)) {
+ warning("nikita-588", "Unknown plugin %i",
+ le16_to_cpu(get_unaligned(&bstop->plugin_id)));
+ reiser4_print_key("key", key);
+ print_coord_content("coord", coord);
+ return RETERR(-EIO);
+ }
+
+ coord_set_iplug(coord, iplug);
+
+ /* if exact key from item header was found by binary search, no
+ further checks are necessary. */
+ if (found) {
+ assert("nikita-1259", order == EQUAL_TO);
+ return NS_FOUND;
+ }
+ if (iplug->b.max_key_inside != NULL) {
+ reiser4_key max_item_key;
+
+ /* key > max_item_key --- outside of an item */
+ if (keygt(key, iplug->b.max_key_inside(coord, &max_item_key))) {
+ coord->unit_pos = 0;
+ coord->between = AFTER_ITEM;
+			/* FIXME-VS: the key we are looking for does not fit
+			   into the found item. Return NS_NOT_FOUND then.
+			   Without that the following case does not work: there
+			   is an extent of file 10000, 10001. File 10000, 10002
+			   has just been created. When writing to position 0 in
+			   that file, traverse_tree will stop here on the twig
+			   level, whereas we want it to go down to the leaf level
+			*/
+ return NS_NOT_FOUND;
+ }
+ }
+
+ if (iplug->b.lookup != NULL) {
+ return (node_search_result)iplug->b.lookup(key, bias, coord);
+ } else {
+ assert("nikita-1260", order == LESS_THAN);
+ coord->between = AFTER_UNIT;
+ return (bias == FIND_EXACT) ? NS_NOT_FOUND : NS_FOUND;
+ }
+}
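+
+/* Sketch of the lookup strategy above (the numbers are illustrative and
+   REISER4_SEQ_SEARCH_BREAK is taken as 10 only for this example): with 64
+   items the binary phase roughly halves the search window (63, 31, 15, 7);
+   once the window is smaller than the break value, the remaining headers are
+   scanned sequentially in descending item order, which is ascending memory
+   order, because item headers are stored from right to left. */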
+
+#undef NODE_ADDSTAT
+#undef NODE_INCSTAT
+
+/* plugin->u.node.estimate
+ look for description of this method in plugin/node/node.h */
+size_t estimate_node40(znode * node)
+{
+ size_t result;
+
+ assert("nikita-597", node != NULL);
+
+ result = free_space_node40(node) - sizeof(item_header40);
+
+ return (result > 0) ? result : 0;
+}
+
+/* plugin->u.node.check
+ look for description of this method in plugin/node/node.h */
+int check_node40(const znode * node /* node to check */ ,
+ __u32 flags /* check flags */ ,
+ const char **error /* where to store error message */ )
+{
+ reiser4_tree *tree;
+ int nr_items;
+ int i;
+ reiser4_key prev;
+ unsigned old_offset;
+ tree_level level;
+ coord_t coord;
+ int result;
+
+ assert("nikita-580", node != NULL);
+ assert("nikita-581", error != NULL);
+ assert("nikita-2948", znode_is_loaded(node));
+
+ if (ZF_ISSET(node, JNODE_HEARD_BANSHEE))
+ return 0;
+
+ assert("nikita-582", zdata(node) != NULL);
+
+ tree = znode_get_tree(node);
+
+ nr_items = node40_num_of_items_internal(node);
+ if (nr_items < 0) {
+ *error = "Negative number of items";
+ return -1;
+ }
+
+ if (flags & REISER4_NODE_DKEYS)
+ prev = *znode_get_ld_key((znode *) node);
+ else
+ prev = *reiser4_min_key();
+
+ old_offset = 0;
+ coord_init_zero(&coord);
+ coord.node = (znode *) node;
+ coord.unit_pos = 0;
+ coord.between = AT_UNIT;
+ level = znode_get_level(node);
+ for (i = 0; i < nr_items; i++) {
+ item_header40 *ih;
+ reiser4_key unit_key;
+ unsigned j;
+
+ ih = node40_ih_at(node, (unsigned)i);
+ coord_set_item_pos(&coord, i);
+ if ((ih40_get_offset(ih) >=
+ znode_size(node) - nr_items * sizeof(item_header40)) ||
+ (ih40_get_offset(ih) < sizeof(node40_header))) {
+ *error = "Offset is out of bounds";
+ return -1;
+ }
+ if (ih40_get_offset(ih) <= old_offset) {
+ *error = "Offsets are in wrong order";
+ return -1;
+ }
+ if ((i == 0) && (ih40_get_offset(ih) != sizeof(node40_header))) {
+ *error = "Wrong offset of first item";
+ return -1;
+ }
+ old_offset = ih40_get_offset(ih);
+
+ if (keygt(&prev, &ih->key)) {
+ *error = "Keys are in wrong order";
+ return -1;
+ }
+ if (!keyeq(&ih->key, unit_key_by_coord(&coord, &unit_key))) {
+ *error = "Wrong key of first unit";
+ return -1;
+ }
+ prev = ih->key;
+ for (j = 0; j < coord_num_units(&coord); ++j) {
+ coord.unit_pos = j;
+ unit_key_by_coord(&coord, &unit_key);
+ if (keygt(&prev, &unit_key)) {
+ *error = "Unit keys are in wrong order";
+ return -1;
+ }
+ prev = unit_key;
+ }
+ coord.unit_pos = 0;
+ if (level != TWIG_LEVEL && item_is_extent(&coord)) {
+ *error = "extent on the wrong level";
+ return -1;
+ }
+ if (level == LEAF_LEVEL && item_is_internal(&coord)) {
+ *error = "internal item on the wrong level";
+ return -1;
+ }
+ if (level != LEAF_LEVEL &&
+ !item_is_internal(&coord) && !item_is_extent(&coord)) {
+ *error = "wrong item on the internal level";
+ return -1;
+ }
+ if (level > TWIG_LEVEL && !item_is_internal(&coord)) {
+ *error = "non-internal item on the internal level";
+ return -1;
+ }
+#if REISER4_DEBUG
+ if (item_plugin_by_coord(&coord)->b.check
+ && item_plugin_by_coord(&coord)->b.check(&coord, error))
+ return -1;
+#endif
+ if ((flags & REISER4_NODE_CHECK_MERGEABLE) && i) {
+ coord_t prev_coord;
+ /* two neighboring items can not be mergeable */
+ coord_dup(&prev_coord, &coord);
+ coord_prev_item(&prev_coord);
+ if (are_items_mergeable(&prev_coord, &coord)) {
+ *error = "mergeable items in one node";
+ return -1;
+ }
+
+ }
+ }
+ if ((flags & REISER4_NODE_DKEYS) && !node_is_empty(node)) {
+ coord_t coord;
+ reiser4_key mkey;
+
+ coord_init_last_unit(&coord, node);
+ max_item_key_by_coord(&coord, &mkey);
+
+ read_lock_dk(tree);
+ result = keygt(&mkey, znode_get_rd_key((znode *) node));
+ read_unlock_dk(tree);
+ if (result) {
+ *error = "key of rightmost item is too large";
+ return -1;
+ }
+ }
+ if (flags & REISER4_NODE_DKEYS) {
+ read_lock_tree();
+ read_lock_dk(tree);
+
+ flags |= REISER4_NODE_TREE_STABLE;
+
+ if (keygt(&prev, znode_get_rd_key((znode *) node))) {
+ if (flags & REISER4_NODE_TREE_STABLE) {
+ *error = "Last key is greater than rdkey";
+ read_unlock_dk(tree);
+ read_unlock_tree();
+ return -1;
+ }
+ }
+ if (keygt
+ (znode_get_ld_key((znode *) node),
+ znode_get_rd_key((znode *) node))) {
+ *error = "ldkey is greater than rdkey";
+ read_unlock_dk(tree);
+ read_unlock_tree();
+ return -1;
+ }
+ if (ZF_ISSET(node, JNODE_LEFT_CONNECTED) &&
+ (node->left != NULL) &&
+ !ZF_ISSET(node->left, JNODE_HEARD_BANSHEE) &&
+ ergo(flags & REISER4_NODE_TREE_STABLE,
+ !keyeq(znode_get_rd_key(node->left),
+ znode_get_ld_key((znode *) node)))
+ && ergo(!(flags & REISER4_NODE_TREE_STABLE),
+ keygt(znode_get_rd_key(node->left),
+ znode_get_ld_key((znode *) node)))) {
+ *error = "left rdkey or ldkey is wrong";
+ read_unlock_dk(tree);
+ read_unlock_tree();
+ return -1;
+ }
+ if (ZF_ISSET(node, JNODE_RIGHT_CONNECTED) &&
+ (node->right != NULL) &&
+ !ZF_ISSET(node->right, JNODE_HEARD_BANSHEE) &&
+ ergo(flags & REISER4_NODE_TREE_STABLE,
+ !keyeq(znode_get_rd_key((znode *) node),
+ znode_get_ld_key(node->right)))
+ && ergo(!(flags & REISER4_NODE_TREE_STABLE),
+ keygt(znode_get_rd_key((znode *) node),
+ znode_get_ld_key(node->right)))) {
+ *error = "rdkey or right ldkey is wrong";
+ read_unlock_dk(tree);
+ read_unlock_tree();
+ return -1;
+ }
+
+ read_unlock_dk(tree);
+ read_unlock_tree();
+ }
+
+ return 0;
+}
+
+int parse_node40_common(znode *node, const __u32 magic)
+{
+ node40_header *header;
+ int result;
+ d8 level;
+
+ header = node40_node_header((znode *) node);
+ result = -EIO;
+ level = nh40_get_level(header);
+ if (unlikely(((__u8) znode_get_level(node)) != level))
+ warning("nikita-494", "Wrong level found in node: %i != %i",
+ znode_get_level(node), level);
+ else if (unlikely(nh40_get_magic(header) != magic))
+ warning("nikita-495",
+ "Wrong magic in tree node: want %x, got %x",
+ magic, nh40_get_magic(header));
+ else {
+ node->nr_items = node40_num_of_items_internal(node);
+ result = 0;
+ }
+ return RETERR(result);
+}
+
+/*
+ * plugin->u.node.parse
+ * look for description of this method in plugin/node/node.h
+ */
+int parse_node40(znode *node /* node to parse */)
+{
+ return parse_node40_common(node, REISER4_NODE40_MAGIC);
+}
+
+/*
+ * common part of ->init_node() for all nodes,
+ * which contain node40_header at the beginning
+ */
+int init_node40_common(znode *node, node_plugin *nplug,
+ size_t node_header_size, const __u32 magic)
+{
+ node40_header *header40;
+
+ assert("nikita-570", node != NULL);
+ assert("nikita-572", zdata(node) != NULL);
+
+ header40 = node40_node_header(node);
+ memset(header40, 0, sizeof(node40_header));
+
+ nh40_set_free_space(header40, znode_size(node) - node_header_size);
+ nh40_set_free_space_start(header40, node_header_size);
+ /*
+ * sane hypothesis: 0 in CPU format is 0 in disk format
+ */
+ save_plugin_id(node_plugin_to_plugin(nplug),
+ &header40->common_header.plugin_id);
+ nh40_set_level(header40, znode_get_level(node));
+ nh40_set_magic(header40, magic);
+ nh40_set_mkfs_id(header40, reiser4_mkfs_id(reiser4_get_current_sb(),
+ znode_get_subvol(node)->id));
+ /*
+ * nr_items: 0
+ * flags: 0
+ */
+ return 0;
+}
+
+/*
+ * plugin->u.node.init
+ * look for description of this method in plugin/node/node.h
+ */
+int init_node40(znode *node /* node to initialise */)
+{
+ return init_node40_common(node, node_plugin_by_id(NODE40_ID),
+ sizeof(node40_header), REISER4_NODE40_MAGIC);
+}
+
+#ifdef GUESS_EXISTS
+int guess_node40_common(const znode *node, reiser4_node_id id,
+ const __u32 magic)
+{
+ node40_header *header;
+
+ assert("nikita-1058", node != NULL);
+ header = node40_node_header(node);
+ return (nh40_get_magic(header) == magic) &&
+ (id == plugin_by_disk_id(REISER4_NODE_PLUGIN_TYPE,
+ &header->common_header.plugin_id)->h.id);
+}
+
+int guess_node40(const znode *node /* node to guess plugin of */)
+{
+ return guess_node40_common(node, NODE40_ID, REISER4_NODE40_MAGIC);
+}
+#endif
+
+/* plugin->u.node.change_item_size
+ look for description of this method in plugin/node/node.h */
+void change_item_size_node40(coord_t * coord, int by)
+{
+ node40_header *nh;
+ item_header40 *ih;
+ char *item_data;
+ int item_length;
+ unsigned i;
+
+ /* make sure that @item is coord of existing item */
+ assert("vs-210", coord_is_existing_item(coord));
+
+ nh = node40_node_header(coord->node);
+
+ item_data = item_by_coord_node40(coord);
+ item_length = length_by_coord_node40(coord);
+
+ /* move item bodies */
+ ih = node40_ih_at_coord(coord);
+ memmove(item_data + item_length + by, item_data + item_length,
+ nh40_get_free_space_start(nh) -
+ (ih40_get_offset(ih) + item_length));
+
+ /* update offsets of moved items */
+ for (i = coord->item_pos + 1; i < nh40_get_num_items(nh); i++) {
+ ih = node40_ih_at(coord->node, i);
+ ih40_set_offset(ih, ih40_get_offset(ih) + by);
+ }
+
+ /* update node header */
+ nh40_set_free_space(nh, nh40_get_free_space(nh) - by);
+ nh40_set_free_space_start(nh, nh40_get_free_space_start(nh) + by);
+}
+
+static int should_notify_parent(const znode * node)
+{
+ /* FIXME_JMACD This looks equivalent to znode_is_root(), right? -josh */
+ return !disk_addr_eq(znode_get_block(node),
+ &znode_get_tree(node)->root_block);
+}
+
+/* plugin->u.node.create_item
+ look for description of this method in plugin/node/node.h */
+int
+create_item_node40(coord_t *target, const reiser4_key *key,
+ reiser4_item_data *data, carry_plugin_info *info)
+{
+ node40_header *nh;
+ item_header40 *ih;
+ unsigned offset;
+ unsigned i;
+
+ nh = node40_node_header(target->node);
+
+ assert("vs-212", coord_is_between_items(target));
+ /* node must have enough free space */
+ assert("vs-254",
+ free_space_node40(target->node) >=
+ data->length + sizeof(item_header40));
+ assert("vs-1410", data->length >= 0);
+
+ if (coord_set_to_right(target))
+		/* there are no items to the right of @target, so the new
+		   item will be inserted after the last one */
+ coord_set_item_pos(target, nh40_get_num_items(nh));
+
+ if (target->item_pos < nh40_get_num_items(nh)) {
+ /* there are items to be moved to prepare space for new
+ item */
+ ih = node40_ih_at_coord(target);
+ /* new item will start at this offset */
+ offset = ih40_get_offset(ih);
+
+ memmove(zdata(target->node) + offset + data->length,
+ zdata(target->node) + offset,
+ nh40_get_free_space_start(nh) - offset);
+ /* update headers of moved items */
+ for (i = target->item_pos; i < nh40_get_num_items(nh); i++) {
+ ih = node40_ih_at(target->node, i);
+ ih40_set_offset(ih, ih40_get_offset(ih) + data->length);
+ }
+
+ /* @ih is set to item header of the last item, move item headers */
+ memmove(ih - 1, ih,
+ sizeof(item_header40) * (nh40_get_num_items(nh) -
+ target->item_pos));
+ } else {
+ /* new item will start at this offset */
+ offset = nh40_get_free_space_start(nh);
+ }
+
+ /* make item header for the new item */
+ ih = node40_ih_at_coord(target);
+ memcpy(&ih->key, key, sizeof(reiser4_key));
+ ih40_set_offset(ih, offset);
+ save_plugin_id(item_plugin_to_plugin(data->iplug), &ih->plugin_id);
+
+ /* update node header */
+ nh40_set_free_space(nh,
+ nh40_get_free_space(nh) - data->length -
+ sizeof(item_header40));
+ nh40_set_free_space_start(nh,
+ nh40_get_free_space_start(nh) + data->length);
+ node40_set_num_items(target->node, nh, nh40_get_num_items(nh) + 1);
+
+	/* FIXME: check how create_item works when between is set to BEFORE_UNIT */
+ target->unit_pos = 0;
+ target->between = AT_UNIT;
+ coord_clear_iplug(target);
+
+ /* initialize item */
+ if (data->iplug->b.init != NULL) {
+ data->iplug->b.init(target, NULL, data);
+ }
+ /* copy item body */
+ if (data->iplug->b.paste != NULL) {
+ data->iplug->b.paste(target, data, info);
+ } else if (data->data != NULL) {
+ if (data->user) {
+			/* AUDIT: Should we really not check that the pointer
+			   from userspace is valid and the data bytes are
+			   available? How will we return -EFAULT of some kind
+			   without this check? */
+ assert("nikita-3038", reiser4_schedulable());
+ /* copy data from user space */
+ if (__copy_from_user(zdata(target->node) + offset,
+ (const char __user *)data->data,
+ (unsigned)data->length))
+ return RETERR(-EFAULT);
+ } else
+ /* copy from kernel space */
+ memcpy(zdata(target->node) + offset, data->data,
+ (unsigned)data->length);
+ }
+
+ if (target->item_pos == 0) {
+ /* left delimiting key has to be updated */
+ prepare_for_update(NULL, target->node, info);
+ }
+
+ if (item_plugin_by_coord(target)->b.create_hook != NULL) {
+ item_plugin_by_coord(target)->b.create_hook(target, data->arg);
+ }
+
+ return 0;
+}
+
+/* plugin->u.node.update_item_key
+ look for description of this method in plugin/node/node.h */
+void
+update_item_key_node40(coord_t * target, const reiser4_key * key,
+ carry_plugin_info * info)
+{
+ item_header40 *ih;
+
+ ih = node40_ih_at_coord(target);
+ memcpy(&ih->key, key, sizeof(reiser4_key));
+
+ if (target->item_pos == 0) {
+ prepare_for_update(NULL, target->node, info);
+ }
+}
+
+/* these bits encode the cut mode */
+#define CMODE_TAIL 1
+#define CMODE_WHOLE 2
+#define CMODE_HEAD 4
+
+struct cut40_info {
+ int mode;
+ pos_in_node_t tail_removed; /* position of item which gets tail removed */
+	pos_in_node_t first_removed;	/* position of the leftmost item among items removed completely */
+ pos_in_node_t removed_count; /* number of items removed completely */
+ pos_in_node_t head_removed; /* position of item which gets head removed */
+
+ pos_in_node_t freed_space_start;
+ pos_in_node_t freed_space_end;
+ pos_in_node_t first_moved;
+ pos_in_node_t head_removed_location;
+};
+
+static void init_cinfo(struct cut40_info *cinfo)
+{
+ cinfo->mode = 0;
+ cinfo->tail_removed = MAX_POS_IN_NODE;
+ cinfo->first_removed = MAX_POS_IN_NODE;
+ cinfo->removed_count = MAX_POS_IN_NODE;
+ cinfo->head_removed = MAX_POS_IN_NODE;
+ cinfo->freed_space_start = MAX_POS_IN_NODE;
+ cinfo->freed_space_end = MAX_POS_IN_NODE;
+ cinfo->first_moved = MAX_POS_IN_NODE;
+ cinfo->head_removed_location = MAX_POS_IN_NODE;
+}
+
+/* complete cut_node40/kill_node40 by removing the gap created by the removal */
+static void compact(znode * node, struct cut40_info *cinfo)
+{
+ node40_header *nh;
+ item_header40 *ih;
+ pos_in_node_t freed;
+ pos_in_node_t pos, nr_items;
+
+ assert("vs-1526", (cinfo->freed_space_start != MAX_POS_IN_NODE &&
+ cinfo->freed_space_end != MAX_POS_IN_NODE &&
+ cinfo->first_moved != MAX_POS_IN_NODE));
+ assert("vs-1523", cinfo->freed_space_end >= cinfo->freed_space_start);
+
+ nh = node40_node_header(node);
+ nr_items = nh40_get_num_items(nh);
+
+ /* remove gap made up by removal */
+ memmove(zdata(node) + cinfo->freed_space_start,
+ zdata(node) + cinfo->freed_space_end,
+ nh40_get_free_space_start(nh) - cinfo->freed_space_end);
+
+ /* update item headers of moved items - change their locations */
+ pos = cinfo->first_moved;
+ ih = node40_ih_at(node, pos);
+ if (cinfo->head_removed_location != MAX_POS_IN_NODE) {
+ assert("vs-1580", pos == cinfo->head_removed);
+ ih40_set_offset(ih, cinfo->head_removed_location);
+ pos++;
+ ih--;
+ }
+
+ freed = cinfo->freed_space_end - cinfo->freed_space_start;
+ for (; pos < nr_items; pos++, ih--) {
+ assert("vs-1581", ih == node40_ih_at(node, pos));
+ ih40_set_offset(ih, ih40_get_offset(ih) - freed);
+ }
+
+ /* free space start moved to right */
+ nh40_set_free_space_start(nh, nh40_get_free_space_start(nh) - freed);
+
+ if (cinfo->removed_count != MAX_POS_IN_NODE) {
+ /* number of items changed. Remove item headers of those items */
+ ih = node40_ih_at(node, nr_items - 1);
+ memmove(ih + cinfo->removed_count, ih,
+ sizeof(item_header40) * (nr_items -
+ cinfo->removed_count -
+ cinfo->first_removed));
+ freed += sizeof(item_header40) * cinfo->removed_count;
+ node40_set_num_items(node, nh, nr_items - cinfo->removed_count);
+ }
+
+ /* total amount of free space increased */
+ nh40_set_free_space(nh, nh40_get_free_space(nh) + freed);
+}
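+
+/* Worked example for compact() (hypothetical numbers): suppose item bodies
+   occupy offsets [40, 1040) so free_space_start == 1040, and a cut freed the
+   byte range [400, 500).  The memmove above slides the bodies at [500, 1040)
+   down to [400, 940), every item header after the gap has its offset
+   decreased by 100, free_space_start becomes 940 and free_space grows by 100
+   (plus sizeof(item_header40) for each item that was removed completely,
+   whose header is squeezed out of the header array). */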
+
+int shrink_item_node40(coord_t * coord, int delta)
+{
+ node40_header *nh;
+ item_header40 *ih;
+ pos_in_node_t pos;
+ pos_in_node_t nr_items;
+ char *end;
+ znode *node;
+ int off;
+
+ assert("nikita-3487", coord != NULL);
+ assert("nikita-3488", delta >= 0);
+
+ node = coord->node;
+ nh = node40_node_header(node);
+ nr_items = nh40_get_num_items(nh);
+
+ ih = node40_ih_at_coord(coord);
+ assert("nikita-3489", delta <= length_by_coord_node40(coord));
+ off = ih40_get_offset(ih) + length_by_coord_node40(coord);
+ end = zdata(node) + off;
+
+ /* remove gap made up by removal */
+ memmove(end - delta, end, nh40_get_free_space_start(nh) - off);
+
+ /* update item headers of moved items - change their locations */
+ pos = coord->item_pos + 1;
+ ih = node40_ih_at(node, pos);
+ for (; pos < nr_items; pos++, ih--) {
+ assert("nikita-3490", ih == node40_ih_at(node, pos));
+ ih40_set_offset(ih, ih40_get_offset(ih) - delta);
+ }
+
+ /* free space start moved to left */
+ nh40_set_free_space_start(nh, nh40_get_free_space_start(nh) - delta);
+ /* total amount of free space increased */
+ nh40_set_free_space(nh, nh40_get_free_space(nh) + delta);
+	/*
+	 * This method does _not_ change the number of items. Hence, it cannot
+	 * make the node empty. Also, it doesn't remove items at all, which
+	 * means that no keys have to be updated either.
+	 */
+ return 0;
+}
+
+/*
+ * Evaluate cut mode, if key range has been specified.
+ *
+ * This is for the case when units are not minimal objects
+ * addressed by keys.
+ *
+ * This doesn't work when range contains objects with
+ * non-unique keys (e.g. directory items).
+ */
+static int parse_cut_by_key_range(struct cut40_info *cinfo,
+ const struct cut_kill_params *params)
+{
+ reiser4_key min_from_key, max_to_key;
+ const reiser4_key *from_key = params->from_key;
+ const reiser4_key *to_key = params->to_key;
+ /*
+ * calculate minimal key stored in first item
+ * of items to be cut (params->from)
+ */
+ item_key_by_coord(params->from, &min_from_key);
+ /*
+ * calculate maximal key stored in last item
+ * of items to be cut (params->to)
+ */
+ max_item_key_by_coord(params->to, &max_to_key);
+
+ if (params->from->item_pos == params->to->item_pos) {
+ if (keylt(&min_from_key, from_key)
+ && keylt(to_key, &max_to_key))
+ return 1;
+
+ if (keygt(from_key, &min_from_key)) {
+			/* tail of item is to be cut */
+ cinfo->tail_removed = params->from->item_pos;
+ cinfo->mode |= CMODE_TAIL;
+ } else if (keylt(to_key, &max_to_key)) {
+ /* head of item is to be cut */
+ cinfo->head_removed = params->from->item_pos;
+ cinfo->mode |= CMODE_HEAD;
+ } else {
+ /* item is removed completely */
+ cinfo->first_removed = params->from->item_pos;
+ cinfo->removed_count = 1;
+ cinfo->mode |= CMODE_WHOLE;
+ }
+ } else {
+ cinfo->first_removed = params->from->item_pos + 1;
+ cinfo->removed_count =
+ params->to->item_pos - params->from->item_pos - 1;
+
+ if (keygt(from_key, &min_from_key)) {
+ /* first item is not cut completely */
+ cinfo->tail_removed = params->from->item_pos;
+ cinfo->mode |= CMODE_TAIL;
+ } else {
+ cinfo->first_removed--;
+ cinfo->removed_count++;
+ }
+ if (keylt(to_key, &max_to_key)) {
+ /* last item is not cut completely */
+ cinfo->head_removed = params->to->item_pos;
+ cinfo->mode |= CMODE_HEAD;
+ } else {
+ cinfo->removed_count++;
+ }
+ if (cinfo->removed_count)
+ cinfo->mode |= CMODE_WHOLE;
+ }
+ return 0;
+}
+
+/*
+ * Evaluate cut mode, if the key range hasn't been specified.
+ * In this case the range can include objects with non-unique
+ * keys (e.g. directory entries).
+ *
+ * This doesn't work when units are not the minimal objects
+ * addressed by keys (e.g. bytes in file's body stored in
+ * unformatted nodes).
+ */
+static int parse_cut_by_coord_range(struct cut40_info *cinfo,
+ const struct cut_kill_params *params)
+{
+ coord_t *from = params->from;
+ coord_t *to = params->to;
+
+ if (from->item_pos == to->item_pos) {
+ /*
+ * cut is performed on only one item
+ */
+ if (from->unit_pos > 0 &&
+ to->unit_pos < coord_last_unit_pos(to))
+ /*
+ * cut from the middle of item
+ */
+ return 1;
+ if (from->unit_pos > 0) {
+ /*
+ * tail of item is to be cut
+ */
+ cinfo->tail_removed = params->from->item_pos;
+ cinfo->mode |= CMODE_TAIL;
+ } else if (to->unit_pos < coord_last_unit_pos(to)) {
+ /*
+ * head of item is to be cut
+ */
+ cinfo->head_removed = params->from->item_pos;
+ cinfo->mode |= CMODE_HEAD;
+ } else {
+ /*
+ * item is removed completely
+ */
+ assert("edward-1631",
+ from->unit_pos == 0 &&
+ to->unit_pos == coord_last_unit_pos(to));
+
+ cinfo->first_removed = params->from->item_pos;
+ cinfo->removed_count = 1;
+ cinfo->mode |= CMODE_WHOLE;
+ }
+ } else {
+ cinfo->first_removed = from->item_pos + 1;
+ cinfo->removed_count =
+ to->item_pos - from->item_pos - 1;
+
+ if (from->unit_pos > 0) {
+ /*
+ * first item is not cut completely
+ */
+ cinfo->tail_removed = from->item_pos;
+ cinfo->mode |= CMODE_TAIL;
+ } else {
+ cinfo->first_removed--;
+ cinfo->removed_count++;
+ }
+ if (to->unit_pos < coord_last_unit_pos(to)) {
+ /*
+ * last item is not cut completely
+ */
+ cinfo->head_removed = to->item_pos;
+ cinfo->mode |= CMODE_HEAD;
+ } else {
+ cinfo->removed_count++;
+ }
+ if (cinfo->removed_count)
+ cinfo->mode |= CMODE_WHOLE;
+ }
+ return 0;
+}
+
+/*
+ * This is used by cut_node40 and kill_node40. It analyses the input
+ * parameters and calculates the cut mode. There are 2 types of cut. The
+ * first is when a unit is removed from the middle of an item; in this case
+ * the function returns 1. Everything else fits into the second case: 0 or 1
+ * items getting their tail cut, 0 or more items removed completely, and 0 or
+ * 1 item getting its head cut. The function returns 0 in this case.
+ */
+static int parse_cut(struct cut40_info *cinfo,
+ const struct cut_kill_params *params)
+{
+ init_cinfo(cinfo);
+ if (params->from_key == NULL) {
+ /*
+ * cut key range is not defined in input parameters
+ */
+ assert("vs-1513", params->to_key == NULL);
+ return parse_cut_by_coord_range(cinfo, params);
+ } else
+ return parse_cut_by_key_range(cinfo, params);
+}
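+
+/* Example of the resulting cut mode (illustrative coordinates): cutting from
+   the middle of item 2 (from->unit_pos > 0) through the last unit of item 4
+   yields tail_removed == 2, first_removed == 3, removed_count == 2 and
+   mode == CMODE_TAIL | CMODE_WHOLE, i.e. item 2 loses its tail and items 3
+   and 4 are removed completely; parse_cut() returns 0.  Only a cut strictly
+   inside a single item makes it return 1. */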
+
+static void
+call_kill_hooks(znode * node, pos_in_node_t from, pos_in_node_t count,
+ carry_kill_data * kdata)
+{
+ coord_t coord;
+ item_plugin *iplug;
+ pos_in_node_t pos;
+
+ coord.node = node;
+ coord.unit_pos = 0;
+ coord.between = AT_UNIT;
+ for (pos = 0; pos < count; pos++) {
+ coord_set_item_pos(&coord, from + pos);
+ coord.unit_pos = 0;
+ coord.between = AT_UNIT;
+ iplug = item_plugin_by_coord(&coord);
+ if (iplug->b.kill_hook) {
+ iplug->b.kill_hook(&coord, 0, coord_num_units(&coord),
+ kdata);
+ }
+ }
+}
+
+/* this is used to kill item partially */
+static pos_in_node_t
+kill_units(coord_t * coord, pos_in_node_t from, pos_in_node_t to, void *data,
+ reiser4_key * smallest_removed, reiser4_key * new_first_key)
+{
+ struct carry_kill_data *kdata;
+ item_plugin *iplug;
+
+ kdata = data;
+ iplug = item_plugin_by_coord(coord);
+
+ assert("vs-1524", iplug->b.kill_units);
+ return iplug->b.kill_units(coord, from, to, kdata, smallest_removed,
+ new_first_key);
+}
+
+/* call item plugin to cut tail of item */
+static pos_in_node_t
+kill_tail(coord_t * coord, void *data, reiser4_key * smallest_removed)
+{
+ struct carry_kill_data *kdata;
+ pos_in_node_t to;
+
+ kdata = data;
+ to = coord_last_unit_pos(coord);
+ return kill_units(coord, coord->unit_pos, to, kdata, smallest_removed,
+ NULL);
+}
+
+/* call item plugin to cut head of item */
+static pos_in_node_t
+kill_head(coord_t * coord, void *data, reiser4_key * smallest_removed,
+ reiser4_key * new_first_key)
+{
+ return kill_units(coord, 0, coord->unit_pos, data, smallest_removed,
+ new_first_key);
+}
+
+/* this is used to cut item partially */
+static pos_in_node_t
+cut_units(coord_t * coord, pos_in_node_t from, pos_in_node_t to, void *data,
+ reiser4_key * smallest_removed, reiser4_key * new_first_key)
+{
+ carry_cut_data *cdata;
+ item_plugin *iplug;
+
+ cdata = data;
+ iplug = item_plugin_by_coord(coord);
+ assert("vs-302", iplug->b.cut_units);
+ return iplug->b.cut_units(coord, from, to, cdata, smallest_removed,
+ new_first_key);
+}
+
+/* call item plugin to cut tail of item */
+static pos_in_node_t
+cut_tail(coord_t * coord, void *data, reiser4_key * smallest_removed)
+{
+ carry_cut_data *cdata;
+ pos_in_node_t to;
+
+ cdata = data;
+ to = coord_last_unit_pos(cdata->params.from);
+ return cut_units(coord, coord->unit_pos, to, data, smallest_removed, NULL);
+}
+
+/* call item plugin to cut head of item */
+static pos_in_node_t
+cut_head(coord_t * coord, void *data, reiser4_key * smallest_removed,
+ reiser4_key * new_first_key)
+{
+ return cut_units(coord, 0, coord->unit_pos, data, smallest_removed,
+ new_first_key);
+}
+
+/* this returns 1 if the key of the first item changed, 0 if it did not */
+static int
+prepare_for_compact(struct cut40_info *cinfo,
+ const struct cut_kill_params *params, int is_cut,
+ void *data, carry_plugin_info * info)
+{
+ znode *node;
+ item_header40 *ih;
+ pos_in_node_t freed;
+ pos_in_node_t item_pos;
+ coord_t coord;
+ reiser4_key new_first_key;
+ pos_in_node_t(*kill_units_f) (coord_t *, pos_in_node_t, pos_in_node_t,
+ void *, reiser4_key *, reiser4_key *);
+ pos_in_node_t(*kill_tail_f) (coord_t *, void *, reiser4_key *);
+ pos_in_node_t(*kill_head_f) (coord_t *, void *, reiser4_key *,
+ reiser4_key *);
+ int retval;
+
+ retval = 0;
+
+ node = params->from->node;
+
+ assert("vs-184", node == params->to->node);
+ assert("vs-312", !node_is_empty(node));
+ assert("vs-297",
+ coord_compare(params->from, params->to) != COORD_CMP_ON_RIGHT);
+
+ if (is_cut) {
+ kill_units_f = cut_units;
+ kill_tail_f = cut_tail;
+ kill_head_f = cut_head;
+ } else {
+ kill_units_f = kill_units;
+ kill_tail_f = kill_tail;
+ kill_head_f = kill_head;
+ }
+
+ if (parse_cut(cinfo, params) == 1) {
+ /* cut from the middle of item */
+ freed =
+ kill_units_f(params->from, params->from->unit_pos,
+ params->to->unit_pos, data,
+ params->smallest_removed, NULL);
+
+ item_pos = params->from->item_pos;
+ ih = node40_ih_at(node, item_pos);
+ cinfo->freed_space_start =
+ ih40_get_offset(ih) + node40_item_length(node,
+ item_pos) - freed;
+ cinfo->freed_space_end = cinfo->freed_space_start + freed;
+ cinfo->first_moved = item_pos + 1;
+ } else {
+ assert("vs-1521", (cinfo->tail_removed != MAX_POS_IN_NODE ||
+ cinfo->first_removed != MAX_POS_IN_NODE ||
+ cinfo->head_removed != MAX_POS_IN_NODE));
+
+ switch (cinfo->mode) {
+ case CMODE_TAIL:
+ /* one item gets cut partially from its end */
+ assert("vs-1562",
+ cinfo->tail_removed == params->from->item_pos);
+
+ freed =
+ kill_tail_f(params->from, data,
+ params->smallest_removed);
+
+ item_pos = cinfo->tail_removed;
+ ih = node40_ih_at(node, item_pos);
+ cinfo->freed_space_start =
+ ih40_get_offset(ih) + node40_item_length(node,
+ item_pos) -
+ freed;
+ cinfo->freed_space_end =
+ cinfo->freed_space_start + freed;
+ cinfo->first_moved = cinfo->tail_removed + 1;
+ break;
+
+ case CMODE_WHOLE:
+ /* one or more items get removed completely */
+ assert("vs-1563",
+ cinfo->first_removed == params->from->item_pos);
+ assert("vs-1564", cinfo->removed_count > 0
+ && cinfo->removed_count != MAX_POS_IN_NODE);
+
+ /* call kill hook for all items removed completely */
+ if (is_cut == 0)
+ call_kill_hooks(node, cinfo->first_removed,
+ cinfo->removed_count, data);
+
+ item_pos = cinfo->first_removed;
+ ih = node40_ih_at(node, item_pos);
+
+ if (params->smallest_removed)
+ memcpy(params->smallest_removed, &ih->key,
+ sizeof(reiser4_key));
+
+ cinfo->freed_space_start = ih40_get_offset(ih);
+
+ item_pos += (cinfo->removed_count - 1);
+ ih -= (cinfo->removed_count - 1);
+ cinfo->freed_space_end =
+ ih40_get_offset(ih) + node40_item_length(node,
+ item_pos);
+ cinfo->first_moved = item_pos + 1;
+ if (cinfo->first_removed == 0)
+ /* key of first item of the node changes */
+ retval = 1;
+ break;
+
+ case CMODE_HEAD:
+ /* one item gets cut partially from its head */
+ assert("vs-1565",
+ cinfo->head_removed == params->from->item_pos);
+
+ freed =
+ kill_head_f(params->to, data,
+ params->smallest_removed,
+ &new_first_key);
+
+ item_pos = cinfo->head_removed;
+ ih = node40_ih_at(node, item_pos);
+ cinfo->freed_space_start = ih40_get_offset(ih);
+ cinfo->freed_space_end = ih40_get_offset(ih) + freed;
+ cinfo->first_moved = cinfo->head_removed + 1;
+
+ /* item head is removed, therefore, item key changed */
+ coord.node = node;
+ coord_set_item_pos(&coord, item_pos);
+ coord.unit_pos = 0;
+ coord.between = AT_UNIT;
+ update_item_key_node40(&coord, &new_first_key, NULL);
+ if (item_pos == 0)
+ /* key of first item of the node changes */
+ retval = 1;
+ break;
+
+ case CMODE_TAIL | CMODE_WHOLE:
+ /* one item gets cut from its end and one or more items get removed completely */
+ assert("vs-1566",
+ cinfo->tail_removed == params->from->item_pos);
+ assert("vs-1567",
+ cinfo->first_removed == cinfo->tail_removed + 1);
+ assert("vs-1564", cinfo->removed_count > 0
+ && cinfo->removed_count != MAX_POS_IN_NODE);
+
+ freed =
+ kill_tail_f(params->from, data,
+ params->smallest_removed);
+
+ item_pos = cinfo->tail_removed;
+ ih = node40_ih_at(node, item_pos);
+ cinfo->freed_space_start =
+ ih40_get_offset(ih) + node40_item_length(node,
+ item_pos) -
+ freed;
+
+ /* call kill hook for all items removed completely */
+ if (is_cut == 0)
+ call_kill_hooks(node, cinfo->first_removed,
+ cinfo->removed_count, data);
+
+ item_pos += cinfo->removed_count;
+ ih -= cinfo->removed_count;
+ cinfo->freed_space_end =
+ ih40_get_offset(ih) + node40_item_length(node,
+ item_pos);
+ cinfo->first_moved = item_pos + 1;
+ break;
+
+ case CMODE_WHOLE | CMODE_HEAD:
+ /* one or more items get removed completely and one item gets cut partially from its head */
+ assert("vs-1568",
+ cinfo->first_removed == params->from->item_pos);
+ assert("vs-1564", cinfo->removed_count > 0
+ && cinfo->removed_count != MAX_POS_IN_NODE);
+ assert("vs-1569",
+ cinfo->head_removed ==
+ cinfo->first_removed + cinfo->removed_count);
+
+ /* call kill hook for all items removed completely */
+ if (is_cut == 0)
+ call_kill_hooks(node, cinfo->first_removed,
+ cinfo->removed_count, data);
+
+ item_pos = cinfo->first_removed;
+ ih = node40_ih_at(node, item_pos);
+
+ if (params->smallest_removed)
+ memcpy(params->smallest_removed, &ih->key,
+ sizeof(reiser4_key));
+
+ freed =
+ kill_head_f(params->to, data, NULL, &new_first_key);
+
+ cinfo->freed_space_start = ih40_get_offset(ih);
+
+ ih = node40_ih_at(node, cinfo->head_removed);
+			/* this is the most complex case. The item which got its head removed and the items which
+			   are to be moved intact change their locations differently. */
+ cinfo->freed_space_end = ih40_get_offset(ih) + freed;
+ cinfo->first_moved = cinfo->head_removed;
+ cinfo->head_removed_location = cinfo->freed_space_start;
+
+ /* item head is removed, therefore, item key changed */
+ coord.node = node;
+ coord_set_item_pos(&coord, cinfo->head_removed);
+ coord.unit_pos = 0;
+ coord.between = AT_UNIT;
+ update_item_key_node40(&coord, &new_first_key, NULL);
+
+ assert("vs-1579", cinfo->first_removed == 0);
+ /* key of first item of the node changes */
+ retval = 1;
+ break;
+
+ case CMODE_TAIL | CMODE_HEAD:
+			/* one item gets cut from its end and its neighbor gets cut from its head */
+ impossible("vs-1576", "this can not happen currently");
+ break;
+
+ case CMODE_TAIL | CMODE_WHOLE | CMODE_HEAD:
+ impossible("vs-1577", "this can not happen currently");
+ break;
+ default:
+ impossible("vs-1578", "unexpected cut mode");
+ break;
+ }
+ }
+ return retval;
+}
+
+/* plugin->u.node.kill
+ return value is number of items removed completely */
+int kill_node40(struct carry_kill_data *kdata, carry_plugin_info * info)
+{
+ znode *node;
+ struct cut40_info cinfo;
+ int first_key_changed;
+
+ node = kdata->params.from->node;
+
+ first_key_changed =
+ prepare_for_compact(&cinfo, &kdata->params, 0 /* not cut */ , kdata,
+ info);
+ compact(node, &cinfo);
+
+ if (info) {
+ /* it is not called by node40_shift, so we have to take care
+ of changes on upper levels */
+ if (node_is_empty(node)
+ && !(kdata->flags & DELETE_RETAIN_EMPTY))
+			/* all contents of the node are deleted */
+ prepare_removal_node40(node, info);
+ else if (first_key_changed) {
+ prepare_for_update(NULL, node, info);
+ }
+ }
+
+ coord_clear_iplug(kdata->params.from);
+ coord_clear_iplug(kdata->params.to);
+
+ znode_make_dirty(node);
+ return cinfo.removed_count == MAX_POS_IN_NODE ? 0 : cinfo.removed_count;
+}
+
+/* plugin->u.node.cut
+ return value is number of items removed completely */
+int cut_node40(struct carry_cut_data *cdata, carry_plugin_info * info)
+{
+ znode *node;
+ struct cut40_info cinfo;
+ int first_key_changed;
+
+ node = cdata->params.from->node;
+
+ first_key_changed =
+	    prepare_for_compact(&cinfo, &cdata->params, 1 /* cut */ , cdata,
+ info);
+ compact(node, &cinfo);
+
+ if (info) {
+ /* it is not called by node40_shift, so we have to take care
+ of changes on upper levels */
+ if (node_is_empty(node))
+			/* all contents of the node are deleted */
+ prepare_removal_node40(node, info);
+ else if (first_key_changed) {
+ prepare_for_update(NULL, node, info);
+ }
+ }
+
+ coord_clear_iplug(cdata->params.from);
+ coord_clear_iplug(cdata->params.to);
+
+ znode_make_dirty(node);
+ return cinfo.removed_count == MAX_POS_IN_NODE ? 0 : cinfo.removed_count;
+}
+
+/* this structure is used by shift method of node40 plugin */
+struct shift_params {
+ shift_direction pend; /* when @pend == append - we are shifting to
+ left, when @pend == prepend - to right */
+ coord_t wish_stop; /* when shifting to left this is last unit we
+ want shifted, when shifting to right - this
+ is set to unit we want to start shifting
+ from */
+ znode *target;
+ int everything; /* it is set to 1 if everything we have to shift is
+ shifted, 0 - otherwise */
+
+ /* FIXME-VS: get rid of read_stop */
+
+ /* these are set by estimate_shift */
+ coord_t real_stop; /* this will be set to last unit which will be
+ really shifted */
+
+ /* coordinate in source node before operation of unit which becomes
+ first after shift to left of last after shift to right */
+ union {
+ coord_t future_first;
+ coord_t future_last;
+ } u;
+
+ unsigned merging_units; /* number of units of first item which have to
+ be merged with last item of target node */
+ unsigned merging_bytes; /* number of bytes in those units */
+
+ unsigned entire; /* items shifted in their entirety */
+ unsigned entire_bytes; /* number of bytes in those items */
+
+ unsigned part_units; /* number of units of partially copied item */
+ unsigned part_bytes; /* number of bytes in those units */
+
+ unsigned shift_bytes; /* total number of bytes in items shifted (item
+ headers not included) */
+
+};
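+
+/* Illustrative accounting (hypothetical sizes): when shifting left, if 3
+   units (120 bytes) can be merged into the last item of @target, 2 items
+   (500 bytes) fit entirely, and 1 unit (80 bytes) of the following item can
+   be split off, then merging_units == 3, merging_bytes == 120, entire == 2,
+   entire_bytes == 500, part_units == 1, part_bytes == 80, and
+   shift_bytes == 120 + 500 + 80 == 700 (item headers not included). */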
+
+static int item_creation_overhead(coord_t *item)
+{
+ return node_plugin_by_coord(item)->item_overhead(item->node, NULL);
+}
+
+/* how many units are there in @source starting from source->unit_pos
+ but not further than @stop_coord */
+static int
+wanted_units(coord_t *source, coord_t *stop_coord, shift_direction pend)
+{
+ if (pend == SHIFT_LEFT) {
+ assert("vs-181", source->unit_pos == 0);
+ } else {
+ assert("vs-182",
+ source->unit_pos == coord_last_unit_pos(source));
+ }
+
+ if (source->item_pos != stop_coord->item_pos) {
+ /* @source and @stop_coord are different items */
+ return coord_last_unit_pos(source) + 1;
+ }
+
+ if (pend == SHIFT_LEFT) {
+ return stop_coord->unit_pos + 1;
+ } else {
+ return source->unit_pos - stop_coord->unit_pos + 1;
+ }
+}
+
+/* this calculates what can be copied from @shift->wish_stop.node to
+ @shift->target */
+static void
+estimate_shift(struct shift_params *shift, const reiser4_context * ctx)
+{
+ unsigned target_free_space, size;
+ pos_in_node_t stop_item; /* item which estimating should not consider */
+ unsigned want; /* number of units of item we want shifted */
+ coord_t source; /* item being estimated */
+ item_plugin *iplug;
+
+ /* shifting to left/right starts from first/last units of
+ @shift->wish_stop.node */
+ if (shift->pend == SHIFT_LEFT) {
+ coord_init_first_unit(&source, shift->wish_stop.node);
+ } else {
+ coord_init_last_unit(&source, shift->wish_stop.node);
+ }
+ shift->real_stop = source;
+
+ /* free space in target node and number of items in source */
+ target_free_space = znode_free_space(shift->target);
+
+ shift->everything = 0;
+ if (!node_is_empty(shift->target)) {
+ /* target node is not empty, check for boundary items
+ mergeability */
+ coord_t to;
+
+ /* item we try to merge @source with */
+ if (shift->pend == SHIFT_LEFT) {
+ coord_init_last_unit(&to, shift->target);
+ } else {
+ coord_init_first_unit(&to, shift->target);
+ }
+
+ if ((shift->pend == SHIFT_LEFT) ? are_items_mergeable(&to,
+ &source) :
+ are_items_mergeable(&source, &to)) {
+ /* how many units of @source do we want to merge to
+ item @to */
+ want =
+ wanted_units(&source, &shift->wish_stop,
+ shift->pend);
+
+ /* how many units of @source we can merge to item
+ @to */
+ iplug = item_plugin_by_coord(&source);
+ if (iplug->b.can_shift != NULL)
+ shift->merging_units =
+ iplug->b.can_shift(target_free_space,
+ &source, shift->target,
+ shift->pend, &size,
+ want);
+ else {
+ shift->merging_units = 0;
+ size = 0;
+ }
+ shift->merging_bytes = size;
+ shift->shift_bytes += size;
+ /* update stop coord to be set to last unit of @source
+ we can merge to @target */
+ if (shift->merging_units)
+ /* at least one unit can be shifted */
+ shift->real_stop.unit_pos =
+ (shift->merging_units - source.unit_pos -
+ 1) * shift->pend;
+ else {
+ /* nothing can be shifted */
+ if (shift->pend == SHIFT_LEFT)
+ coord_init_before_first_item(&shift->
+ real_stop,
+ source.
+ node);
+ else
+ coord_init_after_last_item(&shift->
+ real_stop,
+ source.node);
+ }
+ assert("nikita-2081", shift->real_stop.unit_pos + 1);
+
+ if (shift->merging_units != want) {
+ /* we could not copy as many as we want, so,
+ there is no reason for estimating any
+ longer */
+ return;
+ }
+
+ target_free_space -= size;
+ coord_add_item_pos(&source, shift->pend);
+ }
+ }
+
+	/* position of the first item of which nothing is to be shifted */
+ stop_item = shift->wish_stop.item_pos + shift->pend;
+
+ /* calculate how many items can be copied into given free
+ space as whole */
+ for (; source.item_pos != stop_item;
+ coord_add_item_pos(&source, shift->pend)) {
+ if (shift->pend == SHIFT_RIGHT)
+ source.unit_pos = coord_last_unit_pos(&source);
+
+ /* how many units of @source do we want to copy */
+ want = wanted_units(&source, &shift->wish_stop, shift->pend);
+
+ if (want == coord_last_unit_pos(&source) + 1) {
+ /* we want this item to be copied entirely */
+ size =
+ item_length_by_coord(&source) +
+ item_creation_overhead(&source);
+ if (size <= target_free_space) {
+ /* item fits into target node as whole */
+ target_free_space -= size;
+ shift->shift_bytes +=
+ size - item_creation_overhead(&source);
+ shift->entire_bytes +=
+ size - item_creation_overhead(&source);
+ shift->entire++;
+
+ /* update shift->real_stop coord to be set to
+ last unit of @source we can merge to
+ @target */
+ shift->real_stop = source;
+ if (shift->pend == SHIFT_LEFT)
+ shift->real_stop.unit_pos =
+ coord_last_unit_pos(&shift->
+ real_stop);
+ else
+ shift->real_stop.unit_pos = 0;
+ continue;
+ }
+ }
+
+		/* we reach here only for an item which does not fit into the
+		   target node in its entirety. This item may be either
+		   partially shifted, or not shifted at all. We will have to
+		   create a new item in the target node, so decrease the
+		   amount of free space by the item creation overhead. We can
+		   also reach here if the stop coord is in this item */
+ if (target_free_space >=
+ (unsigned)item_creation_overhead(&source)) {
+ target_free_space -= item_creation_overhead(&source);
+ iplug = item_plugin_by_coord(&source);
+ if (iplug->b.can_shift) {
+ shift->part_units = iplug->b.can_shift(target_free_space,
+ &source,
+ NULL, /* target */
+ shift->pend,
+ &size,
+ want);
+ } else {
+ target_free_space = 0;
+ shift->part_units = 0;
+ size = 0;
+ }
+ } else {
+ target_free_space = 0;
+ shift->part_units = 0;
+ size = 0;
+ }
+ shift->part_bytes = size;
+ shift->shift_bytes += size;
+
+ /* set @shift->real_stop to last unit of @source we can merge
+ to @shift->target */
+ if (shift->part_units) {
+ shift->real_stop = source;
+ shift->real_stop.unit_pos =
+ (shift->part_units - source.unit_pos -
+ 1) * shift->pend;
+ assert("nikita-2082", shift->real_stop.unit_pos + 1);
+ }
+
+ if (want != shift->part_units)
+			/* not everything wanted was shifted */
+ return;
+ break;
+ }
+
+ shift->everything = 1;
+}
+
+static void
+copy_units(coord_t * target, coord_t * source, unsigned from, unsigned count,
+ shift_direction dir, unsigned free_space)
+{
+ item_plugin *iplug;
+
+ assert("nikita-1463", target != NULL);
+ assert("nikita-1464", source != NULL);
+ assert("nikita-1465", from + count <= coord_num_units(source));
+
+ iplug = item_plugin_by_coord(source);
+ assert("nikita-1468", iplug == item_plugin_by_coord(target));
+ iplug->b.copy_units(target, source, from, count, dir, free_space);
+
+ if (dir == SHIFT_RIGHT) {
+		/* FIXME-VS: this looks unnecessary: update_item_key was
+		   already called by the copy_units method */
+ reiser4_key split_key;
+
+ assert("nikita-1469", target->unit_pos == 0);
+
+ unit_key_by_coord(target, &split_key);
+ node_plugin_by_coord(target)->update_item_key(target,
+ &split_key, NULL);
+ }
+}
+
+/* copy part of @shift->real_stop.node starting either from its beginning or
+ from its end and ending at @shift->real_stop to either the end or the
+ beginning of @shift->target */
+static void copy(struct shift_params *shift, size_t node_header_size)
+{
+ node40_header *nh;
+ coord_t from;
+ coord_t to;
+ item_header40 *from_ih, *to_ih;
+ int free_space_start;
+ int new_items;
+ unsigned old_items;
+ int old_offset;
+ unsigned i;
+
+ nh = node40_node_header(shift->target);
+ free_space_start = nh40_get_free_space_start(nh);
+ old_items = nh40_get_num_items(nh);
+ new_items = shift->entire + (shift->part_units ? 1 : 0);
+ assert("vs-185",
+ shift->shift_bytes ==
+ shift->merging_bytes + shift->entire_bytes + shift->part_bytes);
+
+ from = shift->wish_stop;
+
+ coord_init_first_unit(&to, shift->target);
+
+ /* NOTE:NIKITA->VS not sure what I am doing: shift->target is empty,
+ hence to.between is set to EMPTY_NODE above. Looks like we want it
+ to be AT_UNIT.
+
+ Oh, wonders of ->betweeness...
+
+ */
+ to.between = AT_UNIT;
+
+ if (shift->pend == SHIFT_LEFT) {
+ /* copying to left */
+
+ coord_set_item_pos(&from, 0);
+ from_ih = node40_ih_at(from.node, 0);
+
+ coord_set_item_pos(&to,
+ node40_num_of_items_internal(to.node) - 1);
+ if (shift->merging_units) {
+ /* expand last item, so that plugin methods will see
+ correct data */
+ free_space_start += shift->merging_bytes;
+ nh40_set_free_space_start(nh,
+ (unsigned)free_space_start);
+ nh40_set_free_space(nh,
+ nh40_get_free_space(nh) -
+ shift->merging_bytes);
+
+ /* appending last item of @target */
+ copy_units(&to, &from, 0, /* starting from 0-th unit */
+ shift->merging_units, SHIFT_LEFT,
+ shift->merging_bytes);
+ coord_inc_item_pos(&from);
+ from_ih--;
+ coord_inc_item_pos(&to);
+ }
+
+ to_ih = node40_ih_at(shift->target, old_items);
+ if (shift->entire) {
+ /* copy @entire items entirely */
+
+ /* copy item headers */
+ memcpy(to_ih - shift->entire + 1,
+ from_ih - shift->entire + 1,
+ shift->entire * sizeof(item_header40));
+ /* update item header offset */
+ old_offset = ih40_get_offset(from_ih);
+ /* AUDIT: Looks like if we calculate old_offset + free_space_start here instead of just old_offset, we can perform one "add" operation less per each iteration */
+ for (i = 0; i < shift->entire; i++, to_ih--, from_ih--)
+ ih40_set_offset(to_ih,
+ ih40_get_offset(from_ih) -
+ old_offset + free_space_start);
+
+ /* copy item bodies */
+ memcpy(zdata(shift->target) + free_space_start, zdata(from.node) + old_offset, /*ih40_get_offset (from_ih), */
+ shift->entire_bytes);
+
+ coord_add_item_pos(&from, (int)shift->entire);
+ coord_add_item_pos(&to, (int)shift->entire);
+ }
+
+ nh40_set_free_space_start(nh,
+ free_space_start +
+ shift->shift_bytes -
+ shift->merging_bytes);
+ nh40_set_free_space(nh,
+ nh40_get_free_space(nh) -
+ (shift->shift_bytes - shift->merging_bytes +
+ sizeof(item_header40) * new_items));
+
+ /* update node header */
+ node40_set_num_items(shift->target, nh, old_items + new_items);
+ assert("vs-170",
+ nh40_get_free_space(nh) < znode_size(shift->target));
+
+ if (shift->part_units) {
+ /* copy heading part (@part units) of @source item as
+ a new item into @target->node */
+
+ /* copy item header of partially copied item */
+ coord_set_item_pos(&to,
+ node40_num_of_items_internal(to.node)
+ - 1);
+ memcpy(to_ih, from_ih, sizeof(item_header40));
+ ih40_set_offset(to_ih,
+ nh40_get_free_space_start(nh) -
+ shift->part_bytes);
+ if (item_plugin_by_coord(&to)->b.init)
+ item_plugin_by_coord(&to)->b.init(&to, &from,
+ NULL);
+ copy_units(&to, &from, 0, shift->part_units, SHIFT_LEFT,
+ shift->part_bytes);
+ }
+
+ } else {
+ /* copying to right */
+
+ coord_set_item_pos(&from,
+ node40_num_of_items_internal(from.node) - 1);
+ from_ih = node40_ih_at_coord(&from);
+
+ coord_set_item_pos(&to, 0);
+
+ /* prepare space for new items */
+ memmove(zdata(to.node) + node_header_size +
+ shift->shift_bytes,
+ zdata(to.node) + node_header_size,
+ free_space_start - node_header_size);
+ /* update item headers of moved items */
+ to_ih = node40_ih_at(to.node, 0);
+ /* first item gets @merging_bytes longer. free space appears
+ at its beginning */
+ if (!node_is_empty(to.node))
+ ih40_set_offset(to_ih,
+ ih40_get_offset(to_ih) +
+ shift->shift_bytes -
+ shift->merging_bytes);
+
+ for (i = 1; i < old_items; i++)
+ ih40_set_offset(to_ih - i,
+ ih40_get_offset(to_ih - i) +
+ shift->shift_bytes);
+
+ /* move item headers to make space for new items */
+ memmove(to_ih - old_items + 1 - new_items,
+ to_ih - old_items + 1,
+ sizeof(item_header40) * old_items);
+ to_ih -= (new_items - 1);
+
+ nh40_set_free_space_start(nh,
+ free_space_start +
+ shift->shift_bytes);
+ nh40_set_free_space(nh,
+ nh40_get_free_space(nh) -
+ (shift->shift_bytes +
+ sizeof(item_header40) * new_items));
+
+ /* update node header */
+ node40_set_num_items(shift->target, nh, old_items + new_items);
+ assert("vs-170",
+ nh40_get_free_space(nh) < znode_size(shift->target));
+
+ if (shift->merging_units) {
+ coord_add_item_pos(&to, new_items);
+ to.unit_pos = 0;
+ to.between = AT_UNIT;
+ /* prepend first item of @to */
+ copy_units(&to, &from,
+ coord_last_unit_pos(&from) -
+ shift->merging_units + 1,
+ shift->merging_units, SHIFT_RIGHT,
+ shift->merging_bytes);
+ coord_dec_item_pos(&from);
+ from_ih++;
+ }
+
+ if (shift->entire) {
+ /* copy @entire items entirely */
+
+ /* copy item headers */
+ memcpy(to_ih, from_ih,
+ shift->entire * sizeof(item_header40));
+
+ /* update item header offset */
+ old_offset =
+ ih40_get_offset(from_ih + shift->entire - 1);
+ /* AUDIT: old_offset + sizeof (node40_header) + shift->part_bytes calculation can be taken off the loop. */
+ for (i = 0; i < shift->entire; i++, to_ih++, from_ih++)
+ ih40_set_offset(to_ih,
+ ih40_get_offset(from_ih) -
+ old_offset +
+ node_header_size +
+ shift->part_bytes);
+ /* copy item bodies */
+ coord_add_item_pos(&from, -(int)(shift->entire - 1));
+ memcpy(zdata(to.node) + node_header_size +
+ shift->part_bytes, item_by_coord_node40(&from),
+ shift->entire_bytes);
+ coord_dec_item_pos(&from);
+ }
+
+ if (shift->part_units) {
+ coord_set_item_pos(&to, 0);
+ to.unit_pos = 0;
+ to.between = AT_UNIT;
+ /* copy heading part (@part units) of @source item as
+ a new item into @target->node */
+
+ /* copy item header of partially copied item */
+ memcpy(to_ih, from_ih, sizeof(item_header40));
+ ih40_set_offset(to_ih, node_header_size);
+ if (item_plugin_by_coord(&to)->b.init)
+ item_plugin_by_coord(&to)->b.init(&to, &from,
+ NULL);
+ copy_units(&to, &from,
+ coord_last_unit_pos(&from) -
+ shift->part_units + 1, shift->part_units,
+ SHIFT_RIGHT, shift->part_bytes);
+ }
+ }
+}
+
+/* remove everything either before or after @shift->real_stop. The number of
+   items removed completely is returned */
+static int delete_copied(struct shift_params *shift)
+{
+ coord_t from;
+ coord_t to;
+ struct carry_cut_data cdata;
+
+ if (shift->pend == SHIFT_LEFT) {
+		/* we were shifting to the left: remove everything from the
+		   beginning of @shift->real_stop.node up to
+		   @shift->real_stop */
+ coord_init_first_unit(&from, shift->real_stop.node);
+ to = shift->real_stop;
+
+ /* store old coordinate of unit which will be first after
+ shift to left */
+ shift->u.future_first = to;
+ coord_next_unit(&shift->u.future_first);
+ } else {
+		/* we were shifting to the right: remove everything from
+		   @shift->real_stop up to the end of
+		   @shift->real_stop.node */
+ from = shift->real_stop;
+ coord_init_last_unit(&to, from.node);
+
+ /* store old coordinate of unit which will be last after
+ shift to right */
+ shift->u.future_last = from;
+ coord_prev_unit(&shift->u.future_last);
+ }
+
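+	/* cut the selected range by coordinates; keys and smallest_removed
+	   are not needed here */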
+ cdata.params.from = &from;
+ cdata.params.to = &to;
+ cdata.params.from_key = NULL;
+ cdata.params.to_key = NULL;
+ cdata.params.smallest_removed = NULL;
+ return cut_node40(&cdata, NULL);
+}
+
+/* something was moved between @left and @right. Add a carry operation to the
+   @info list so that carry will update the delimiting key between them */
+static int
+prepare_for_update(znode * left, znode * right, carry_plugin_info * info)
+{
+ carry_op *op;
+ carry_node *cn;
+
+ if (info == NULL)
+ /* nowhere to send operation to. */
+ return 0;
+
+ if (!should_notify_parent(right))
+ return 0;
+
+ op = node_post_carry(info, COP_UPDATE, right, 1);
+ if (IS_ERR(op) || op == NULL)
+ return op ? PTR_ERR(op) : -EIO;
+
+ if (left != NULL) {
+ carry_node *reference;
+
+ if (info->doing)
+ reference = insert_carry_node(info->doing,
+ info->todo, left);
+ else
+ reference = op->node;
+ assert("nikita-2992", reference != NULL);
+ cn = reiser4_add_carry(info->todo, POOLO_BEFORE, reference);
+ if (IS_ERR(cn))
+ return PTR_ERR(cn);
+ cn->parent = 1;
+ cn->node = left;
+ if (ZF_ISSET(left, JNODE_ORPHAN))
+ cn->left_before = 1;
+ op->u.update.left = cn;
+ } else
+ op->u.update.left = NULL;
+ return 0;
+}
+
+/* plugin->u.node.prepare_removal
+   to delete a pointer to @empty from the tree, add a corresponding carry
+   operation (COP_DELETE) to the @info list */
+int prepare_removal_node40(znode * empty, carry_plugin_info * info)
+{
+ carry_op *op;
+ reiser4_tree *tree;
+
+ if (!should_notify_parent(empty))
+ return 0;
+ /* already on a road to Styx */
+ if (ZF_ISSET(empty, JNODE_HEARD_BANSHEE))
+ return 0;
+ op = node_post_carry(info, COP_DELETE, empty, 1);
+ if (IS_ERR(op) || op == NULL)
+ return RETERR(op ? PTR_ERR(op) : -EIO);
+
+ op->u.delete.child = NULL;
+ op->u.delete.flags = 0;
+
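+	/* collapse the key range of @empty: its left delimiting key becomes
+	   equal to its right one, and the left neighbor (if any) inherits
+	   the freed range */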
+ /* fare thee well */
+ tree = znode_get_tree(empty);
+ read_lock_tree();
+ write_lock_dk(tree);
+ znode_set_ld_key(empty, znode_get_rd_key(empty));
+ if (znode_is_left_connected(empty) && empty->left)
+ znode_set_rd_key(empty->left, znode_get_rd_key(empty));
+ write_unlock_dk(tree);
+ read_unlock_tree();
+
+ ZF_SET(empty, JNODE_HEARD_BANSHEE);
+ return 0;
+}
+
+/* something was shifted from @insert_coord->node to @shift->target, update
+   @insert_coord correspondingly */
+static void
+adjust_coord(coord_t * insert_coord, struct shift_params *shift, int removed,
+ int including_insert_coord)
+{
+ /* item plugin was invalidated by shifting */
+ coord_clear_iplug(insert_coord);
+
+ if (node_is_empty(shift->wish_stop.node)) {
+ assert("vs-242", shift->everything);
+ if (including_insert_coord) {
+ if (shift->pend == SHIFT_RIGHT) {
+ /* set @insert_coord before first unit of
+ @shift->target node */
+ coord_init_before_first_item(insert_coord,
+ shift->target);
+ } else {
+ /* set @insert_coord after last in target node */
+ coord_init_after_last_item(insert_coord,
+ shift->target);
+ }
+ } else {
+ /* set @insert_coord inside of empty node. There is
+ only one possible coord within an empty
+ node. init_first_unit will set that coord */
+ coord_init_first_unit(insert_coord,
+ shift->wish_stop.node);
+ }
+ return;
+ }
+
+ if (shift->pend == SHIFT_RIGHT) {
+ /* there was shifting to right */
+ if (shift->everything) {
+ /* everything wanted was shifted */
+ if (including_insert_coord) {
+ /* @insert_coord is set before first unit of
+ @to node */
+ coord_init_before_first_item(insert_coord,
+ shift->target);
+ insert_coord->between = BEFORE_UNIT;
+ } else {
+ /* @insert_coord is set after last unit of
+ @insert->node */
+ coord_init_last_unit(insert_coord,
+ shift->wish_stop.node);
+ insert_coord->between = AFTER_UNIT;
+ }
+ }
+ return;
+ }
+
+ /* there was shifting to left */
+ if (shift->everything) {
+ /* everything wanted was shifted */
+ if (including_insert_coord) {
+ /* @insert_coord is set after last unit in @to node */
+ coord_init_after_last_item(insert_coord, shift->target);
+ } else {
+ /* @insert_coord is set before first unit in the same
+ node */
+ coord_init_before_first_item(insert_coord,
+ shift->wish_stop.node);
+ }
+ return;
+ }
+
+ /* FIXME-VS: the code below is complicated because with between ==
+ AFTER_ITEM unit_pos is set to 0 */
+
+ if (!removed) {
+ /* no items were shifted entirely */
+ assert("vs-195", shift->merging_units == 0
+ || shift->part_units == 0);
+
+ if (shift->real_stop.item_pos == insert_coord->item_pos) {
+ if (shift->merging_units) {
+ if (insert_coord->between == AFTER_UNIT) {
+ assert("nikita-1441",
+ insert_coord->unit_pos >=
+ shift->merging_units);
+ insert_coord->unit_pos -=
+ shift->merging_units;
+ } else if (insert_coord->between == BEFORE_UNIT) {
+ assert("nikita-2090",
+ insert_coord->unit_pos >
+ shift->merging_units);
+ insert_coord->unit_pos -=
+ shift->merging_units;
+ }
+
+ assert("nikita-2083",
+ insert_coord->unit_pos + 1);
+ } else {
+ if (insert_coord->between == AFTER_UNIT) {
+ assert("nikita-1442",
+ insert_coord->unit_pos >=
+ shift->part_units);
+ insert_coord->unit_pos -=
+ shift->part_units;
+ } else if (insert_coord->between == BEFORE_UNIT) {
+ assert("nikita-2089",
+ insert_coord->unit_pos >
+ shift->part_units);
+ insert_coord->unit_pos -=
+ shift->part_units;
+ }
+
+ assert("nikita-2084",
+ insert_coord->unit_pos + 1);
+ }
+ }
+ return;
+ }
+
+	/* we shifted to the left and there was not enough space for everything */
+ switch (insert_coord->between) {
+ case AFTER_UNIT:
+ case BEFORE_UNIT:
+ if (shift->real_stop.item_pos == insert_coord->item_pos)
+ insert_coord->unit_pos -= shift->part_units;
+ /* fall through */
+ case AFTER_ITEM:
+ coord_add_item_pos(insert_coord, -removed);
+ break;
+ default:
+ impossible("nikita-2087", "not ready");
+ }
+ assert("nikita-2085", insert_coord->unit_pos + 1);
+}
+
+static int call_shift_hooks(struct shift_params *shift)
+{
+ unsigned i, shifted;
+ coord_t coord;
+ item_plugin *iplug;
+
+ assert("vs-275", !node_is_empty(shift->target));
+
+ /* number of items shift touches */
+ shifted =
+ shift->entire + (shift->merging_units ? 1 : 0) +
+ (shift->part_units ? 1 : 0);
+
+ if (shift->pend == SHIFT_LEFT) {
+ /* moved items are at the end */
+ coord_init_last_unit(&coord, shift->target);
+ coord.unit_pos = 0;
+
+ assert("vs-279", shift->pend == 1);
+ for (i = 0; i < shifted; i++) {
+ unsigned from, count;
+
+ iplug = item_plugin_by_coord(&coord);
+ if (i == 0 && shift->part_units) {
+ assert("vs-277",
+ coord_num_units(&coord) ==
+ shift->part_units);
+ count = shift->part_units;
+ from = 0;
+ } else if (i == shifted - 1 && shift->merging_units) {
+ count = shift->merging_units;
+ from = coord_num_units(&coord) - count;
+ } else {
+ count = coord_num_units(&coord);
+ from = 0;
+ }
+
+ if (iplug->b.shift_hook) {
+ iplug->b.shift_hook(&coord, from, count,
+ shift->wish_stop.node);
+ }
+ coord_add_item_pos(&coord, -shift->pend);
+ }
+ } else {
+ /* moved items are at the beginning */
+ coord_init_first_unit(&coord, shift->target);
+
+ assert("vs-278", shift->pend == -1);
+ for (i = 0; i < shifted; i++) {
+ unsigned from, count;
+
+ iplug = item_plugin_by_coord(&coord);
+ if (i == 0 && shift->part_units) {
+ assert("vs-277",
+ coord_num_units(&coord) ==
+ shift->part_units);
+ count = coord_num_units(&coord);
+ from = 0;
+ } else if (i == shifted - 1 && shift->merging_units) {
+ count = shift->merging_units;
+ from = 0;
+ } else {
+ count = coord_num_units(&coord);
+ from = 0;
+ }
+
+ if (iplug->b.shift_hook) {
+ iplug->b.shift_hook(&coord, from, count,
+ shift->wish_stop.node);
+ }
+ coord_add_item_pos(&coord, -shift->pend);
+ }
+ }
+
+ return 0;
+}
+
+/* shift to left is completed. Return 1 if unit @old was moved to left neighbor */
+static int
+unit_moved_left(const struct shift_params *shift, const coord_t * old)
+{
+ assert("vs-944", shift->real_stop.node == old->node);
+
+ if (shift->real_stop.item_pos < old->item_pos)
+ return 0;
+ if (shift->real_stop.item_pos == old->item_pos) {
+ if (shift->real_stop.unit_pos < old->unit_pos)
+ return 0;
+ }
+ return 1;
+}
+
+/* shift to right is completed. Return 1 if unit @old was moved to right
+ neighbor */
+static int
+unit_moved_right(const struct shift_params *shift, const coord_t * old)
+{
+ assert("vs-944", shift->real_stop.node == old->node);
+
+ if (shift->real_stop.item_pos > old->item_pos)
+ return 0;
+ if (shift->real_stop.item_pos == old->item_pos) {
+ if (shift->real_stop.unit_pos > old->unit_pos)
+ return 0;
+ }
+ return 1;
+}
+
+/* coord @old was set in the node from which the shift was performed. What was
+   shifted is stored in @shift. Update @old according to the performed shift */
+static coord_t *adjust_coord2(const struct shift_params *shift,
+ const coord_t * old, coord_t * new)
+{
+ coord_clear_iplug(new);
+ new->between = old->between;
+
+ coord_clear_iplug(new);
+ if (old->node == shift->target) {
+ if (shift->pend == SHIFT_LEFT) {
+ /* coord which is set inside of left neighbor does not
+ change during shift to left */
+ coord_dup(new, old);
+ return new;
+ }
+ new->node = old->node;
+ coord_set_item_pos(new,
+ old->item_pos + shift->entire +
+ (shift->part_units ? 1 : 0));
+ new->unit_pos = old->unit_pos;
+ if (old->item_pos == 0 && shift->merging_units)
+ new->unit_pos += shift->merging_units;
+ return new;
+ }
+
+ assert("vs-977", old->node == shift->wish_stop.node);
+ if (shift->pend == SHIFT_LEFT) {
+ if (unit_moved_left(shift, old)) {
+ /* unit @old moved to left neighbor. Calculate its
+ coordinate there */
+ new->node = shift->target;
+ coord_set_item_pos(new,
+ node_num_items(shift->target) -
+ shift->entire -
+ (shift->part_units ? 1 : 0) +
+ old->item_pos);
+
+ new->unit_pos = old->unit_pos;
+ if (shift->merging_units) {
+ coord_dec_item_pos(new);
+ if (old->item_pos == 0) {
+ /* unit_pos only changes if item got
+ merged */
+ new->unit_pos =
+ coord_num_units(new) -
+ (shift->merging_units -
+ old->unit_pos);
+ }
+ }
+ } else {
+ /* unit @old did not move to left neighbor.
+
+ Use _nocheck, because @old is outside of its node.
+ */
+ coord_dup_nocheck(new, old);
+ coord_add_item_pos(new,
+ -shift->u.future_first.item_pos);
+ if (new->item_pos == 0)
+ new->unit_pos -= shift->u.future_first.unit_pos;
+ }
+ } else {
+ if (unit_moved_right(shift, old)) {
+ /* unit @old moved to right neighbor */
+ new->node = shift->target;
+ coord_set_item_pos(new,
+ old->item_pos -
+ shift->real_stop.item_pos);
+ if (new->item_pos == 0) {
+ /* unit @old might change unit pos */
+ coord_set_item_pos(new,
+ old->unit_pos -
+ shift->real_stop.unit_pos);
+ }
+ } else {
+ /* unit @old did not move to right neighbor, therefore
+ it did not change */
+ coord_dup(new, old);
+ }
+ }
+ coord_set_iplug(new, item_plugin_by_coord(new));
+ return new;
+}
+
+/* this is called when a shift is completed (part of the source node has been
+   copied to the target and deleted in the source) to update all taps set in
+   the current context */
+static void update_taps(const struct shift_params *shift)
+{
+ tap_t *tap;
+ coord_t new;
+
+ for_all_taps(tap) {
+ /* update only taps set to nodes participating in shift */
+ if (tap->coord->node == shift->wish_stop.node
+ || tap->coord->node == shift->target)
+ tap_to_coord(tap,
+ adjust_coord2(shift, tap->coord, &new));
+ }
+}
+
+#if REISER4_DEBUG
+
+struct shift_check {
+ reiser4_key key;
+ __u16 plugin_id;
+ union {
+ __u64 bytes;
+ __u64 entries;
+ void *unused;
+ } u;
+};
+
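+/*
+ * Record the key, plugin id and size of every item in @left and @right
+ * (counting a mergeable boundary pair as a single item), so that
+ * shift_check() can verify that a shift preserved all of them.
+ */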
+void *shift_check_prepare(const znode * left, const znode * right)
+{
+ pos_in_node_t i, nr_items;
+ int mergeable;
+ struct shift_check *data;
+ item_header40 *ih;
+
+ if (node_is_empty(left) || node_is_empty(right))
+ mergeable = 0;
+ else {
+ coord_t l, r;
+
+ coord_init_last_unit(&l, left);
+ coord_init_first_unit(&r, right);
+ mergeable = are_items_mergeable(&l, &r);
+ }
+ nr_items =
+ node40_num_of_items_internal(left) +
+ node40_num_of_items_internal(right) - (mergeable ? 1 : 0);
+ data =
+ kmalloc(sizeof(struct shift_check) * nr_items,
+ reiser4_ctx_gfp_mask_get());
+ if (data != NULL) {
+ coord_t coord;
+ pos_in_node_t item_pos;
+
+ coord_init_first_unit(&coord, left);
+ i = 0;
+
+ for (item_pos = 0;
+ item_pos < node40_num_of_items_internal(left);
+ item_pos++) {
+
+ coord_set_item_pos(&coord, item_pos);
+ ih = node40_ih_at_coord(&coord);
+
+ data[i].key = ih->key;
+ data[i].plugin_id = le16_to_cpu(get_unaligned(&ih->plugin_id));
+ switch (data[i].plugin_id) {
+ case CTAIL_ID:
+ case FORMATTING_ID:
+ data[i].u.bytes = coord_num_units(&coord);
+ break;
+ case EXTENT40_POINTER_ID:
+ case EXTENT41_POINTER_ID:
+ data[i].u.bytes =
+ reiser4_extent_size(&coord);
+ break;
+ case COMPOUND_DIR_ID:
+ data[i].u.entries = coord_num_units(&coord);
+ break;
+ default:
+ data[i].u.unused = NULL;
+ break;
+ }
+ i++;
+ }
+
+ coord_init_first_unit(&coord, right);
+
+ if (mergeable) {
+ assert("vs-1609", i != 0);
+
+ ih = node40_ih_at_coord(&coord);
+
+ assert("vs-1589",
+ data[i - 1].plugin_id ==
+ le16_to_cpu(get_unaligned(&ih->plugin_id)));
+ switch (data[i - 1].plugin_id) {
+ case CTAIL_ID:
+ case FORMATTING_ID:
+ data[i - 1].u.bytes += coord_num_units(&coord);
+ break;
+ case EXTENT40_POINTER_ID:
+ case EXTENT41_POINTER_ID:
+ data[i - 1].u.bytes +=
+ reiser4_extent_size(&coord);
+ break;
+ case COMPOUND_DIR_ID:
+ data[i - 1].u.entries +=
+ coord_num_units(&coord);
+ break;
+ default:
+ impossible("vs-1605", "wrong mergeable item");
+ break;
+ }
+ item_pos = 1;
+ } else
+ item_pos = 0;
+ for (; item_pos < node40_num_of_items_internal(right);
+ item_pos++) {
+
+ assert("vs-1604", i < nr_items);
+ coord_set_item_pos(&coord, item_pos);
+ ih = node40_ih_at_coord(&coord);
+
+ data[i].key = ih->key;
+ data[i].plugin_id = le16_to_cpu(get_unaligned(&ih->plugin_id));
+ switch (data[i].plugin_id) {
+ case CTAIL_ID:
+ case FORMATTING_ID:
+ data[i].u.bytes = coord_num_units(&coord);
+ break;
+ case EXTENT40_POINTER_ID:
+ case EXTENT41_POINTER_ID:
+ data[i].u.bytes =
+ reiser4_extent_size(&coord);
+ break;
+ case COMPOUND_DIR_ID:
+ data[i].u.entries = coord_num_units(&coord);
+ break;
+ default:
+ data[i].u.unused = NULL;
+ break;
+ }
+ i++;
+ }
+ assert("vs-1606", i == nr_items);
+ }
+ return data;
+}
+
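+/*
+ * Counterpart of shift_check_prepare(): walk @left and @right after a shift
+ * and assert that every recorded item is still present with the same key,
+ * plugin id and size (allowing for a merged boundary pair).
+ */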
+void shift_check(void *vp, const znode * left, const znode * right)
+{
+ pos_in_node_t i, nr_items;
+ coord_t coord;
+ __u64 last_bytes;
+ int mergeable;
+ item_header40 *ih;
+ pos_in_node_t item_pos;
+ struct shift_check *data;
+
+ data = (struct shift_check *)vp;
+
+ if (data == NULL)
+ return;
+
+ if (node_is_empty(left) || node_is_empty(right))
+ mergeable = 0;
+ else {
+ coord_t l, r;
+
+ coord_init_last_unit(&l, left);
+ coord_init_first_unit(&r, right);
+ mergeable = are_items_mergeable(&l, &r);
+ }
+
+ nr_items =
+ node40_num_of_items_internal(left) +
+ node40_num_of_items_internal(right) - (mergeable ? 1 : 0);
+
+ i = 0;
+ last_bytes = 0;
+
+ coord_init_first_unit(&coord, left);
+
+ for (item_pos = 0; item_pos < node40_num_of_items_internal(left);
+ item_pos++) {
+
+ coord_set_item_pos(&coord, item_pos);
+ ih = node40_ih_at_coord(&coord);
+
+ assert("vs-1611", i == item_pos);
+ assert("vs-1590", keyeq(&ih->key, &data[i].key));
+ assert("vs-1591",
+ le16_to_cpu(get_unaligned(&ih->plugin_id)) == data[i].plugin_id);
+ if ((i < (node40_num_of_items_internal(left) - 1))
+ || !mergeable) {
+ switch (data[i].plugin_id) {
+ case CTAIL_ID:
+ case FORMATTING_ID:
+ assert("vs-1592",
+ data[i].u.bytes ==
+ coord_num_units(&coord));
+ break;
+ case EXTENT40_POINTER_ID:
+ case EXTENT41_POINTER_ID:
+ assert("vs-1593",
+ data[i].u.bytes ==
+ reiser4_extent_size(&coord));
+ break;
+ case COMPOUND_DIR_ID:
+ assert("vs-1594",
+ data[i].u.entries ==
+ coord_num_units(&coord));
+ break;
+ default:
+ break;
+ }
+ }
+ if (item_pos == (node40_num_of_items_internal(left) - 1)
+ && mergeable) {
+ switch (data[i].plugin_id) {
+ case CTAIL_ID:
+ case FORMATTING_ID:
+ last_bytes = coord_num_units(&coord);
+ break;
+ case EXTENT40_POINTER_ID:
+ case EXTENT41_POINTER_ID:
+ last_bytes = reiser4_extent_size(&coord);
+ break;
+ case COMPOUND_DIR_ID:
+ last_bytes = coord_num_units(&coord);
+ break;
+ default:
+ impossible("vs-1595", "wrong mergeable item");
+ break;
+ }
+ }
+ i++;
+ }
+
+ coord_init_first_unit(&coord, right);
+ if (mergeable) {
+ ih = node40_ih_at_coord(&coord);
+
+ assert("vs-1589",
+ data[i - 1].plugin_id == le16_to_cpu(get_unaligned(&ih->plugin_id)));
+ assert("vs-1608", last_bytes != 0);
+ switch (data[i - 1].plugin_id) {
+ case CTAIL_ID:
+ case FORMATTING_ID:
+ assert("vs-1596",
+ data[i - 1].u.bytes ==
+ last_bytes + coord_num_units(&coord));
+ break;
+
+ case EXTENT40_POINTER_ID:
+ case EXTENT41_POINTER_ID:
+ assert("vs-1597",
+ data[i - 1].u.bytes ==
+ last_bytes + reiser4_extent_size(&coord));
+ break;
+
+ case COMPOUND_DIR_ID:
+ assert("vs-1598",
+ data[i - 1].u.bytes ==
+ last_bytes + coord_num_units(&coord));
+ break;
+ default:
+ impossible("vs-1599", "wrong mergeable item");
+ break;
+ }
+ item_pos = 1;
+ } else
+ item_pos = 0;
+
+ for (; item_pos < node40_num_of_items_internal(right); item_pos++) {
+
+ coord_set_item_pos(&coord, item_pos);
+ ih = node40_ih_at_coord(&coord);
+
+ assert("vs-1612", keyeq(&ih->key, &data[i].key));
+ assert("vs-1613",
+ le16_to_cpu(get_unaligned(&ih->plugin_id)) == data[i].plugin_id);
+ switch (data[i].plugin_id) {
+ case CTAIL_ID:
+ case FORMATTING_ID:
+ assert("vs-1600",
+ data[i].u.bytes == coord_num_units(&coord));
+ break;
+ case EXTENT40_POINTER_ID:
+ case EXTENT41_POINTER_ID:
+ assert("vs-1601",
+ data[i].u.bytes == reiser4_extent_size(&coord));
+ break;
+ case COMPOUND_DIR_ID:
+ assert("vs-1602",
+ data[i].u.entries == coord_num_units(&coord));
+ break;
+ default:
+ break;
+ }
+ i++;
+ }
+
+ assert("vs-1603", i == nr_items);
+ kfree(data);
+}
+
+#endif
+
+/*
+ * common part of ->shift() for all nodes,
+ * which contain node40_header at the beginning and
+ * the table of item headers at the end
+ */
+int shift_node40_common(coord_t *from, znode *to,
+ shift_direction pend,
+ int delete_child, /* if @from->node becomes empty,
+ * it will be deleted from the
+ * tree if this is set to 1 */
+ int including_stop_coord,
+ carry_plugin_info *info,
+ size_t node_header_size)
+{
+ struct shift_params shift;
+ int result;
+ znode *left, *right;
+ znode *source;
+ int target_empty;
+
+ assert("nikita-2161", coord_check(from));
+
+ memset(&shift, 0, sizeof(shift));
+ shift.pend = pend;
+ shift.wish_stop = *from;
+ shift.target = to;
+
+ assert("nikita-1473", znode_is_write_locked(from->node));
+ assert("nikita-1474", znode_is_write_locked(to));
+
+ source = from->node;
+
+ /* set @shift.wish_stop to rightmost/leftmost unit among units we want
+ shifted */
+ if (pend == SHIFT_LEFT) {
+ result = coord_set_to_left(&shift.wish_stop);
+ left = to;
+ right = from->node;
+ } else {
+ result = coord_set_to_right(&shift.wish_stop);
+ left = from->node;
+ right = to;
+ }
+
+ if (result) {
+ /* move insertion coord even if there is nothing to move */
+ if (including_stop_coord) {
+ /* move insertion coord (@from) */
+ if (pend == SHIFT_LEFT) {
+ /* after last item in target node */
+ coord_init_after_last_item(from, to);
+ } else {
+ /* before first item in target node */
+ coord_init_before_first_item(from, to);
+ }
+ }
+
+ if (delete_child && node_is_empty(shift.wish_stop.node))
+ result =
+ prepare_removal_node40(shift.wish_stop.node, info);
+ else
+ result = 0;
+ /* there is nothing to shift */
+ assert("nikita-2078", coord_check(from));
+ return result;
+ }
+
+ target_empty = node_is_empty(to);
+
+ /* when first node plugin with item body compression is implemented,
+ this must be changed to call node specific plugin */
+
+ /* shift->stop_coord is updated to last unit which really will be
+ shifted */
+ estimate_shift(&shift, get_current_context());
+ if (!shift.shift_bytes) {
+ /* we could not shift anything */
+ assert("nikita-2079", coord_check(from));
+ return 0;
+ }
+
+ copy(&shift, node_header_size);
+
+ /* result value of this is important. It is used by adjust_coord below */
+ result = delete_copied(&shift);
+
+ assert("vs-1610", result >= 0);
+ assert("vs-1471",
+ ((reiser4_context *) current->journal_info)->magic ==
+ context_magic);
+
+	/* an item which has been moved from one node to another might want to
+	   do something on that event. This can be done by the item's
+	   shift_hook method, which will now be called for every moved item */
+ call_shift_hooks(&shift);
+
+ assert("vs-1472",
+ ((reiser4_context *) current->journal_info)->magic ==
+ context_magic);
+
+ update_taps(&shift);
+
+ assert("vs-1473",
+ ((reiser4_context *) current->journal_info)->magic ==
+ context_magic);
+
+ /* adjust @from pointer in accordance with @including_stop_coord flag
+ and amount of data which was really shifted */
+ adjust_coord(from, &shift, result, including_stop_coord);
+
+ if (target_empty)
+ /*
+ * items were shifted into empty node. Update delimiting key.
+ */
+ result = prepare_for_update(NULL, left, info);
+
+ /* add update operation to @info, which is the list of operations to
+ be performed on a higher level */
+ result = prepare_for_update(left, right, info);
+ if (!result && node_is_empty(source) && delete_child) {
+		/* the whole contents of @from->node was moved to @to and
+		   @from->node has to be removed from the tree, so, on the
+		   higher level we will remove the pointer to @from->node */
+ result = prepare_removal_node40(source, info);
+ }
+ assert("nikita-2080", coord_check(from));
+ return result ? result : (int)shift.shift_bytes;
+}
+
+/*
+ * plugin->u.node.shift
+ * look for description of this method in plugin/node/node.h
+ */
+int shift_node40(coord_t *from, znode *to,
+ shift_direction pend,
+ int delete_child, /* if @from->node becomes empty,
+ * it will be deleted from the
+ * tree if this is set to 1 */
+ int including_stop_coord,
+ carry_plugin_info *info)
+{
+ return shift_node40_common(from, to, pend, delete_child,
+ including_stop_coord, info,
+ sizeof(node40_header));
+}
+
+/* plugin->u.node.fast_insert()
+ look for description of this method in plugin/node/node.h */
+int fast_insert_node40(const coord_t * coord UNUSED_ARG /* node to query */ )
+{
+ return 1;
+}
+
+/* plugin->u.node.fast_paste()
+ look for description of this method in plugin/node/node.h */
+int fast_paste_node40(const coord_t * coord UNUSED_ARG /* node to query */ )
+{
+ return 1;
+}
+
+/* plugin->u.node.fast_cut()
+ look for description of this method in plugin/node/node.h */
+int fast_cut_node40(const coord_t * coord UNUSED_ARG /* node to query */ )
+{
+ return 1;
+}
+
+/* plugin->u.node.modify - not defined */
+
+/* plugin->u.node.max_item_size */
+int max_item_size_node40(void)
+{
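+	/* an item body can occupy at most the whole block minus the node
+	   header and one item header */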
+ return reiser4_get_current_sb()->s_blocksize - sizeof(node40_header) -
+ sizeof(item_header40);
+}
+
+/* plugin->u.node.set_item_plugin */
+int set_item_plugin_node40(coord_t *coord, item_id id)
+{
+ item_header40 *ih;
+
+ ih = node40_ih_at_coord(coord);
+ put_unaligned(cpu_to_le16(id), &ih->plugin_id);
+ coord->iplugid = id;
+ return 0;
+}
+
+/*
+ * Merge neighboring items @left and @right located on the same node.
+ * As a result, the number of items in the node is always decremented.
+ */
+void merge_items_node40(coord_t *left, coord_t *right)
+{
+ znode *node;
+ node40_header *nh;
+ item_header40 *ih;
+ size_t freed = 0;
+#if REISER4_DEBUG
+ //const char *error;
+ int units_before_merge;
+
+ assert("edward-2077", left->node == right->node);
+ assert("edward-2078", coord_is_existing_item(left));
+ assert("edward-2079", coord_is_existing_item(right));
+ assert("edward-2080", right->item_pos == left->item_pos + 1);
+ assert("edward-2081",
+ plugin_by_coord_node40(left) ==
+ plugin_by_coord_node40(right));
+ assert("edward-2082", are_items_mergeable(left, right));
+
+ units_before_merge = coord_num_units(left) + coord_num_units(right);
+#endif
+ node = left->node;
+ nh = node40_node_header(node);
+ /*
+ * Try to merge units at the junction. It may release some space.
+ */
+ if (plugin_by_coord_node40(left)->b.merge_units)
+ freed = plugin_by_coord_node40(left)->b.merge_units(left,
+ right);
+ if (freed && nh40_get_num_items(nh) > right->item_pos + 1) {
+ /*
+ * Move bodies of all items at the right of @right to the left
+ */
+ int i;
+ char *tail;
+ size_t tail_size;
+
+ ih = node40_ih_at(node, right->item_pos + 1);
+ tail = zdata(node) + ih40_get_offset(ih);
+ tail_size = nh40_get_free_space_start(nh) - ih40_get_offset(ih);
+ memmove(tail - freed, tail, tail_size);
+ /*
+ * Update offsets of moved items
+ */
+ for (i = right->item_pos + 1;
+ i < nh40_get_num_items(nh); i++) {
+ ih = node40_ih_at(node, i);
+ ih40_set_offset(ih, ih40_get_offset(ih) - freed);
+ }
+ }
+	/*
+	 * Remove all records about @right from the node.
+	 *
+	 * Move all item headers located to the left of @right's header
+	 * one position to the right.
+	 */
+ ih = node40_ih_at(node, nh40_get_num_items(nh) - 1);
+ memmove(ih + 1, ih, sizeof(item_header40) *
+ (nh40_get_num_items(nh) - (right->item_pos + 1)));
+ /*
+ * update_node_header
+ */
+ nh40_set_free_space(nh, nh40_get_free_space(nh) + freed +
+ sizeof(item_header40));
+ nh40_set_free_space_start(nh, nh40_get_free_space_start(nh) - freed);
+ node40_set_num_items(node, nh, nh40_get_num_items(nh) - 1);
+#if 0
+ assert("edward-2083",
+ check_node40(node, REISER4_NODE_TREE_STABLE, &error) == 0);
+#endif
+	assert("edward-2133", coord_num_units(left) ==
+	       (freed ? units_before_merge - 1 : units_before_merge));
+}
+
+/*
+ Local variables:
+ c-indentation-style: "K&R"
+ mode-name: "LC"
+ c-basic-offset: 8
+ tab-width: 8
+ fill-column: 120
+ scroll-step: 1
+ End:
+*/
diff -urN --no-dereference linux-5.10.2.orig/fs/reiser4/plugin/node/node40.h linux-5.10.2/fs/reiser4/plugin/node/node40.h
--- linux-5.10.2.orig/fs/reiser4/plugin/node/node40.h 1970-01-01 01:00:00.000000000 +0100
+++ linux-5.10.2/fs/reiser4/plugin/node/node40.h 2020-12-23 16:07:46.130813304 +0100
@@ -0,0 +1,131 @@
+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
+
+#if !defined( __REISER4_NODE40_H__ )
+#define __REISER4_NODE40_H__
+
+#include "../../forward.h"
+#include "../../dformat.h"
+#include "node.h"
+
+#include <linux/types.h>
+
+/* format of node header for 40 node layouts. Keep bloat out of this struct. */
+typedef struct node40_header {
+ /* identifier of node plugin. Must be located at the very beginning
+ of a node. */
+ common_node_header common_header; /* this is 16 bits */
+ /* number of items. Should be first element in the node header,
+ because we haven't yet finally decided whether it shouldn't go into
+ common_header.
+ */
+/* NIKITA-FIXME-HANS: Create a macro such that if there is only one
+ * node format at compile time, and it is this one, accesses do not function dereference when
+ * accessing these fields (and otherwise they do). Probably 80% of users will only have one node format at a time throughout the life of reiser4. */
+ d16 nr_items;
+ /* free space in node measured in bytes */
+ d16 free_space;
+ /* offset to start of free space in node */
+ d16 free_space_start;
+ /* for reiser4_fsck. When information about what is a free
+ block is corrupted, and we try to recover everything even
+ if marked as freed, then old versions of data may
+ duplicate newer versions, and this field allows us to
+ restore the newer version. Also useful for when users
+ who don't have the new trashcan installed on their linux distro
+ delete the wrong files and send us desperate emails
+ offering $25 for them back. */
+
+ /* magic field we need to tell formatted nodes NIKITA-FIXME-HANS: improve this comment */
+ d32 magic;
+ /* flushstamp is made of mk_id and write_counter. mk_id is an
+ id generated randomly at mkreiserfs time. So we can just
+ skip all nodes with different mk_id. write_counter is d64
+ incrementing counter of writes on disk. It is used for
+ choosing the newest data at fsck time. NIKITA-FIXME-HANS: why was field name changed but not comment? */
+
+ d32 mkfs_id;
+ d64 flush_id;
+ /* node flags to be used by fsck (reiser4ck or reiser4fsck?)
+ and repacker NIKITA-FIXME-HANS: say more or reference elsewhere that says more */
+ d16 flags;
+
+ /* 1 is leaf level, 2 is twig level, root is the numerically
+ largest level */
+ d8 level;
+
+ d8 pad;
+} PACKED node40_header;
+
+/* item headers are not standard across all node layouts, pass
+ pos_in_node to functions instead */
+typedef struct item_header40 {
+ /* key of item */
+ /* 0 */ reiser4_key key;
+ /* offset from start of a node measured in 8-byte chunks */
+ /* 24 */ d16 offset;
+ /* 26 */ d16 flags;
+ /* 28 */ d16 plugin_id;
+} PACKED item_header40;
+
+size_t item_overhead_node40(const znode * node, flow_t * aflow);
+size_t free_space_node40(znode * node);
+node_search_result lookup_node40(znode * node, const reiser4_key * key,
+ lookup_bias bias, coord_t * coord);
+int num_of_items_node40(const znode * node);
+char *item_by_coord_node40(const coord_t * coord);
+int length_by_coord_node40(const coord_t * coord);
+item_plugin *plugin_by_coord_node40(const coord_t * coord);
+reiser4_key *key_at_node40(const coord_t * coord, reiser4_key * key);
+size_t estimate_node40(znode * node);
+int check_node40(const znode * node, __u32 flags, const char **error);
+int parse_node40_common(znode *node, const __u32 magic);
+int parse_node40(znode * node);
+int init_node40_common(znode *node, node_plugin *nplug,
+ size_t node_header_size, const __u32 magic);
+int init_node40(znode *node);
+
+#ifdef GUESS_EXISTS
+int guess_node40_common(const znode *node, reiser4_node_id id,
+ const __u32 magic);
+int guess_node40(const znode *node);
+#endif
+
+void change_item_size_node40(coord_t * coord, int by);
+int create_item_node40(coord_t * target, const reiser4_key * key,
+ reiser4_item_data * data, carry_plugin_info * info);
+void update_item_key_node40(coord_t * target, const reiser4_key * key,
+ carry_plugin_info * info);
+int kill_node40(struct carry_kill_data *, carry_plugin_info *);
+int cut_node40(struct carry_cut_data *, carry_plugin_info *);
+int shift_node40_common(coord_t *from, znode *to, shift_direction pend,
+ int delete_child, int including_stop_coord,
+ carry_plugin_info *info, size_t nh_size);
+int shift_node40(coord_t *from, znode *to, shift_direction pend,
+ int delete_child, int including_stop_coord,
+ carry_plugin_info *info);
+void merge_items_node40(coord_t *left, coord_t *right);
+int fast_insert_node40(const coord_t * coord);
+int fast_paste_node40(const coord_t * coord);
+int fast_cut_node40(const coord_t * coord);
+int max_item_size_node40(void);
+int prepare_removal_node40(znode * empty, carry_plugin_info * info);
+int set_item_plugin_node40(coord_t * coord, item_id id);
+int shrink_item_node40(coord_t * coord, int delta);
+
+#if REISER4_DEBUG
+void *shift_check_prepare(const znode *left, const znode *right);
+void shift_check(void *vp, const znode *left, const znode *right);
+#endif
+
+/* __REISER4_NODE40_H__ */
+#endif
+/*
+ Local variables:
+ c-indentation-style: "K&R"
+ mode-name: "LC"
+ c-basic-offset: 8
+ tab-width: 8
+ fill-column: 120
+ scroll-step: 1
+ End:
+*/
diff -urN --no-dereference linux-5.10.2.orig/fs/reiser4/plugin/node/node41.c linux-5.10.2/fs/reiser4/plugin/node/node41.c
--- linux-5.10.2.orig/fs/reiser4/plugin/node/node41.c 1970-01-01 01:00:00.000000000 +0100
+++ linux-5.10.2/fs/reiser4/plugin/node/node41.c 2020-12-23 16:07:46.130813304 +0100
@@ -0,0 +1,138 @@
+/*
+ * Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README
+ */
+
+#include "../../debug.h"
+#include "../../key.h"
+#include "../../coord.h"
+#include "../plugin_header.h"
+#include "../item/item.h"
+#include "node.h"
+#include "node41.h"
+#include "../plugin.h"
+#include "../../jnode.h"
+#include "../../znode.h"
+#include "../../pool.h"
+#include "../../carry.h"
+#include "../../tap.h"
+#include "../../tree.h"
+#include "../../super.h"
+#include "../../checksum.h"
+#include "../../reiser4.h"
+
+#include <asm/uaccess.h>
+#include <linux/types.h>
+#include <linux/prefetch.h>
+
+/*
+ * node41 layout is almost the same as node40:
+ * node41_header is at the beginning and a table of item headers
+ * is at the end. The difference is that node41_header contains
+ * a 32-bit checksum (see node41.h)
+ */
+
+static const __u32 REISER4_NODE41_MAGIC = 0x19051966;
+
+static inline node41_header *node41_node_header(const znode *node)
+{
+ assert("edward-1634", node != NULL);
+ assert("edward-1635", znode_page(node) != NULL);
+ assert("edward-1636", zdata(node) != NULL);
+
+ return (node41_header *)zdata(node);
+}
+
+int csum_node41(znode *node, int check)
+{
+ __u32 cpu_csum;
+
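+	/*
+	 * The checksum covers the whole block except the csum field itself:
+	 * first the node40 part of the header, then everything that follows
+	 * node41_header up to the end of the block.
+	 */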
+ cpu_csum = reiser4_crc32c(get_current_super_private()->csum_tfm,
+ ~0,
+ zdata(node),
+ sizeof(struct node40_header));
+ cpu_csum = reiser4_crc32c(get_current_super_private()->csum_tfm,
+ cpu_csum,
+ zdata(node) + sizeof(struct node41_header),
+ reiser4_get_current_sb()->s_blocksize -
+ sizeof(node41_header));
+ if (check)
+ return cpu_csum == nh41_get_csum(node41_node_header(node));
+ else {
+ nh41_set_csum(node41_node_header(node), cpu_csum);
+ return 1;
+ }
+}
+
+/*
+ * plugin->u.node.parse
+ * look for description of this method in plugin/node/node.h
+ */
+int parse_node41(znode *node /* node to parse */)
+{
+ int ret;
+
+ ret = csum_node41(node, 1/* check */);
+ if (!ret) {
+ warning("edward-1645",
+ "block %llu (%s): bad checksum. Please, scrub the volume.",
+ *jnode_get_block(ZJNODE(node)),
+ ZJNODE(node)->subvol->name);
+
+ return RETERR(-EIO);
+ }
+ return parse_node40_common(node, REISER4_NODE41_MAGIC);
+}
+
+/*
+ * plugin->u.node.init
+ * look for description of this method in plugin/node/node.h
+ */
+int init_node41(znode *node /* node to initialise */)
+{
+ return init_node40_common(node, node_plugin_by_id(NODE41_ID),
+ sizeof(node41_header), REISER4_NODE41_MAGIC);
+}
+
+/*
+ * plugin->u.node.shift
+ * look for description of this method in plugin/node/node.h
+ */
+int shift_node41(coord_t *from, znode *to,
+ shift_direction pend,
+ int delete_child, /* if @from->node becomes empty,
+ * it will be deleted from the
+ * tree if this is set to 1 */
+ int including_stop_coord,
+ carry_plugin_info *info)
+{
+ return shift_node40_common(from, to, pend, delete_child,
+ including_stop_coord, info,
+ sizeof(node41_header));
+}
+
+#ifdef GUESS_EXISTS
+int guess_node41(const znode *node /* node to guess plugin of */)
+{
+ return guess_node40_common(node, NODE41_ID, REISER4_NODE41_MAGIC);
+}
+#endif
+
+/*
+ * plugin->u.node.max_item_size
+ */
+int max_item_size_node41(void)
+{
+ return reiser4_get_current_sb()->s_blocksize - sizeof(node41_header) -
+ sizeof(item_header40);
+}
+
+/*
+ Local variables:
+ c-indentation-style: "K&R"
+ mode-name: "LC"
+ c-basic-offset: 8
+ tab-width: 8
+ fill-column: 80
+ scroll-step: 1
+ End:
+*/
diff -urN --no-dereference linux-5.10.2.orig/fs/reiser4/plugin/node/node41.h linux-5.10.2/fs/reiser4/plugin/node/node41.h
--- linux-5.10.2.orig/fs/reiser4/plugin/node/node41.h 1970-01-01 01:00:00.000000000 +0100
+++ linux-5.10.2/fs/reiser4/plugin/node/node41.h 2020-12-23 16:07:46.130813304 +0100
@@ -0,0 +1,50 @@
+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
+
+#if !defined( __REISER4_NODE41_H__ )
+#define __REISER4_NODE41_H__
+
+#include "../../forward.h"
+#include "../../dformat.h"
+#include "node40.h"
+#include <linux/types.h>
+
+/*
+ * node41 layout: the same as node40, but with 32-bit checksum
+ */
+
+typedef struct node41_header {
+ node40_header head;
+ d32 csum;
+} PACKED node41_header;
+
+/*
+ * functions to get/set fields of node41_header
+ */
+#define nh41_get_csum(nh) le32_to_cpu(get_unaligned(&(nh)->csum))
+#define nh41_set_csum(nh, value) put_unaligned(cpu_to_le32(value), &(nh)->csum)
+
+int init_node41(znode * node);
+int parse_node41(znode *node);
+int max_item_size_node41(void);
+int shift_node41(coord_t *from, znode *to, shift_direction pend,
+ int delete_child, int including_stop_coord,
+ carry_plugin_info *info);
+int csum_node41(znode *node, int check);
+
+#ifdef GUESS_EXISTS
+int guess_node41(const znode * node);
+#endif
+extern void reiser4_handle_error(void);
+
+/* __REISER4_NODE41_H__ */
+#endif
+/*
+ Local variables:
+ c-indentation-style: "K&R"
+ mode-name: "LC"
+ c-basic-offset: 8
+ tab-width: 8
+ fill-column: 80
+ scroll-step: 1
+ End:
+*/
diff -urN --no-dereference linux-5.10.2.orig/fs/reiser4/plugin/node/node.c linux-5.10.2/fs/reiser4/plugin/node/node.c
--- linux-5.10.2.orig/fs/reiser4/plugin/node/node.c 1970-01-01 01:00:00.000000000 +0100
+++ linux-5.10.2/fs/reiser4/plugin/node/node.c 2020-12-23 16:07:46.130813304 +0100
@@ -0,0 +1,172 @@
+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
+
+/* Node plugin interface.
+
+ Description: The tree provides the abstraction of flows, which it
+ internally fragments into items which it stores in nodes.
+
+ A key_atom is a piece of data bound to a single key.
+
+ For reasonable space efficiency to be achieved it is often
+ necessary to store key_atoms in the nodes in the form of items, where
+ an item is a sequence of key_atoms of the same or similar type. It is
+ more space-efficient, because the item can implement (very)
+ efficient compression of key_atom's bodies using internal knowledge
+ about their semantics, and it can often avoid having a key for each
+ key_atom. Each type of item has specific operations implemented by its
+ item handler (see balance.c).
+
+ Rationale: the rest of the code (specifically balancing routines)
+ accesses leaf level nodes through this interface. This way we can
+ implement various block layouts and even combine various layouts
+ within the same tree. Balancing/allocating algorithms should not
+ care about peculiarities of splitting/merging specific item types,
+ but rather should leave that to the item's item handler.
+
+ Items, including those that provide the abstraction of flows, have
+ the property that if you move them in part or in whole to another
+ node, the balancing code invokes their is_left_mergeable()
+ item_operation to determine if they are mergeable with their new
+ neighbor in the node you have moved them to. For some items the
+   is_left_mergeable() function always returns false.
+
+ When moving the bodies of items from one node to another:
+
+ if a partial item is shifted to another node the balancing code invokes
+ an item handler method to handle the item splitting.
+
+ if the balancing code needs to merge with an item in the node it
+ is shifting to, it will invoke an item handler method to handle
+ the item merging.
+
+   if it needs to move whole item bodies unchanged, the balancing code uses memcpy(),
+   adjusting the item headers after the move is done using the node handler.
+*/
+
+#include "../../forward.h"
+#include "../../debug.h"
+#include "../../key.h"
+#include "../../coord.h"
+#include "../plugin_header.h"
+#include "../item/item.h"
+#include "node.h"
+#include "../plugin.h"
+#include "../../znode.h"
+#include "../../tree.h"
+#include "../../super.h"
+#include "../../reiser4.h"
+
+/**
+ * leftmost_key_in_node - get the smallest key in node
+ * @node:
+ * @key: store result here
+ *
+ * Stores the leftmost key of @node in @key.
+ */
+reiser4_key *leftmost_key_in_node(const znode *node, reiser4_key *key)
+{
+ assert("nikita-1634", node != NULL);
+ assert("nikita-1635", key != NULL);
+
+ if (!node_is_empty(node)) {
+ coord_t first_item;
+
+ coord_init_first_unit(&first_item, (znode *) node);
+ item_key_by_coord(&first_item, key);
+ } else
+ *key = *reiser4_max_key();
+ return key;
+}
+
+node_plugin node_plugins[LAST_NODE_ID] = {
+ [NODE40_ID] = {
+ .h = {
+ .type_id = REISER4_NODE_PLUGIN_TYPE,
+ .id = NODE40_ID,
+ .pops = NULL,
+ .label = "unified",
+ .desc = "unified node layout",
+ .linkage = {NULL, NULL}
+ },
+ .item_overhead = item_overhead_node40,
+ .free_space = free_space_node40,
+ .lookup = lookup_node40,
+ .num_of_items = num_of_items_node40,
+ .item_by_coord = item_by_coord_node40,
+ .length_by_coord = length_by_coord_node40,
+ .plugin_by_coord = plugin_by_coord_node40,
+ .key_at = key_at_node40,
+ .estimate = estimate_node40,
+ .check = check_node40,
+ .parse = parse_node40,
+ .init = init_node40,
+#ifdef GUESS_EXISTS
+ .guess = guess_node40,
+#endif
+ .change_item_size = change_item_size_node40,
+ .create_item = create_item_node40,
+ .merge_items = merge_items_node40,
+ .update_item_key = update_item_key_node40,
+ .cut_and_kill = kill_node40,
+ .cut = cut_node40,
+ .shift = shift_node40,
+ .shrink_item = shrink_item_node40,
+ .fast_insert = fast_insert_node40,
+ .fast_paste = fast_paste_node40,
+ .fast_cut = fast_cut_node40,
+ .max_item_size = max_item_size_node40,
+ .prepare_removal = prepare_removal_node40,
+ .set_item_plugin = set_item_plugin_node40
+ },
+ [NODE41_ID] = {
+ .h = {
+ .type_id = REISER4_NODE_PLUGIN_TYPE,
+ .id = NODE41_ID,
+ .pops = NULL,
+ .label = "node41",
+ .desc = "node41 layout",
+ .linkage = {NULL, NULL}
+ },
+ .item_overhead = item_overhead_node40,
+ .free_space = free_space_node40,
+ .lookup = lookup_node40,
+ .num_of_items = num_of_items_node40,
+ .item_by_coord = item_by_coord_node40,
+ .length_by_coord = length_by_coord_node40,
+ .plugin_by_coord = plugin_by_coord_node40,
+ .key_at = key_at_node40,
+ .estimate = estimate_node40,
+ .check = NULL,
+ .parse = parse_node41,
+ .init = init_node41,
+#ifdef GUESS_EXISTS
+ .guess = guess_node41,
+#endif
+ .change_item_size = change_item_size_node40,
+ .create_item = create_item_node40,
+ .merge_items = merge_items_node40,
+ .update_item_key = update_item_key_node40,
+ .cut_and_kill = kill_node40,
+ .cut = cut_node40,
+ .shift = shift_node41,
+ .shrink_item = shrink_item_node40,
+ .fast_insert = fast_insert_node40,
+ .fast_paste = fast_paste_node40,
+ .fast_cut = fast_cut_node40,
+ .max_item_size = max_item_size_node41,
+ .prepare_removal = prepare_removal_node40,
+ .set_item_plugin = set_item_plugin_node40,
+ .csum = csum_node41
+ }
+};
+
+/*
+ Local variables:
+ c-indentation-style: "K&R"
+ mode-name: "LC"
+ c-basic-offset: 8
+ tab-width: 8
+ fill-column: 120
+ scroll-step: 1
+ End:
+*/
diff -urN --no-dereference linux-5.10.2.orig/fs/reiser4/plugin/node/node.h linux-5.10.2/fs/reiser4/plugin/node/node.h
--- linux-5.10.2.orig/fs/reiser4/plugin/node/node.h 1970-01-01 01:00:00.000000000 +0100
+++ linux-5.10.2/fs/reiser4/plugin/node/node.h 2020-12-23 16:07:46.130813304 +0100
@@ -0,0 +1,282 @@
+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
+
+/* We need a definition of the default node layout here. */
+
+/* Generally speaking, it is best to have free space in the middle of the
+ node so that two sets of things can grow towards it, and to have the
+ item bodies on the left so that the last one of them grows into free
+ space. We optimize for the case where we append new items to the end
+ of the node, or grow the last item, because it hurts nothing to so
+ optimize and it is a common special case to do massive insertions in
+   increasing key order (and one of the cases where a real user is more
+   likely to notice the delay).
+
+ formatted leaf default layout: (leaf1)
+
+ |node header:item bodies:free space:key + pluginid + item offset|
+
+ We grow towards the middle, optimizing layout for the case where we
+ append new items to the end of the node. The node header is fixed
+ length. Keys, and item offsets plus pluginids for the items
+ corresponding to them are in increasing key order, and are fixed
+ length. Item offsets are relative to start of node (16 bits creating
+ a node size limit of 64k, 12 bits might be a better choice....). Item
+ bodies are in decreasing key order. Item bodies have a variable size.
+ There is a one to one to one mapping of keys to item offsets to item
+ bodies. Item offsets consist of pointers to the zeroth byte of the
+ item body. Item length equals the start of the next item minus the
+ start of this item, except the zeroth item whose length equals the end
+ of the node minus the start of that item (plus a byte). In other
+ words, the item length is not recorded anywhere, and it does not need
+ to be since it is computable.
+
+ Leaf variable length items and keys layout : (lvar)
+
+ |node header:key offset + item offset + pluginid triplets:free space:key bodies:item bodies|
+
+ We grow towards the middle, optimizing layout for the case where we
+ append new items to the end of the node. The node header is fixed
+ length. Keys and item offsets for the items corresponding to them are
+ in increasing key order, and keys are variable length. Item offsets
+ are relative to start of node (16 bits). Item bodies are in
+ decreasing key order. Item bodies have a variable size. There is a
+ one to one to one mapping of keys to item offsets to item bodies.
+ Item offsets consist of pointers to the zeroth byte of the item body.
+ Item length equals the start of the next item's key minus the start of
+ this item, except the zeroth item whose length equals the end of the
+ node minus the start of that item (plus a byte).
+
+ leaf compressed keys layout: (lcomp)
+
+ |node header:key offset + key inherit + item offset pairs:free space:key bodies:item bodies|
+
+ We grow towards the middle, optimizing layout for the case where we
+ append new items to the end of the node. The node header is fixed
+ length. Keys and item offsets for the items corresponding to them are
+ in increasing key order, and keys are variable length. The "key
+ inherit" field indicates how much of the key prefix is identical to
+ the previous key (stem compression as described in "Managing
+ Gigabytes" is used). key_inherit is a one byte integer. The
+ intra-node searches performed through this layout are linear searches,
+ and this is theorized to not hurt performance much due to the high
+ cost of processor stalls on modern CPUs, and the small number of keys
+ in a single node. Item offsets are relative to start of node (16
+ bits). Item bodies are in decreasing key order. Item bodies have a
+ variable size. There is a one to one to one mapping of keys to item
+ offsets to item bodies. Item offsets consist of pointers to the
+ zeroth byte of the item body. Item length equals the start of the
+ next item minus the start of this item, except the zeroth item whose
+ length equals the end of the node minus the start of that item (plus a
+ byte). In other words, item length and key length is not recorded
+ anywhere, and it does not need to be since it is computable.
+
+ internal node default layout: (idef1)
+
+ just like ldef1 except that item bodies are either blocknrs of
+ children or extents, and moving them may require updating parent
+ pointers in the nodes that they point to.
+*/
+
+/* There is an inherent 3-way tradeoff between optimizing and
+ exchanging disks between different architectures and code
+ complexity. This is optimal and simple and inexchangeable.
+ Someone else can do the code for exchanging disks and make it
+ complex. It would not be that hard. Using other than the PAGE_SIZE
+ might be suboptimal.
+*/
+
+#if !defined( __REISER4_NODE_H__ )
+#define __REISER4_NODE_H__
+
+#define LEAF40_NODE_SIZE PAGE_CACHE_SIZE
+
+#include "../../dformat.h"
+#include "../plugin_header.h"
+
+#include <linux/types.h>
+
+typedef enum {
+ NS_FOUND = 0,
+ NS_NOT_FOUND = -ENOENT
+} node_search_result;
+
+/* Maximal possible space overhead for creation of new item in a node */
+#define REISER4_NODE_MAX_OVERHEAD ( sizeof( reiser4_key ) + 32 )
+
+typedef enum {
+ REISER4_NODE_DKEYS = (1 << 0),
+ REISER4_NODE_TREE_STABLE = (1 << 1),
+ REISER4_NODE_CHECK_MERGEABLE = (1 << 2)
+} reiser4_node_check_flag;
+
+/* cut and cut_and_kill have too long a list of parameters. This structure is just to save some space on the stack */
+struct cut_list {
+ coord_t *from;
+ coord_t *to;
+ const reiser4_key *from_key;
+ const reiser4_key *to_key;
+ reiser4_key *smallest_removed;
+ carry_plugin_info *info;
+ __u32 flags;
+ struct inode *inode; /* this is to pass list of eflushed jnodes down to extent_kill_hook */
+ lock_handle *left;
+ lock_handle *right;
+};
+
+struct carry_cut_data;
+struct carry_kill_data;
+
+/* The responsibility of the node plugin is to store and give access
+ to the sequence of items within the node. */
+typedef struct node_plugin {
+ /* generic plugin fields */
+ plugin_header h;
+
+ /* calculates the amount of space that will be required to store an
+ item which is in addition to the space consumed by the item body.
+ (the space consumed by the item body can be gotten by calling
+ item->estimate) */
+ size_t(*item_overhead) (const znode * node, flow_t * f);
+
+ /* returns free space by looking into node (i.e., without using
+ znode->free_space). */
+ size_t(*free_space) (znode * node);
+ /* search within the node for the one item which might
+ contain the key, invoking item->search_within to search within
+ that item to see if it is in there */
+ node_search_result(*lookup) (znode * node, const reiser4_key * key,
+ lookup_bias bias, coord_t * coord);
+ /* number of items in node */
+ int (*num_of_items) (const znode * node);
+
+ /* store information about item in @coord in @data */
+ /* break into several node ops, don't add any more uses of this before doing so */
+ /*int ( *item_at )( const coord_t *coord, reiser4_item_data *data ); */
+ char *(*item_by_coord) (const coord_t * coord);
+ int (*length_by_coord) (const coord_t * coord);
+ item_plugin *(*plugin_by_coord) (const coord_t * coord);
+
+ /* store item key in @key */
+ reiser4_key *(*key_at) (const coord_t * coord, reiser4_key * key);
+	/* conservatively estimate what size of unit can fit into the
+	   node. This estimation should be performed without actually
+	   looking into the node's content (free space is saved in the
+	   znode). */
+ size_t(*estimate) (znode * node);
+
+ /* performs every consistency check the node plugin author could
+ imagine. Optional. */
+ int (*check) (const znode * node, __u32 flags, const char **error);
+
+ /* Called when node is read into memory and node plugin is
+ already detected. This should read some data into znode (like free
+ space counter) and, optionally, check data consistency.
+ */
+ int (*parse) (znode * node);
+ /* This method is called on a new node to initialise plugin specific
+ data (header, etc.) */
+ int (*init) (znode * node);
+ /* Check whether @node content conforms to this plugin format.
+ Probably only useful after support for old V3.x formats is added.
+ Uncomment after 4.0 only.
+ */
+ /* int ( *guess )( const znode *node ); */
+#if REISER4_DEBUG
+ void (*print) (const char *prefix, const znode * node, __u32 flags);
+#endif
+	/* change size of @item by @by bytes. @item->node has enough free
+	   space. When @by > 0 - free space is appended to the end of the
+	   item. When @by < 0 - the item is truncated - it is assumed that
+	   the last @by bytes of the item are freed already */
+ void (*change_item_size) (coord_t * item, int by);
+
+ /* create new item @length bytes long in coord @target */
+ int (*create_item) (coord_t * target, const reiser4_key * key,
+ reiser4_item_data * data, carry_plugin_info * info);
+
+ /* merge two neighboring mergeable items @left and @right
+ located on the same node. Such items can appear after
+ some operations like plugging a hole in a striped file.
+ This operation always increases free space in the node */
+ void (*merge_items) (coord_t *left, coord_t *right);
+
+ /* update key of item. */
+ void (*update_item_key) (coord_t * target, const reiser4_key * key,
+ carry_plugin_info * info);
+
+ int (*cut_and_kill) (struct carry_kill_data *, carry_plugin_info *);
+ int (*cut) (struct carry_cut_data *, carry_plugin_info *);
+
+ /*
+ * shrink item pointed to by @coord by @delta bytes.
+ */
+ int (*shrink_item) (coord_t * coord, int delta);
+
+	/* copy as much as possible, but not past @stop, from
+	   @stop->node to @target. If (pend == append) then data from beginning of
+ @stop->node are copied to the end of @target. If (pend == prepend) then
+ data from the end of @stop->node are copied to the beginning of
+ @target. Copied data are removed from @stop->node. Information
+ about what to do on upper level is stored in @todo */
+ int (*shift) (coord_t * stop, znode * target, shift_direction pend,
+ int delete_node, int including_insert_coord,
+ carry_plugin_info * info);
+ /* return true if this node allows carry() to be skipped in some
+ situations (see fs/reiser4/tree.c:insert_by_coord()). Reiser3.x
+ format emulation doesn't.
+
+ This speeds up insertions that don't require updates to the
+ parent, by bypassing initialisation of carry() structures. It is
+ believed that the majority of insertions will fit there.
+
+ */
+ int (*fast_insert) (const coord_t * coord);
+ int (*fast_paste) (const coord_t * coord);
+ int (*fast_cut) (const coord_t * coord);
+ /* this limits the maximum size of an item which can be inserted into
+ a node, and the number of bytes by which an item already in a node
+ may be appended */
+ int (*max_item_size) (void);
+ int (*prepare_removal) (znode * empty, carry_plugin_info * info);
+ /* change the plugin id of items which are already in a node.
+ Currently it is used in tail conversion for regular files */
+ int (*set_item_plugin) (coord_t * coord, item_id);
+ /* calculate and check/update znode's checksum
+ (if @check is true, then check, otherwise update) */
+ int (*csum)(znode *node, int check);
+} node_plugin;
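+
+/*
+ * Illustrative sketch (a comment only, not compiled): generic tree code
+ * never interprets a node layout directly; it always calls through the
+ * node plugin attached to the znode. The znode field name (nplug) and the
+ * FIND_EXACT lookup bias are assumed here for the sake of the example:
+ *
+ *	coord_t coord;
+ *	node_search_result ret;
+ *
+ *	ret = node->nplug->lookup(node, key, FIND_EXACT, &coord);
+ */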
+
+typedef enum {
+ NODE40_ID, /* standard unified node layout used for both
+ leaf and internal nodes */
+ NODE41_ID, /* node layout with a checksum */
+ LAST_NODE_ID
+} reiser4_node_id;
+
+extern reiser4_key *leftmost_key_in_node(const znode * node, reiser4_key * key);
+#if REISER4_DEBUG
+extern void print_node_content(const char *prefix, const znode * node,
+ __u32 flags);
+#endif
+
+extern void indent_znode(const znode * node);
+
+typedef struct common_node_header {
+ /*
+ * identifier of node plugin. Must be located at the very beginning of
+ * a node.
+ */
+ __le16 plugin_id;
+} common_node_header;
+
+/* __REISER4_NODE_H__ */
+#endif
+/*
+ * Local variables:
+ * c-indentation-style: "K&R"
+ * mode-name: "LC"
+ * c-basic-offset: 8
+ * tab-width: 8
+ * fill-column: 79
+ * scroll-step: 1
+ * End:
+ */
diff -urN --no-dereference linux-5.10.2.orig/fs/reiser4/plugin/object.c linux-5.10.2/fs/reiser4/plugin/object.c
--- linux-5.10.2.orig/fs/reiser4/plugin/object.c 1970-01-01 01:00:00.000000000 +0100
+++ linux-5.10.2/fs/reiser4/plugin/object.c 2020-12-23 16:07:46.130813304 +0100
@@ -0,0 +1,618 @@
+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
+ * reiser4/README */
+
+/*
+ * Examples of object plugins: file, directory, symlink, special file.
+ *
+ * Plugins associated with inode:
+ *
+ * Plugin of inode is plugin referenced by plugin-id field of on-disk
+ * stat-data. How we store this plugin in in-core inode is not
+ * important. Currently pointers are used, another variant is to store offsets
+ * and do array lookup on each access.
+ *
+ * Now, each inode has one selected plugin: object plugin that
+ * determines what type of file this object is: directory, regular etc.
+ *
+ * This main plugin can use other plugins that are thus subordinated to
+ * it. Directory instance of object plugin uses hash; regular file
+ * instance uses tail policy plugin.
+ *
+ * Object plugin is either taken from id in stat-data or guessed from
+ * i_mode bits. Once it is established we ask it to install its
+ * subordinate plugins, by looking again in stat-data or inheriting them
+ * from parent.
+ *
+ * How new inode is initialized during ->read_inode():
+ * 1 read stat-data and initialize inode fields: i_size, i_mode,
+ * i_generation, capabilities etc.
+ * 2 read plugin id from stat data or try to guess plugin id
+ * from inode->i_mode bits if plugin id is missing.
+ * 3 Call ->init_inode() method of stat-data plugin to initialise inode fields.
+ *
+ * NIKITA-FIXME-HANS: can you say a little about 1 being done before 3? What
+ * if stat data does contain i_size, etc., due to it being an unusual plugin?
+ *
+ * 4 Call ->activate() method of the object's plugin. The plugin is either
+ * read from stat-data or guessed from the mode bits.
+ * 5 Call ->inherit() method of the object plugin to inherit as yet
+ * uninitialized plugins from the parent.
+ *
+ * A simple induction shows that after the last step all plugins of the
+ * inode are initialized.
+ *
+ * When creating new object:
+ * 1 obtain object plugin id (see next period)
+ * NIKITA-FIXME-HANS: period?
+ * 2 ->install() this plugin
+ * 3 ->inherit() the rest from the parent
+ *
+ * We need some examples of creating an object with default and non-default
+ * plugin ids. Nikita, please create them.
+ */
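+
+/*
+ * Schematic sketch of the initialization order described above (a comment
+ * only; the helper names below are not real functions):
+ *
+ *	read_stat_data(inode);			// step 1: i_size, i_mode, ...
+ *	fplug = plugin_from_sd_or_mode(inode);	// step 2
+ *	sd_plug->init_inode(inode, ...);	// step 3
+ *	fplug->activate(inode);			// step 4
+ *	fplug->inherit(inode, parent);		// step 5
+ */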
+
+#include "../inode.h"
+
+int _bugop(void)
+{
+ BUG_ON(1);
+ return 0;
+}
+
+#define bugop ((void *)_bugop)
+
+static int build_body_key_bugop(struct inode *inode, loff_t off,
+ reiser4_key *key)
+{
+ BUG_ON(1);
+ return 0;
+}
+
+static int _dummyop(void)
+{
+ return 0;
+}
+
+#define dummyop ((void *)_dummyop)
+
+static int change_file(struct inode *inode,
+ reiser4_plugin * plugin,
+ pset_member memb)
+{
+ /* cannot change object plugin of already existing object */
+ if (memb == PSET_FILE)
+ return RETERR(-EINVAL);
+
+ /* Change PSET_CREATE */
+ return aset_set_unsafe(&reiser4_inode_data(inode)->pset, memb, plugin);
+}
+
+static reiser4_plugin_ops file_plugin_ops = {
+ .change = change_file
+};
+
+static struct inode_operations null_i_ops = {.create = NULL};
+static struct file_operations null_f_ops = {.owner = NULL};
+static struct address_space_operations null_a_ops = {.writepage = NULL};
+
+/*
+ * For each VFS operation reiser4 provides either a dispatcher or a
+ * common (fop, iop, aop) method.
+ *
+ * Dispatchers (suffixed with "dispatch") pass control to the proper
+ * plugin in accordance with the plugin table (pset) located in the
+ * private part of the inode.
+ *
+ * Common methods are NOT suffixed with "dispatch". They are the same
+ * for all plugins of the FILE interface and, hence, no dispatching is
+ * needed.
+ */
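+
+/*
+ * Illustrative sketch (a comment only, not compiled): a dispatcher does
+ * nothing but fetch the file plugin from the inode's pset and delegate to
+ * it. The helper name inode_file_plugin() is assumed here; the real
+ * dispatchers are defined elsewhere in the reiser4 tree:
+ *
+ *	ssize_t example_read_dispatch(struct file *file, char __user *buf,
+ *				      size_t count, loff_t *off)
+ *	{
+ *		file_plugin *fplug = inode_file_plugin(file_inode(file));
+ *
+ *		return fplug->read(file, buf, count, off);
+ *	}
+ */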
+
+/*
+ * VFS methods for regular files
+ */
+static struct inode_operations regular_file_i_ops = {
+ .permission = reiser4_permission_common,
+ .setattr = reiser4_setattr_dispatch,
+ .getattr = reiser4_getattr_common
+};
+static struct file_operations regular_file_f_ops = {
+ .llseek = generic_file_llseek,
+ .read = reiser4_read_dispatch,
+ .write = reiser4_write_dispatch,
+ .read_iter = generic_file_read_iter,
+ .unlocked_ioctl = reiser4_ioctl_dispatch,
+#ifdef CONFIG_COMPAT
+ .compat_ioctl = reiser4_ioctl_dispatch,
+#endif
+ .mmap = reiser4_mmap_dispatch,
+ .open = reiser4_open_dispatch,
+ .release = reiser4_release_dispatch,
+ .fsync = reiser4_sync_file_common,
+ .splice_read = generic_file_splice_read,
+};
+static struct address_space_operations regular_file_a_ops = {
+ .writepage = reiser4_writepage,
+ .readpage = reiser4_readpage_dispatch,
+ //.sync_page = block_sync_page,
+ .writepages = reiser4_writepages_dispatch,
+ .set_page_dirty = reiser4_set_page_dirty,
+ .readpages = reiser4_readpages_dispatch,
+ .write_begin = reiser4_write_begin_dispatch,
+ .write_end = reiser4_write_end_dispatch,
+ .bmap = reiser4_bmap_dispatch,
+ .invalidatepage = reiser4_invalidatepage,
+ .releasepage = reiser4_releasepage,
+ .migratepage = reiser4_migratepage,
+ .batch_lock_tabu = 1
+};
+
+/* VFS methods for symlink files */
+static struct inode_operations symlink_file_i_ops = {
+ .get_link = reiser4_get_link_common,
+ .permission = reiser4_permission_common,
+ .setattr = reiser4_setattr_common,
+ .getattr = reiser4_getattr_common
+};
+
+/* VFS methods for special files */
+static struct inode_operations special_file_i_ops = {
+ .permission = reiser4_permission_common,
+ .setattr = reiser4_setattr_common,
+ .getattr = reiser4_getattr_common
+};
+
+/* VFS methods for directories */
+static struct inode_operations directory_i_ops = {
+ .create = reiser4_create_common,
+ .lookup = reiser4_lookup_common,
+ .link = reiser4_link_common,
+ .unlink = reiser4_unlink_common,
+ .symlink = reiser4_symlink_common,
+ .mkdir = reiser4_mkdir_common,
+ .rmdir = reiser4_unlink_common,
+ .mknod = reiser4_mknod_common,
+ .rename = reiser4_rename2_common,
+ .permission = reiser4_permission_common,
+ .setattr = reiser4_setattr_common,
+ .getattr = reiser4_getattr_common
+};
+static struct file_operations directory_f_ops = {
+ .llseek = reiser4_llseek_dir_common,
+ .read = generic_read_dir,
+ .iterate = reiser4_iterate_common,
+ .release = reiser4_release_dir_common,
+ .fsync = reiser4_sync_common,
+#ifdef CONFIG_COMPAT
+ .compat_ioctl = reiser4_ioctl_dir_common,
+#endif
+ .unlocked_ioctl = reiser4_ioctl_dir_common
+};
+static struct address_space_operations directory_a_ops = {
+ .writepages = dummyop,
+};
+
+/*
+ * Definitions of object plugins.
+ */
+
+file_plugin file_plugins[LAST_FILE_PLUGIN_ID] = {
+ [UNIX_FILE_PLUGIN_ID] = {
+ .h = {
+ .type_id = REISER4_FILE_PLUGIN_TYPE,
+ .id = UNIX_FILE_PLUGIN_ID,
+ .groups = (1 << REISER4_REGULAR_FILE),
+ .pops = &file_plugin_ops,
+ .label = "reg",
+ .desc = "regular file",
+ .linkage = {NULL, NULL},
+ },
+ /*
+ * invariant vfs ops
+ */
+ .inode_ops = &regular_file_i_ops,
+ .file_ops = &regular_file_f_ops,
+ .as_ops = &regular_file_a_ops,
+ /*
+ * private i_ops
+ */
+ .setattr = setattr_unix_file,
+ .open = open_unix_file,
+ .read = read_unix_file,
+ .write = write_unix_file,
+ .ioctl = ioctl_unix_file,
+ .mmap = mmap_unix_file,
+ .release = release_unix_file,
+ /*
+ * private f_ops
+ */
+ .readpage = readpage_unix_file,
+ .readpages = readpages_unix_file,
+ .writepages = writepages_unix_file,
+ .write_begin = write_begin_unix_file,
+ .write_end = write_end_unix_file,
+ /*
+ * private a_ops
+ */
+ .bmap = bmap_unix_file,
+ /*
+ * other private methods
+ */
+ .write_sd_by_inode = write_sd_by_inode_common,
+ .build_body_key = build_body_key_unix_file,
+ .set_plug_in_inode = set_plug_in_inode_common,
+ .adjust_to_parent = adjust_to_parent_common,
+ .create_object = reiser4_create_object_common,
+ .delete_object = delete_object_unix_file,
+ .add_link = reiser4_add_link_common,
+ .rem_link = reiser4_rem_link_common,
+ .owns_item = owns_item_unix_file,
+ .can_add_link = can_add_link_common,
+ .detach = dummyop,
+ .bind = dummyop,
+ .safelink = safelink_common,
+ .estimate = {
+ .create = estimate_create_common,
+ .update = estimate_update_common,
+ .unlink = estimate_unlink_common
+ },
+ .init_inode_data = init_inode_data_unix_file,
+ .cut_tree_worker = cut_tree_worker_common,
+ .wire = {
+ .write = wire_write_common,
+ .read = wire_read_common,
+ .get = wire_get_common,
+ .size = wire_size_common,
+ .done = wire_done_common
+ }
+ },
+ [DIRECTORY_FILE_PLUGIN_ID] = {
+ .h = {
+ .type_id = REISER4_FILE_PLUGIN_TYPE,
+ .id = DIRECTORY_FILE_PLUGIN_ID,
+ .groups = (1 << REISER4_DIRECTORY_FILE),
+ .pops = &file_plugin_ops,
+ .label = "dir",
+ .desc = "directory",
+ .linkage = {NULL, NULL}
+ },
+ .inode_ops = &null_i_ops,
+ .file_ops = &null_f_ops,
+ .as_ops = &null_a_ops,
+
+ .write_sd_by_inode = write_sd_by_inode_common,
+ .build_body_key = build_body_key_bugop,
+ .set_plug_in_inode = set_plug_in_inode_common,
+ .adjust_to_parent = adjust_to_parent_common_dir,
+ .create_object = reiser4_create_object_common,
+ .delete_object = reiser4_delete_dir_common,
+ .add_link = reiser4_add_link_common,
+ .rem_link = rem_link_common_dir,
+ .owns_item = owns_item_common_dir,
+ .can_add_link = can_add_link_common,
+ .can_rem_link = can_rem_link_common_dir,
+ .detach = reiser4_detach_common_dir,
+ .bind = reiser4_bind_common_dir,
+ .safelink = safelink_common,
+ .estimate = {
+ .create = estimate_create_common_dir,
+ .update = estimate_update_common,
+ .unlink = estimate_unlink_common_dir
+ },
+ .wire = {
+ .write = wire_write_common,
+ .read = wire_read_common,
+ .get = wire_get_common,
+ .size = wire_size_common,
+ .done = wire_done_common
+ },
+ .init_inode_data = init_inode_ordering,
+ .cut_tree_worker = cut_tree_worker_common,
+ },
+ [SYMLINK_FILE_PLUGIN_ID] = {
+ .h = {
+ .type_id = REISER4_FILE_PLUGIN_TYPE,
+ .id = SYMLINK_FILE_PLUGIN_ID,
+ .groups = (1 << REISER4_SYMLINK_FILE),
+ .pops = &file_plugin_ops,
+ .label = "symlink",
+ .desc = "symbolic link",
+ .linkage = {NULL,NULL}
+ },
+ .inode_ops = &symlink_file_i_ops,
+ /* inode->i_fop of symlink is initialized
+ by NULL in setup_inode_ops */
+ .file_ops = &null_f_ops,
+ .as_ops = &null_a_ops,
+
+ .write_sd_by_inode = write_sd_by_inode_common,
+ .set_plug_in_inode = set_plug_in_inode_common,
+ .adjust_to_parent = adjust_to_parent_common,
+ .create_object = reiser4_create_symlink,
+ .delete_object = reiser4_delete_object_common,
+ .add_link = reiser4_add_link_common,
+ .rem_link = reiser4_rem_link_common,
+ .can_add_link = can_add_link_common,
+ .detach = dummyop,
+ .bind = dummyop,
+ .safelink = safelink_common,
+ .estimate = {
+ .create = estimate_create_common,
+ .update = estimate_update_common,
+ .unlink = estimate_unlink_common
+ },
+ .init_inode_data = init_inode_ordering,
+ .cut_tree_worker = cut_tree_worker_common,
+ .destroy_inode = destroy_inode_symlink,
+ .wire = {
+ .write = wire_write_common,
+ .read = wire_read_common,
+ .get = wire_get_common,
+ .size = wire_size_common,
+ .done = wire_done_common
+ }
+ },
+ [SPECIAL_FILE_PLUGIN_ID] = {
+ .h = {
+ .type_id = REISER4_FILE_PLUGIN_TYPE,
+ .id = SPECIAL_FILE_PLUGIN_ID,
+ .groups = (1 << REISER4_SPECIAL_FILE),
+ .pops = &file_plugin_ops,
+ .label = "special",
+ .desc =
+ "special: fifo, device or socket",
+ .linkage = {NULL, NULL}
+ },
+ .inode_ops = &special_file_i_ops,
+ /* file_ops of special files (sockets, block, char, fifo) are
+ initialized by init_special_inode. */
+ .file_ops = &null_f_ops,
+ .as_ops = &null_a_ops,
+
+ .write_sd_by_inode = write_sd_by_inode_common,
+ .set_plug_in_inode = set_plug_in_inode_common,
+ .adjust_to_parent = adjust_to_parent_common,
+ .create_object = reiser4_create_object_common,
+ .delete_object = reiser4_delete_object_common,
+ .add_link = reiser4_add_link_common,
+ .rem_link = reiser4_rem_link_common,
+ .owns_item = owns_item_common,
+ .can_add_link = can_add_link_common,
+ .detach = dummyop,
+ .bind = dummyop,
+ .safelink = safelink_common,
+ .estimate = {
+ .create = estimate_create_common,
+ .update = estimate_update_common,
+ .unlink = estimate_unlink_common
+ },
+ .init_inode_data = init_inode_ordering,
+ .cut_tree_worker = cut_tree_worker_common,
+ .wire = {
+ .write = wire_write_common,
+ .read = wire_read_common,
+ .get = wire_get_common,
+ .size = wire_size_common,
+ .done = wire_done_common
+ }
+ },
+ [CRYPTCOMPRESS_FILE_PLUGIN_ID] = {
+ .h = {
+ .type_id = REISER4_FILE_PLUGIN_TYPE,
+ .id = CRYPTCOMPRESS_FILE_PLUGIN_ID,
+ .groups = (1 << REISER4_REGULAR_FILE),
+ .pops = &file_plugin_ops,
+ .label = "cryptcompress",
+ .desc = "cryptcompress file",
+ .linkage = {NULL, NULL}
+ },
+ .inode_ops = &regular_file_i_ops,
+ .file_ops = &regular_file_f_ops,
+ .as_ops = &regular_file_a_ops,
+
+ .setattr = setattr_cryptcompress,
+ .open = open_cryptcompress,
+ .read = read_cryptcompress,
+ .write = write_cryptcompress,
+ .ioctl = ioctl_cryptcompress,
+ .mmap = mmap_cryptcompress,
+ .release = release_cryptcompress,
+
+ .readpage = readpage_cryptcompress,
+ .readpages = readpages_cryptcompress,
+ .writepages = writepages_cryptcompress,
+ .write_begin = write_begin_cryptcompress,
+ .write_end = write_end_cryptcompress,
+
+ .bmap = bmap_cryptcompress,
+
+ .write_sd_by_inode = write_sd_by_inode_common,
+ .build_body_key = build_body_key_cryptcompress,
+ .set_plug_in_inode = set_plug_in_inode_common,
+ .adjust_to_parent = adjust_to_parent_cryptcompress,
+ .create_object = create_object_cryptcompress,
+ .delete_object = delete_object_cryptcompress,
+ .add_link = reiser4_add_link_common,
+ .rem_link = reiser4_rem_link_common,
+ .owns_item = owns_item_common,
+ .can_add_link = can_add_link_common,
+ .detach = dummyop,
+ .bind = dummyop,
+ .safelink = safelink_common,
+ .estimate = {
+ .create = estimate_create_common,
+ .update = estimate_update_common,
+ .unlink = estimate_unlink_common
+ },
+ .init_inode_data = init_inode_data_cryptcompress,
+ .cut_tree_worker = cut_tree_worker_cryptcompress,
+ .destroy_inode = destroy_inode_cryptcompress,
+ .wire = {
+ .write = wire_write_common,
+ .read = wire_read_common,
+ .get = wire_get_common,
+ .size = wire_size_common,
+ .done = wire_done_common
+ }
+ },
+ [STRIPED_FILE_PLUGIN_ID] = {
+ .h = {
+ .type_id = REISER4_FILE_PLUGIN_TYPE,
+ .id = STRIPED_FILE_PLUGIN_ID,
+ .groups = (1 << REISER4_REGULAR_FILE),
+ .pops = &file_plugin_ops,
+ .label = "stripe",
+ .desc = "striped file",
+ .linkage = {NULL, NULL},
+ },
+ /*
+ * invariant vfs ops
+ */
+ .inode_ops = &regular_file_i_ops,
+ .file_ops = &regular_file_f_ops,
+ .as_ops = &regular_file_a_ops,
+ /*
+ * private i_ops
+ */
+ .setattr = setattr_stripe,
+ .open = open_stripe,
+ .read = read_stripe,
+ .write = write_stripe,
+ .ioctl = ioctl_stripe,
+ .mmap = mmap_cryptcompress,
+ .release = release_stripe,
+ /*
+ * private f_ops
+ */
+ .readpage = readpage_stripe,
+ .readpages = readpages_stripe,
+ .writepages = writepages_stripe,
+ .write_begin = write_begin_stripe,
+ .write_end = write_end_stripe,
+ /*
+ * private a_ops
+ */
+ .bmap = bmap_unix_file,
+ /*
+ * other private methods
+ */
+ .write_sd_by_inode = write_sd_by_inode_common,
+ .build_body_key = build_body_key_stripe,
+ .set_plug_in_inode = set_plug_in_inode_common,
+ .adjust_to_parent = adjust_to_parent_common,
+ .create_object = create_object_stripe,
+ .delete_object = delete_object_stripe,
+ .add_link = reiser4_add_link_common,
+ .rem_link = reiser4_rem_link_common,
+ .owns_item = owns_item_unix_file,
+ .can_add_link = can_add_link_common,
+ .detach = dummyop,
+ .bind = dummyop,
+ .safelink = safelink_common,
+ .estimate = {
+ .create = estimate_create_common,
+ .update = estimate_update_common,
+ .unlink = estimate_unlink_common
+ },
+ .init_inode_data = init_inode_data_unix_file,
+ .cut_tree_worker = cut_tree_worker_stripe,
+ .migrate = migrate_stripe,
+ .wire = {
+ .write = wire_write_common,
+ .read = wire_read_common,
+ .get = wire_get_common,
+ .size = wire_size_common,
+ .done = wire_done_common
+ }
+ }
+};
+
+static int change_dir(struct inode *inode,
+ reiser4_plugin * plugin,
+ pset_member memb)
+{
+ /* cannot change dir plugin of already existing object */
+ return RETERR(-EINVAL);
+}
+
+static reiser4_plugin_ops dir_plugin_ops = {
+ .change = change_dir
+};
+
+/*
+ * definition of directory plugins
+ */
+
+dir_plugin dir_plugins[LAST_DIR_ID] = {
+ /* standard hashed directory plugin */
+ [HASHED_DIR_PLUGIN_ID] = {
+ .h = {
+ .type_id = REISER4_DIR_PLUGIN_TYPE,
+ .id = HASHED_DIR_PLUGIN_ID,
+ .pops = &dir_plugin_ops,
+ .label = "dir",
+ .desc = "hashed directory",
+ .linkage = {NULL, NULL}
+ },
+ .inode_ops = &directory_i_ops,
+ .file_ops = &directory_f_ops,
+ .as_ops = &directory_a_ops,
+
+ .get_parent = get_parent_common,
+ .is_name_acceptable = is_name_acceptable_common,
+ .build_entry_key = build_entry_key_hashed,
+ .build_readdir_key = build_readdir_key_common,
+ .add_entry = reiser4_add_entry_common,
+ .rem_entry = reiser4_rem_entry_common,
+ .init = reiser4_dir_init_common,
+ .done = reiser4_dir_done_common,
+ .attach = reiser4_attach_common,
+ .detach = reiser4_detach_common,
+ .estimate = {
+ .add_entry = estimate_add_entry_common,
+ .rem_entry = estimate_rem_entry_common,
+ .unlink = dir_estimate_unlink_common
+ }
+ },
+ /* hashed directory for which seekdir/telldir are guaranteed to
+ * work. Brain-damage. */
+ [SEEKABLE_HASHED_DIR_PLUGIN_ID] = {
+ .h = {
+ .type_id = REISER4_DIR_PLUGIN_TYPE,
+ .id = SEEKABLE_HASHED_DIR_PLUGIN_ID,
+ .pops = &dir_plugin_ops,
+ .label = "dir32",
+ .desc = "directory hashed with 31 bit hash",
+ .linkage = {NULL, NULL}
+ },
+ .inode_ops = &directory_i_ops,
+ .file_ops = &directory_f_ops,
+ .as_ops = &directory_a_ops,
+
+ .get_parent = get_parent_common,
+ .is_name_acceptable = is_name_acceptable_common,
+ .build_entry_key = build_entry_key_seekable,
+ .build_readdir_key = build_readdir_key_common,
+ .add_entry = reiser4_add_entry_common,
+ .rem_entry = reiser4_rem_entry_common,
+ .init = reiser4_dir_init_common,
+ .done = reiser4_dir_done_common,
+ .attach = reiser4_attach_common,
+ .detach = reiser4_detach_common,
+ .estimate = {
+ .add_entry = estimate_add_entry_common,
+ .rem_entry = estimate_rem_entry_common,
+ .unlink = dir_estimate_unlink_common
+ }
+ }
+};
+
+/* Make Linus happy.
+ Local variables:
+ c-indentation-style: "K&R"
+ mode-name: "LC"
+ c-basic-offset: 8
+ tab-width: 8
+ fill-column: 120
+ End:
+*/
diff -urN --no-dereference linux-5.10.2.orig/fs/reiser4/plugin/object.h linux-5.10.2/fs/reiser4/plugin/object.h
--- linux-5.10.2.orig/fs/reiser4/plugin/object.h 1970-01-01 01:00:00.000000000 +0100
+++ linux-5.10.2/fs/reiser4/plugin/object.h 2020-12-23 16:07:46.130813304 +0100
@@ -0,0 +1,119 @@
+/* Copyright 2002, 2003 by Hans Reiser, licensing governed by
+ * reiser4/README */
+
+/* Declaration of object plugin functions. */
+
+#if !defined(__FS_REISER4_PLUGIN_OBJECT_H__)
+#define __FS_REISER4_PLUGIN_OBJECT_H__
+
+#include "../type_safe_hash.h"
+
+/* common implementations of inode operations */
+int reiser4_create_common(struct inode *parent, struct dentry *dentry,
+ umode_t mode, bool);
+struct dentry *reiser4_lookup_common(struct inode *parent,
+ struct dentry *dentry,
+ unsigned int);
+int reiser4_link_common(struct dentry *existing, struct inode *parent,
+ struct dentry *newname);
+int reiser4_unlink_common(struct inode *parent, struct dentry *victim);
+int reiser4_mkdir_common(struct inode *parent, struct dentry *dentry, umode_t mode);
+int reiser4_symlink_common(struct inode *parent, struct dentry *dentry,
+ const char *linkname);
+int reiser4_mknod_common(struct inode *parent, struct dentry *dentry,
+ umode_t mode, dev_t rdev);
+int reiser4_rename2_common(struct inode *old_dir, struct dentry *old_name,
+ struct inode *new_dir, struct dentry *new_name,
+ unsigned flags);
+const char *reiser4_get_link_common(struct dentry *, struct inode *inode,
+ struct delayed_call *done);
+int reiser4_permission_common(struct inode *, int mask);
+int reiser4_setattr_common(struct dentry *, struct iattr *);
+int reiser4_getattr_common(const struct path *path, struct kstat *stat,
+ u32 request_mask, unsigned int flags);
+
+/* common implementations of file operations */
+loff_t reiser4_llseek_dir_common(struct file *, loff_t off, int origin);
+int reiser4_iterate_common(struct file *, struct dir_context *context);
+int reiser4_release_dir_common(struct inode *, struct file *);
+int reiser4_sync_common(struct file *, loff_t, loff_t, int datasync);
+
+/* file plugin operations: common implementations */
+void build_body_key_common(struct inode *inode, reiser4_key *key);
+int write_sd_by_inode_common(struct inode *, oid_t *oid);
+int set_plug_in_inode_common(struct inode *object, struct inode *parent,
+ reiser4_object_create_data *);
+int adjust_to_parent_common(struct inode *object, struct inode *parent,
+ struct inode *root);
+int adjust_to_parent_common_dir(struct inode *object, struct inode *parent,
+ struct inode *root);
+int adjust_to_parent_cryptcompress(struct inode *object, struct inode *parent,
+ struct inode *root);
+int reiser4_create_object_common(struct inode *object, struct inode *parent,
+ reiser4_object_create_data *, oid_t*);
+int reiser4_delete_object_common(struct inode *);
+int reiser4_delete_dir_common(struct inode *);
+int reiser4_add_link_common(struct inode *object, struct inode *parent);
+int reiser4_rem_link_common(struct inode *object, struct inode *parent);
+int rem_link_common_dir(struct inode *object, struct inode *parent);
+int owns_item_common(const struct inode *, const coord_t *);
+int owns_item_common_dir(const struct inode *, const coord_t *);
+int can_add_link_common(const struct inode *);
+int can_rem_link_common_dir(const struct inode *);
+int reiser4_detach_common_dir(struct inode *child, struct inode *parent);
+int reiser4_bind_common_dir(struct inode *child, struct inode *parent);
+int safelink_common(struct inode *, reiser4_safe_link_t, __u64 value);
+reiser4_block_nr estimate_create_common(const struct inode *);
+reiser4_block_nr estimate_create_common_dir(const struct inode *);
+reiser4_block_nr estimate_update_common(const struct inode *);
+reiser4_block_nr estimate_unlink_common(const struct inode *,
+ const struct inode *);
+reiser4_block_nr estimate_unlink_common_dir(const struct inode *,
+ const struct inode *);
+int reserve_update_sd_common(struct inode *inode);
+
+char *wire_write_common(struct inode *, char *start);
+char *wire_read_common(char *addr, reiser4_object_on_wire *);
+struct dentry *wire_get_common(struct super_block *, reiser4_object_on_wire *);
+int wire_size_common(struct inode *);
+void wire_done_common(reiser4_object_on_wire *);
+
+/* dir plugin operations: common implementations */
+struct dentry *get_parent_common(struct inode *child);
+int is_name_acceptable_common(const struct inode *, const char *name, int len);
+void build_entry_key_common(const struct inode *,
+ const struct qstr *qname, reiser4_key *);
+int build_readdir_key_common(struct file *dir, reiser4_key *);
+int reiser4_add_entry_common(struct inode *object, struct dentry *where,
+ reiser4_object_create_data * , reiser4_dir_entry_desc *);
+int reiser4_rem_entry_common(struct inode *object, struct dentry *where,
+ reiser4_dir_entry_desc *);
+int reiser4_dir_init_common(struct inode *object, struct inode *parent,
+ reiser4_object_create_data *);
+int reiser4_dir_done_common(struct inode *);
+int reiser4_attach_common(struct inode *child, struct inode *parent);
+int reiser4_detach_common(struct inode *object, struct inode *parent);
+reiser4_block_nr estimate_add_entry_common(const struct inode *);
+reiser4_block_nr estimate_rem_entry_common(const struct inode *);
+reiser4_block_nr dir_estimate_unlink_common(const struct inode *,
+ const struct inode *);
+
+/* these are essential parts of common implementations, they are to make
+ customized implementations easier */
+
+/* merely useful functions */
+int lookup_sd(struct inode *, znode_lock_mode, coord_t *, lock_handle * ,
+ const reiser4_key * , lookup_bias bias, int silent);
+
+/* __FS_REISER4_PLUGIN_OBJECT_H__ */
+#endif
+
+/* Make Linus happy.
+ Local variables:
+ c-indentation-style: "K&R"
+ mode-name: "LC"
+ c-basic-offset: 8
+ tab-width: 8
+ fill-column: 120
+ End:
+*/
diff -urN --no-dereference linux-5.10.2.orig/fs/reiser4/plugin/plugin.c linux-5.10.2/fs/reiser4/plugin/plugin.c
--- linux-5.10.2.orig/fs/reiser4/plugin/plugin.c 1970-01-01 01:00:00.000000000 +0100
+++ linux-5.10.2/fs/reiser4/plugin/plugin.c 2020-12-23 16:07:46.131813319 +0100
@@ -0,0 +1,587 @@
+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
+ * reiser4/README */
+
+/* Basic plugin infrastructure, lookup etc. */
+
+/* PLUGINS:
+
+ Plugins are internal Reiser4 "modules" or "objects" used to increase
+ extensibility and allow external users to easily adapt reiser4 to
+ their needs.
+
+ Plugins are classified into several disjoint "types". Plugins
+ belonging to the particular plugin type are termed "instances" of
+ this type. Existing types are listed by enum reiser4_plugin_type
+ (see plugin/plugin_header.h)
+
+NIKITA-FIXME-HANS: update this list, and review this entire comment for currency
+
+ The object (file) plugin determines how a given file-system object serves
+ standard VFS requests for read, write, seek, mmap etc. Instances of
+ file plugins are: regular file, directory, symlink. Another example
+ of a file plugin is the audit plugin, which optionally records accesses
+ to the underlying object and forwards requests to it.
+
+ Hash plugins compute hashes used by reiser4 to store and locate
+ files within directories. Instances of hash plugin type are: r5,
+ tea, rupasov.
+
+ Tail plugins (or, more precisely, tail policy plugins) determine
+ when last part of the file should be stored in a formatted item.
+
+ Scope and lookup:
+
+ Each plugin type and each plugin within a type has a label, chosen so
+ that the pair ( type_label, plugin_label ) is unique. This pair is a
+ globally persistent and user-visible plugin identifier. Internally the
+ kernel maintains plugins and plugin types in arrays, using an index
+ into those arrays as the plugin and plugin type identifiers. The
+ file-system, in turn, also maintains a persistent "dictionary" mapping
+ plugin labels to the numerical identifiers which are stored in
+ file-system objects. That is, we store the offset into the plugin
+ array for that plugin type as the plugin id in the stat data of the
+ filesystem object.
+
+ Internal kernel plugin type identifier (index in plugins[] array) is
+ of type reiser4_plugin_type. Set of available plugin types is
+ currently static, but dynamic loading doesn't seem to pose
+ insurmountable problems.
+
+ Within each type plugins are addressed by the identifiers of type
+ reiser4_plugin_id (indices in reiser4_plugin_type_data.builtin[]).
+ Such identifiers are only required to be unique within one type,
+ not globally.
+
+ Thus, plugin in memory is uniquely identified by the pair (type_id,
+ id).
+
+ Usage:
+
+ There exists only one instance of each plugin, but this single
+ instance can be associated with many entities (file-system objects,
+ items, nodes, transactions, file-descriptors etc.). An entity to which
+ a plugin of a given type is attached is termed (due to the lack of
+ imagination) the "subject" of this plugin type and, by abuse of
+ terminology, the subject of the particular instance of this type to
+ which it is attached currently. For example, an inode is a subject of
+ the object plugin type. An inode representing a directory is a subject
+ of the directory plugin, of the hash plugin type and of some particular
+ instance of the hash plugin type. An inode representing a regular file
+ is a subject of the "regular file" plugin, of the tail-policy plugin
+ type, etc.
+
+ With each subject the plugin possibly stores some state. For example,
+ the state of a directory plugin (an instance of the object plugin type)
+ is a pointer to a hash plugin (if directories always use hashing, that
+ is).
+
+ Interface:
+
+ In addition to a scalar identifier, each plugin type and each plugin
+ proper has a "label" (a short string) and a "description" (a longer
+ descriptive string). Labels and descriptions of plugin types are
+ hard-coded into the plugins[] array, declared and defined in
+ plugin.c. The label and description of a plugin are stored in the
+ .label and .desc fields of reiser4_plugin_header respectively. It is
+ possible to locate a plugin by the pair of labels.
+
+ Features (not implemented):
+
+ . user-level plugin manipulations:
+ + reiser4("filename/..file_plugin<='audit'");
+ + write(open("filename/..file_plugin"), "audit", 8);
+
+ . user level utilities lsplug and chplug to manipulate plugins.
+ Utilities are not of primary priority. Possibly they will not be
+ working in v4.0
+
+ NIKITA-FIXME-HANS: this should be a mkreiserfs option not a mount
+ option, do you agree? I don't think that specifying it at mount time,
+ and then changing it with each mount, is a good model for usage.
+
+ . mount option "plug" to set-up plugins of root-directory.
+ "plug=foo:bar" will set "bar" as default plugin of type "foo".
+
+ Limitations:
+
+ . each plugin type has to provide at least one builtin
+ plugin. This is a technical limitation and it can be lifted in the
+ future.
+
+ TODO:
+
+ New plugin types/plugins:
+ Things we should be able to separately choose to inherit:
+
+ security plugins
+
+ stat data
+
+ file bodies
+
+ file plugins
+
+ dir plugins
+
+ . perm:acl
+
+ . audi---audit plugin intercepting and possibly logging all
+ accesses to an object. Requires putting stub functions into
+ file_operations instead of generic_file_*.
+
+NIKITA-FIXME-HANS: why make overflows a plugin?
+ . over---handle hash overflows
+
+ . sqnt---handle different access patterns and instruments read-ahead
+
+NIKITA-FIXME-HANS: describe the line below in more detail.
+
+ . hier---handle inheritance of plugins along file-system hierarchy
+
+ Different kinds of inheritance: on creation vs. on access.
+ Compatible/incompatible plugins.
+ Inheritance for multi-linked files.
+ Layered plugins.
+ Notion of plugin context is abandoned.
+
+ Each file is associated with one plugin, and dependent plugins
+ (hash, etc.) are stored as main plugin state. Now, if we have plugins
+ used for regular files but not for directories, how would such
+ plugins be inherited?
+ . always store them with directories also
+
+NIKITA-FIXME-HANS: Do the line above. It is not exclusive of doing
+the line below which is also useful.
+
+ . use inheritance hierarchy, independent of file-system namespace
+*/
+
+#include "../debug.h"
+#include "../dformat.h"
+#include "plugin_header.h"
+#include "item/static_stat.h"
+#include "node/node.h"
+#include "security/perm.h"
+#include "space/space_allocator.h"
+#include "disk_format/disk_format.h"
+#include "plugin.h"
+#include "../reiser4.h"
+#include "../jnode.h"
+#include "../inode.h"
+
+#include <linux/fs.h> /* for struct super_block */
+
+/*
+ * init_plugins - initialize plugin sub-system.
+ * Just call this once on reiser4 startup.
+ *
+ * Initializes the plugin sub-system. It is part of reiser4 module
+ * initialization. For each plugin of each type the init method is called
+ * and each plugin is put on the list of plugins of its type.
+ */
+int init_plugins(void)
+{
+ reiser4_plugin_type type_id;
+
+ for (type_id = 0; type_id < REISER4_PLUGIN_TYPES; ++type_id) {
+ struct reiser4_plugin_type_data *ptype;
+ int i;
+
+ ptype = &plugins[type_id];
+ assert("nikita-3508", ptype->label != NULL);
+ assert("nikita-3509", ptype->type_id == type_id);
+
+ INIT_LIST_HEAD(&ptype->plugins_list);
+/* NIKITA-FIXME-HANS: change builtin_num to some other name lacking the term
+ * builtin. */
+ for (i = 0; i < ptype->builtin_num; ++i) {
+ reiser4_plugin *plugin;
+
+ plugin = plugin_at(ptype, i);
+
+ if (plugin->h.label == NULL)
+ /* uninitialized slot encountered */
+ continue;
+ assert("nikita-3445", plugin->h.type_id == type_id);
+ plugin->h.id = i;
+ if (plugin->h.pops != NULL &&
+ plugin->h.pops->init != NULL) {
+ int result;
+
+ result = plugin->h.pops->init(plugin);
+ if (result != 0)
+ return result;
+ }
+ INIT_LIST_HEAD(&plugin->h.linkage);
+ list_add_tail(&plugin->h.linkage, &ptype->plugins_list);
+ }
+ }
+ return 0;
+}
+
+/* true if plugin type id is valid */
+int is_plugin_type_valid(reiser4_plugin_type type)
+{
+ /* "type" is unsigned, so no comparison with 0 is
+ necessary */
+ return (type < REISER4_PLUGIN_TYPES);
+}
+
+/* true if plugin id is valid */
+int is_plugin_id_valid(reiser4_plugin_type type, reiser4_plugin_id id)
+{
+ assert("nikita-1653", is_plugin_type_valid(type));
+ return id < plugins[type].builtin_num;
+}
+
+/* return plugin by its @type and @id.
+
+ Both arguments are checked for validity: this is supposed to be called
+ from user-level.
+
+NIKITA-FIXME-HANS: Do you instead mean that this checks ids created in
+user space, and passed to the filesystem by use of method files? Your
+comment really confused me on the first reading....
+
+*/
+reiser4_plugin *plugin_by_unsafe_id(reiser4_plugin_type type /* plugin type
+ * unchecked */,
+ reiser4_plugin_id id /* plugin id,
+ * unchecked */)
+{
+ if (is_plugin_type_valid(type)) {
+ if (is_plugin_id_valid(type, id))
+ return plugin_at(&plugins[type], id);
+ else
+ /* id out of bounds */
+ warning("nikita-2913",
+ "Invalid plugin id: [%i:%i]", type, id);
+ } else
+ /* type_id out of bounds */
+ warning("nikita-2914", "Invalid type_id: %i", type);
+ return NULL;
+}
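+
+/*
+ * Usage sketch (a comment only, not compiled): an id read from disk or
+ * received from user space is untrusted and is resolved through
+ * plugin_by_unsafe_id(). REISER4_NODE_PLUGIN_TYPE is used purely as an
+ * example and @id_from_disk is a stand-in variable:
+ *
+ *	reiser4_plugin *plug;
+ *
+ *	plug = plugin_by_unsafe_id(REISER4_NODE_PLUGIN_TYPE, id_from_disk);
+ *	if (plug == NULL)
+ *		return RETERR(-EINVAL);
+ */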
+
+/**
+ * save_plugin_id - store plugin id in disk format
+ * @plugin: plugin to convert
+ * @area: where to store result
+ *
+ * Puts id of @plugin in little endian format to address @area.
+ */
+int save_plugin_id(reiser4_plugin *plugin /* plugin to convert */ ,
+ d16 * area/* where to store result */)
+{
+ assert("nikita-1261", plugin != NULL);
+ assert("nikita-1262", area != NULL);
+
+ put_unaligned(cpu_to_le16(plugin->h.id), area);
+ return 0;
+}
+
+/* list of all plugins of given type */
+struct list_head *get_plugin_list(reiser4_plugin_type type)
+{
+ assert("nikita-1056", is_plugin_type_valid(type));
+ return &plugins[type].plugins_list;
+}
+
+static void update_pset_mask(reiser4_inode * info, pset_member memb)
+{
+ struct dentry *rootdir;
+ reiser4_inode *root;
+
+ assert("edward-1443", memb != PSET_FILE);
+
+ rootdir = inode_by_reiser4_inode(info)->i_sb->s_root;
+ if (rootdir != NULL) {
+ root = reiser4_inode_data(rootdir->d_inode);
+ /*
+ * if the plugin differs from the default one (the root directory's),
+ * or we are changing a plugin of the root directory itself, update
+ * plugin_mask
+ */
+ if (aset_get(info->pset, memb) !=
+ aset_get(root->pset, memb) ||
+ info == root)
+ info->plugin_mask |= (1 << memb);
+ else
+ info->plugin_mask &= ~(1 << memb);
+ }
+}
+
+/* Get specified plugin set member from parent,
+ or from fs-defaults (if no parent is given) and
+ install the result to pset of @self */
+int grab_plugin_pset(struct inode *self,
+ struct inode *ancestor,
+ pset_member memb)
+{
+ reiser4_plugin *plug;
+ reiser4_inode *info;
+ int result = 0;
+
+ /* Do not grab if initialised already. */
+ info = reiser4_inode_data(self);
+ if (aset_get(info->pset, memb) != NULL)
+ return 0;
+ if (ancestor) {
+ reiser4_inode *parent;
+
+ parent = reiser4_inode_data(ancestor);
+ plug = aset_get(parent->hset, memb) ? :
+ aset_get(parent->pset, memb);
+ } else
+ plug = get_default_plugin(memb);
+
+ result = set_plugin(&info->pset, memb, plug);
+ if (result == 0) {
+ if (!ancestor || self->i_sb->s_root->d_inode != self)
+ update_pset_mask(info, memb);
+ }
+ return result;
+}
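+
+/*
+ * Usage sketch (a comment only, not compiled): on object creation, plugin
+ * set members that were not specified explicitly are inherited from the
+ * parent directory, falling back to fs-wide defaults when no parent is
+ * given; @child and @parent stand for the new and the parent inode:
+ *
+ *	result = grab_plugin_pset(child, parent, PSET_HASH);
+ *	if (result != 0)
+ *		return result;
+ */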
+
+/* Take missing pset members from root inode */
+int finish_pset(struct inode *inode)
+{
+ reiser4_plugin *plug;
+ reiser4_inode *root;
+ reiser4_inode *info;
+ pset_member memb;
+ int result = 0;
+
+ root = reiser4_inode_data(inode->i_sb->s_root->d_inode);
+ info = reiser4_inode_data(inode);
+
+ assert("edward-1455", root != NULL);
+ assert("edward-1456", info != NULL);
+
+ /* file and directory plugins are already initialized. */
+ for (memb = PSET_DIR + 1; memb < PSET_LAST; ++memb) {
+
+ /* Do not grab if initialised already. */
+ if (aset_get(info->pset, memb) != NULL)
+ continue;
+
+ plug = aset_get(root->pset, memb);
+ result = set_plugin(&info->pset, memb, plug);
+ if (result != 0)
+ break;
+ }
+ if (result != 0) {
+ warning("nikita-3447",
+ "Cannot set up plugins for %lli",
+ (unsigned long long)
+ get_inode_oid(inode));
+ }
+ return result;
+}
+
+int force_plugin_pset(struct inode *self, pset_member memb,
+ reiser4_plugin * plug)
+{
+ reiser4_inode *info;
+ int result = 0;
+
+ if (!self->i_sb->s_root || self->i_sb->s_root->d_inode == self) {
+ /* Changing pset in the root object. */
+ return RETERR(-EINVAL);
+ }
+
+ info = reiser4_inode_data(self);
+ if (plug->h.pops != NULL && plug->h.pops->change != NULL)
+ result = plug->h.pops->change(self, plug, memb);
+ else
+ result = aset_set_unsafe(&info->pset, memb, plug);
+ if (result == 0) {
+ __u16 oldmask = info->plugin_mask;
+
+ update_pset_mask(info, memb);
+ if (oldmask != info->plugin_mask)
+ reiser4_inode_clr_flag(self, REISER4_SDLEN_KNOWN);
+ }
+ return result;
+}
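+
+/*
+ * Usage sketch (a comment only, not compiled): forcing a particular plugin
+ * onto a non-root object, e.g. one requested explicitly by the user rather
+ * than inherited; @inode and @plug stand for the target inode and an
+ * already looked-up plugin:
+ *
+ *	result = force_plugin_pset(inode, PSET_COMPRESSION, plug);
+ *	if (result != 0)
+ *		return result;
+ */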
+
+struct reiser4_plugin_type_data plugins[REISER4_PLUGIN_TYPES] = {
+ /* C90 initializers */
+ [REISER4_FILE_PLUGIN_TYPE] = {
+ .type_id = REISER4_FILE_PLUGIN_TYPE,
+ .label = "file",
+ .desc = "Object plugins",
+ .builtin_num = sizeof_array(file_plugins),
+ .builtin = file_plugins,
+ .plugins_list = {NULL, NULL},
+ .size = sizeof(file_plugin)
+ },
+ [REISER4_DIR_PLUGIN_TYPE] = {
+ .type_id = REISER4_DIR_PLUGIN_TYPE,
+ .label = "dir",
+ .desc = "Directory plugins",
+ .builtin_num = sizeof_array(dir_plugins),
+ .builtin = dir_plugins,
+ .plugins_list = {NULL, NULL},
+ .size = sizeof(dir_plugin)
+ },
+ [REISER4_HASH_PLUGIN_TYPE] = {
+ .type_id = REISER4_HASH_PLUGIN_TYPE,
+ .label = "hash",
+ .desc = "Directory hashes",
+ .builtin_num = sizeof_array(hash_plugins),
+ .builtin = hash_plugins,
+ .plugins_list = {NULL, NULL},
+ .size = sizeof(hash_plugin)
+ },
+ [REISER4_FIBRATION_PLUGIN_TYPE] = {
+ .type_id =
+ REISER4_FIBRATION_PLUGIN_TYPE,
+ .label = "fibration",
+ .desc = "Directory fibrations",
+ .builtin_num = sizeof_array(fibration_plugins),
+ .builtin = fibration_plugins,
+ .plugins_list = {NULL, NULL},
+ .size = sizeof(fibration_plugin)
+ },
+ [REISER4_CIPHER_PLUGIN_TYPE] = {
+ .type_id = REISER4_CIPHER_PLUGIN_TYPE,
+ .label = "cipher",
+ .desc = "Cipher plugins",
+ .builtin_num = sizeof_array(cipher_plugins),
+ .builtin = cipher_plugins,
+ .plugins_list = {NULL, NULL},
+ .size = sizeof(cipher_plugin)
+ },
+ [REISER4_DIGEST_PLUGIN_TYPE] = {
+ .type_id = REISER4_DIGEST_PLUGIN_TYPE,
+ .label = "digest",
+ .desc = "Digest plugins",
+ .builtin_num = sizeof_array(digest_plugins),
+ .builtin = digest_plugins,
+ .plugins_list = {NULL, NULL},
+ .size = sizeof(digest_plugin)
+ },
+ [REISER4_COMPRESSION_PLUGIN_TYPE] = {
+ .type_id = REISER4_COMPRESSION_PLUGIN_TYPE,
+ .label = "compression",
+ .desc = "Compression plugins",
+ .builtin_num = sizeof_array(compression_plugins),
+ .builtin = compression_plugins,
+ .plugins_list = {NULL, NULL},
+ .size = sizeof(compression_plugin)
+ },
+ [REISER4_FORMATTING_PLUGIN_TYPE] = {
+ .type_id = REISER4_FORMATTING_PLUGIN_TYPE,
+ .label = "formatting",
+ .desc = "Tail inlining policies",
+ .builtin_num = sizeof_array(formatting_plugins),
+ .builtin = formatting_plugins,
+ .plugins_list = {NULL, NULL},
+ .size = sizeof(formatting_plugin)
+ },
+ [REISER4_PERM_PLUGIN_TYPE] = {
+ .type_id = REISER4_PERM_PLUGIN_TYPE,
+ .label = "perm",
+ .desc = "Permission checks",
+ .builtin_num = sizeof_array(perm_plugins),
+ .builtin = perm_plugins,
+ .plugins_list = {NULL, NULL},
+ .size = sizeof(perm_plugin)
+ },
+ [REISER4_ITEM_PLUGIN_TYPE] = {
+ .type_id = REISER4_ITEM_PLUGIN_TYPE,
+ .label = "item",
+ .desc = "Item handlers",
+ .builtin_num = sizeof_array(item_plugins),
+ .builtin = item_plugins,
+ .plugins_list = {NULL, NULL},
+ .size = sizeof(item_plugin)
+ },
+ [REISER4_NODE_PLUGIN_TYPE] = {
+ .type_id = REISER4_NODE_PLUGIN_TYPE,
+ .label = "node",
+ .desc = "node layout handlers",
+ .builtin_num = sizeof_array(node_plugins),
+ .builtin = node_plugins,
+ .plugins_list = {NULL, NULL},
+ .size = sizeof(node_plugin)
+ },
+ [REISER4_SD_EXT_PLUGIN_TYPE] = {
+ .type_id = REISER4_SD_EXT_PLUGIN_TYPE,
+ .label = "sd_ext",
+ .desc = "Parts of stat-data",
+ .builtin_num = sizeof_array(sd_ext_plugins),
+ .builtin = sd_ext_plugins,
+ .plugins_list = {NULL, NULL},
+ .size = sizeof(sd_ext_plugin)
+ },
+ [REISER4_FORMAT_PLUGIN_TYPE] = {
+ .type_id = REISER4_FORMAT_PLUGIN_TYPE,
+ .label = "disk_layout",
+ .desc = "defines filesystem on disk layout",
+ .builtin_num = sizeof_array(format_plugins),
+ .builtin = format_plugins,
+ .plugins_list = {NULL, NULL},
+ .size = sizeof(disk_format_plugin)
+ },
+ [REISER4_JNODE_PLUGIN_TYPE] = {
+ .type_id = REISER4_JNODE_PLUGIN_TYPE,
+ .label = "jnode",
+ .desc = "defines kind of jnode",
+ .builtin_num = sizeof_array(jnode_plugins),
+ .builtin = jnode_plugins,
+ .plugins_list = {NULL, NULL},
+ .size = sizeof(jnode_plugin)
+ },
+ [REISER4_COMPRESSION_MODE_PLUGIN_TYPE] = {
+ .type_id = REISER4_COMPRESSION_MODE_PLUGIN_TYPE,
+ .label = "compression_mode",
+ .desc = "Defines compression mode",
+ .builtin_num = sizeof_array(compression_mode_plugins),
+ .builtin = compression_mode_plugins,
+ .plugins_list = {NULL, NULL},
+ .size = sizeof(compression_mode_plugin)
+ },
+ [REISER4_CLUSTER_PLUGIN_TYPE] = {
+ .type_id = REISER4_CLUSTER_PLUGIN_TYPE,
+ .label = "cluster",
+ .desc = "Defines cluster size",
+ .builtin_num = sizeof_array(cluster_plugins),
+ .builtin = cluster_plugins,
+ .plugins_list = {NULL, NULL},
+ .size = sizeof(cluster_plugin)
+ },
+ [REISER4_TXMOD_PLUGIN_TYPE] = {
+ .type_id = REISER4_TXMOD_PLUGIN_TYPE,
+ .label = "txmod",
+ .desc = "Defines transaction model",
+ .builtin_num = sizeof_array(txmod_plugins),
+ .builtin = txmod_plugins,
+ .plugins_list = {NULL, NULL},
+ .size = sizeof(txmod_plugin)
+ },
+ [REISER4_DISTRIBUTION_PLUGIN_TYPE] = {
+ .type_id = REISER4_DISTRIBUTION_PLUGIN_TYPE,
+ .label = "distrib",
+ .desc = "Defines distribution of named objects",
+ .builtin_num = sizeof_array(distribution_plugins),
+ .builtin = distribution_plugins,
+ .plugins_list = {NULL, NULL},
+ .size = sizeof(distribution_plugin)
+ },
+ [REISER4_VOLUME_PLUGIN_TYPE] = {
+ .type_id = REISER4_VOLUME_PLUGIN_TYPE,
+ .label = "volume",
+ .desc = "Manages logical volumes",
+ .builtin_num = sizeof_array(volume_plugins),
+ .builtin = volume_plugins,
+ .plugins_list = {NULL, NULL},
+ .size = sizeof(volume_plugin)
+ }
+};
+
+/*
+ * Local variables:
+ * c-indentation-style: "K&R"
+ * mode-name: "LC"
+ * c-basic-offset: 8
+ * tab-width: 8
+ * fill-column: 120
+ * End:
+ */
diff -urN --no-dereference linux-5.10.2.orig/fs/reiser4/plugin/plugin.h linux-5.10.2/fs/reiser4/plugin/plugin.h
--- linux-5.10.2.orig/fs/reiser4/plugin/plugin.h 1970-01-01 01:00:00.000000000 +0100
+++ linux-5.10.2/fs/reiser4/plugin/plugin.h 2020-12-23 16:07:46.131813319 +0100
@@ -0,0 +1,1194 @@
+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
+ * reiser4/README */
+
+/* Basic plugin data-types.
+ see fs/reiser4/plugin/plugin.c for details */
+
+#if !defined(__FS_REISER4_PLUGIN_TYPES_H__)
+#define __FS_REISER4_PLUGIN_TYPES_H__
+
+#include "../forward.h"
+#include "../debug.h"
+#include "../dformat.h"
+#include "../key.h"
+#include "../ioctl.h"
+#include "compress/compress.h"
+#include "crypto/cipher.h"
+#include "plugin_header.h"
+#include "item/static_stat.h"
+#include "item/internal.h"
+#include "item/sde.h"
+#include "item/cde.h"
+#include "item/item.h"
+#include "node/node.h"
+#include "node/node41.h"
+#include "security/perm.h"
+#include "fibration.h"
+
+#include "space/bitmap.h"
+#include "space/space_allocator.h"
+
+#include "disk_format/disk_format40.h"
+#include "disk_format/disk_format.h"
+
+#include <linux/fs.h> /* for struct super_block, address_space */
+#include <linux/mm.h> /* for struct page */
+#include <linux/buffer_head.h> /* for struct buffer_head */
+#include <linux/dcache.h> /* for struct dentry */
+#include <linux/types.h>
+#include <linux/crypto.h>
+
+typedef struct reiser4_object_on_wire reiser4_object_on_wire;
+
+/*
+ * File plugin. Defines the set of methods that file plugins implement, some
+ * of which are optional.
+ *
+ * A file plugin offers the caller an interface for IO (writing to and/or
+ * reading from) what the caller sees as one sequence of bytes. An IO to it
+ * may affect more than one physical sequence of bytes, or no physical sequence
+ * of bytes, it may affect sequences of bytes offered by other file plugins to
+ * the semantic layer, and the file plugin may invoke other plugins and
+ * delegate work to them, but its interface is structured for offering the
+ * caller the ability to read and/or write what the caller sees as being a
+ * single sequence of bytes.
+ *
+ * The file plugin must present a sequence of bytes to the caller, but it does
+ * not necessarily have to store a sequence of bytes, it does not necessarily
+ * have to support efficient tree traversal to any offset in the sequence of
+ * bytes (tail and extent items, whose keys contain offsets, do however provide
+ * efficient non-sequential lookup of any offset in the sequence of bytes).
+ *
+ * Directory plugins provide methods for selecting file plugins by resolving a
+ * name for them.
+ *
+ * The functionality other filesystems call an attribute, and rigidly tie
+ * together, we decompose into orthogonal selectable features of files. Using
+ * the terminology we will define next, an attribute is a perhaps constrained,
+ * perhaps static length, file whose parent has a uni-count-intra-link to it,
+ * which might be grandparent-major-packed, and whose parent has a deletion
+ * method that deletes it.
+ *
+ * File plugins can implement constraints.
+ *
+ * Files can be of variable length (e.g. regular unix files), or of static
+ * length (e.g. static sized attributes).
+ *
+ * An object may have many sequences of bytes, and many file plugins, but, it
+ * has exactly one objectid. It is usually desirable that an object has a
+ * deletion method which deletes every item with that objectid. Items cannot
+ * in general be found by just their objectids. This means that an object must
+ * have either a method built into its deletion plugin method for knowing what
+ * items need to be deleted, or links stored with the object that provide the
+ * plugin with a method for finding those items. Deleting a file within an
+ * object may or may not have the effect of deleting the entire object,
+ * depending on the file plugin's deletion method.
+ *
+ * LINK TAXONOMY:
+ *
+ * Many objects have a reference count, and when the reference count reaches 0
+ * the object's deletion method is invoked. Some links embody a reference
+ * count increase ("countlinks"), and others do not ("nocountlinks").
+ *
+ * Some links are bi-directional links ("bilinks"), and some are
+ * uni-directional("unilinks").
+ *
+ * Some links are between parts of the same object ("intralinks"), and some are
+ * between different objects ("interlinks").
+ *
+ * PACKING TAXONOMY:
+ *
+ * Some items of an object are stored with a major packing locality based on
+ * their object's objectid (e.g. unix directory items in plan A), and these are
+ * called "self-major-packed".
+ *
+ * Some items of an object are stored with a major packing locality based on
+ * their semantic parent object's objectid (e.g. unix file bodies in plan A),
+ * and these are called "parent-major-packed".
+ *
+ * Some items of an object are stored with a major packing locality based on
+ * their semantic grandparent, and these are called "grandparent-major-packed".
+ * Now carefully notice that we run into trouble with key length if we have to
+ * store a 8 byte major+minor grandparent based packing locality, an 8 byte
+ * parent objectid, an 8 byte attribute objectid, and an 8 byte offset, all in
+ * a 24 byte key. One of these fields must be sacrificed if an item is to be
+ * grandparent-major-packed, and which to sacrifice is left to the item author
+ * choosing to make the item grandparent-major-packed. You cannot make tail
+ * items and extent items grandparent-major-packed, though you could make them
+ * self-major-packed (usually they are parent-major-packed).
+ *
+ * In the case of ACLs (which are composed of fixed length ACEs which consist
+ * of {subject-type, subject, and permission bitmask} triples), it makes sense
+ * to not have an offset field in the ACE item key, and to allow duplicate keys
+ * for ACEs. Thus, the set of ACES for a given file is found by looking for a
+ * key consisting of the objectid of the grandparent (thus grouping all ACLs in
+ * a directory together), the minor packing locality of ACE, the objectid of
+ * the file, and 0.
+ *
+ * IO involves moving data from one location to another, which means that two
+ * locations must be specified, source and destination.
+ *
+ * This source and destination can be in the filesystem, or they can be a
+ * pointer in the user process address space plus a byte count.
+ *
+ * If both source and destination are in the filesystem, then at least one of
+ * them must be representable as a pure stream of bytes (which we call a flow,
+ * and define as a struct containing a key, a data pointer, and a length).
+ * This may mean converting one of them into a flow. We provide a generic
+ * cast_into_flow() method, which will work for any plugin supporting
+ * read_flow(), though it is inefficiently implemented in that it temporarily
+ * stores the flow in a buffer. (Question: what to do with huge flows that
+ * cannot fit into memory? Answer: we must not convert them all at once.)
+ *
+ * Performing a write requires resolving the write request into a flow defining
+ * the source, and a method that performs the write, and a key that defines
+ * where in the tree the write is to go.
+ *
+ * Performing a read requires resolving the read request into a flow defining
+ * the target, and a method that performs the read, and a key that defines
+ * where in the tree the read is to come from.
+ *
+ * There will exist file plugins which have no pluginid stored on the disk for
+ * them, and which are only invoked by other plugins.
+ */
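+
+/*
+ * Illustrative sketch (not the real definition, which lives elsewhere in
+ * the reiser4 headers): the comment above describes a flow as a key plus a
+ * data pointer plus a length, i.e. conceptually something like:
+ *
+ *	struct flow_sketch {
+ *		reiser4_key key;	// where in the tree the bytes belong
+ *		char *data;		// source or destination buffer
+ *		loff_t length;		// bytes remaining in the flow
+ *	};
+ */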
+
+/*
+ * This should be incremented in every release which adds one
+ * or more new plugins.
+ * NOTE: Make sure that the respective macro is also incremented in
+ * the new release of reiser4progs.
+ */
+#define PLUGIN_LIBRARY_VERSION 3
+
+ /* enumeration of fields within plugin_set */
+typedef enum {
+ PSET_FILE,
+ PSET_DIR, /* PSET_FILE and PSET_DIR should be first
+ * elements: inode.c:read_inode() depends on
+ * this. */
+ PSET_PERM,
+ PSET_FORMATTING,
+ PSET_HASH,
+ PSET_FIBRATION,
+ PSET_SD,
+ PSET_DIR_ITEM,
+ PSET_CIPHER,
+ PSET_DIGEST,
+ PSET_COMPRESSION,
+ PSET_COMPRESSION_MODE,
+ PSET_CLUSTER,
+ PSET_CREATE,
+ PSET_LAST
+} pset_member;
+
+/* builtin file-plugins */
+typedef enum {
+ /* regular file */
+ UNIX_FILE_PLUGIN_ID,
+ /* directory */
+ DIRECTORY_FILE_PLUGIN_ID,
+ /* symlink */
+ SYMLINK_FILE_PLUGIN_ID,
+ /* for objects completely handled by the VFS: fifos, devices,
+ sockets */
+ SPECIAL_FILE_PLUGIN_ID,
+ /* regular cryptcompress file */
+ CRYPTCOMPRESS_FILE_PLUGIN_ID,
+ /* regular striped file */
+ STRIPED_FILE_PLUGIN_ID,
+ /* number of file plugins. Used as size of arrays to hold
+ file plugins. */
+ LAST_FILE_PLUGIN_ID
+} reiser4_file_id;
+
+typedef struct file_plugin {
+
+ /* generic fields */
+ plugin_header h;
+
+ /* VFS methods */
+ struct inode_operations * inode_ops;
+ struct file_operations * file_ops;
+ struct address_space_operations * as_ops;
+ /**
+ * Private methods. These are optional. If used they will allow you
+ * to minimize the amount of code needed to implement a deviation
+ * from some other method that also uses them.
+ */
+ /*
+ * private inode_ops
+ */
+ int (*setattr)(struct dentry *, struct iattr *);
+ /*
+ * private file_ops
+ */
+ /* do whatever is necessary to do when object is opened */
+ int (*open) (struct inode *inode, struct file *file);
+ ssize_t (*read) (struct file *, char __user *buf, size_t read_amount,
+ loff_t *off);
+ /* write as many bytes as possible out of the nominated @write_amount
+ * before plugin scheduling occurs. Save the scheduling state
+ * in @cont */
+ ssize_t (*write) (struct file *, const char __user *buf,
+ size_t write_amount, loff_t * off,
+ struct dispatch_context * cont);
+ int (*ioctl) (struct file *filp, unsigned int cmd, unsigned long arg);
+ int (*mmap) (struct file *, struct vm_area_struct *);
+ int (*release) (struct inode *, struct file *);
+ /*
+ * private a_ops
+ */
+ int (*readpage) (struct file *file, struct page *page);
+ int (*readpages)(struct file *file, struct address_space *mapping,
+ struct list_head *pages, unsigned nr_pages);
+ int (*writepages)(struct address_space *mapping,
+ struct writeback_control *wbc);
+ int (*write_begin)(struct file *file, struct page *page,
+ loff_t pos, unsigned len, void **fsdata);
+ int (*write_end)(struct file *file, struct page *page,
+ loff_t pos, unsigned copied, void *fsdata);
+ sector_t (*bmap) (struct address_space * mapping, sector_t lblock);
+ /* other private methods */
+ /* save inode cached stat-data onto disk. It was called
+ reiserfs_update_sd() in 3.x */
+ int (*write_sd_by_inode) (struct inode *, oid_t *oid);
+
+ /* Build file body key by inode and offset */
+ int (*build_body_key) (struct inode *, loff_t off, reiser4_key *);
+
+ /* NIKITA-FIXME-HANS: this comment is not as clear to others as you
+ * think.... */
+ /*
+ * set the plugin for a file. Called during file creation in creat()
+ * but not reiser4() unless an inode already exists for the file.
+ */
+ int (*set_plug_in_inode) (struct inode *inode, struct inode *parent,
+ reiser4_object_create_data *);
+
+ /* NIKITA-FIXME-HANS: comment and name seem to say different things,
+ * are you setting up the object itself also or just adjusting the
+ * parent?.... */
+ /* set up plugins for new @object created in @parent. @root is root
+ directory. */
+ int (*adjust_to_parent) (struct inode *object, struct inode *parent,
+ struct inode *root);
+ /*
+ * this does whatever is necessary when an object is created. For
+ * instance, for unix files the stat data is inserted. It is supposed
+ * to be called by the create method of struct inode_operations.
+ */
+ int (*create_object) (struct inode *object, struct inode *parent,
+ reiser4_object_create_data *, oid_t *oid);
+ /*
+ * this method should check REISER4_NO_SD and set REISER4_NO_SD on
+ * success. Deletion of an object usually includes removal of items
+ * building file body (for directories this is removal of "." and "..")
+ * and removal of stat-data item.
+ */
+ int (*delete_object) (struct inode *);
+
+ /* add link from @parent to @object */
+ int (*add_link) (struct inode *object, struct inode *parent);
+
+ /* remove link from @parent to @object */
+ int (*rem_link) (struct inode *object, struct inode *parent);
+
+ /*
+ * return true if item addressed by @coord belongs to @inode. This is
+ * used by read/write to properly slice flow into items in presence of
+ * multiple key assignment policies, because items of a file are not
+ * necessarily contiguous in a key space, for example, in a plan-b.
+ */
+ int (*owns_item) (const struct inode *, const coord_t *);
+
+	/* checks whether yet another hard link to this object can be
+	   added */
+ int (*can_add_link) (const struct inode *);
+
+ /* checks whether hard links to this object can be removed */
+ int (*can_rem_link) (const struct inode *);
+
+	/* currently non-empty only for DIRECTORY_FILE_PLUGIN_ID. It calls
+	   detach of the directory plugin to remove ".." */
+ int (*detach) (struct inode *child, struct inode *parent);
+
+	/* called when @child was just looked up in the @parent. It is
+	   non-empty only for DIRECTORY_FILE_PLUGIN_ID, where it calls attach
+	   of the directory plugin */
+ int (*bind) (struct inode *child, struct inode *parent);
+
+ /* process safe-link during mount */
+ int (*safelink) (struct inode *object, reiser4_safe_link_t link,
+ __u64 value);
+
+	/* A set of estimate methods for file operations */
+ struct {
+ reiser4_block_nr(*create) (const struct inode *);
+ reiser4_block_nr(*update) (const struct inode *);
+ reiser4_block_nr(*unlink) (const struct inode *,
+ const struct inode *);
+ } estimate;
+
+ /*
+	 * the reiser4-specific part of the inode has a union of structures
+	 * which are specific to a plugin. This method is called when an inode
+	 * is read (read_inode) and when a file is created (common_create_child)
+	 * so that the file plugin can initialize its inode data
+ */
+ void (*init_inode_data) (struct inode *, reiser4_object_create_data *,
+ const reiser4_key *, int);
+
+ /*
+ * This method performs progressive deletion of items and whole nodes
+ * from right to left.
+ *
+ * @tap: the point deletion process begins from,
+ * @from_key: the beginning of the deleted key range,
+ * @to_key: the end of the deleted key range,
+ * @smallest_removed: the smallest removed key,
+ *
+	 * @return: 0 on success, error code otherwise; -E_REPEAT means that
+	 * a long cut_tree operation was interrupted to allow atom commit.
+ */
+ int (*cut_tree_worker) (tap_t *, const reiser4_key * from_key,
+ const reiser4_key * to_key,
+ reiser4_key * smallest_removed, struct inode *,
+ int, int *);
+
+ /* called from ->destroy_inode() */
+ void (*destroy_inode) (struct inode *);
+ /*
+ * Migrate data blocks of a regular file specified by @inode.
+ * If @dst_id is not NULL, then migrate all the blocks to brick with
+ * @dst_id. Otherwise, migrate in accordance with current distribution
+ * table.
+ */
+ int (*migrate)(struct inode *object, u64 *dst_id);
+ /*
+	 * methods to serialize object identity. This is used, for example, by
+ * reiser4_{en,de}code_fh().
+ */
+ struct {
+ /* store object's identity at @area */
+ char *(*write) (struct inode *inode, char *area);
+ /* parse object from wire to the @obj */
+ char *(*read) (char *area, reiser4_object_on_wire * obj);
+ /* given object identity in @obj, find or create its dentry */
+ struct dentry *(*get) (struct super_block *s,
+ reiser4_object_on_wire * obj);
+ /* how many bytes ->wire.write() consumes */
+ int (*size) (struct inode *inode);
+		/* finish with object identity */
+ void (*done) (reiser4_object_on_wire * obj);
+ } wire;
+} file_plugin;
+
+extern file_plugin file_plugins[LAST_FILE_PLUGIN_ID];
+
+struct reiser4_object_on_wire {
+ file_plugin *plugin;
+ union {
+ struct {
+ obj_key_id key_id;
+ } std;
+ void *generic;
+ } u;
+};
+
+/* builtin dir-plugins */
+typedef enum {
+ HASHED_DIR_PLUGIN_ID,
+ SEEKABLE_HASHED_DIR_PLUGIN_ID,
+ LAST_DIR_ID
+} reiser4_dir_id;
+
+typedef struct dir_plugin {
+ /* generic fields */
+ plugin_header h;
+
+ struct inode_operations * inode_ops;
+ struct file_operations * file_ops;
+ struct address_space_operations * as_ops;
+
+ /*
+ * private methods: These are optional. If used they will allow you to
+ * minimize the amount of code needed to implement a deviation from
+ * some other method that uses them. You could logically argue that
+ * they should be a separate type of plugin.
+ */
+
+ struct dentry *(*get_parent) (struct inode *childdir);
+
+ /*
+ * check whether "name" is acceptable name to be inserted into this
+ * object. Optionally implemented by directory-like objects. Can check
+ * for maximal length, reserved symbols etc
+ */
+ int (*is_name_acceptable) (const struct inode *inode, const char *name,
+ int len);
+
+ void (*build_entry_key) (const struct inode *dir /* directory where
+ * entry is (or will
+ * be) in.*/ ,
+ const struct qstr *name /* name of file
+ * referenced by this
+ * entry */ ,
+ reiser4_key * result /* resulting key of
+ * directory entry */ );
+ int (*build_readdir_key) (struct file *dir, reiser4_key * result);
+ int (*add_entry) (struct inode *object, struct dentry *where,
+ reiser4_object_create_data * data,
+ reiser4_dir_entry_desc * entry);
+ int (*rem_entry) (struct inode *object, struct dentry *where,
+ reiser4_dir_entry_desc * entry);
+
+ /*
+ * initialize directory structure for newly created object. For normal
+ * unix directories, insert dot and dotdot.
+ */
+ int (*init) (struct inode *object, struct inode *parent,
+ reiser4_object_create_data * data);
+
+ /* destroy directory */
+ int (*done) (struct inode *child);
+
+ /* called when @subdir was just looked up in the @dir */
+ int (*attach) (struct inode *subdir, struct inode *dir);
+ int (*detach) (struct inode *subdir, struct inode *dir);
+
+ struct {
+ reiser4_block_nr(*add_entry) (const struct inode *);
+ reiser4_block_nr(*rem_entry) (const struct inode *);
+ reiser4_block_nr(*unlink) (const struct inode *,
+ const struct inode *);
+ } estimate;
+} dir_plugin;
+
+extern dir_plugin dir_plugins[LAST_DIR_ID];
+
+typedef struct formatting_plugin {
+ /* generic fields */
+ plugin_header h;
+ /* returns non-zero iff file's tail has to be stored
+ in a direct item. */
+ int (*have_tail) (const struct inode *inode, loff_t size);
+} formatting_plugin;
+
+/**
+ * Plugins of this interface implement different transaction models.
+ * A transaction model is a high-level block allocator which assigns block
+ * numbers to dirty nodes and thereby decides how individual dirty
+ * nodes of an atom will be committed.
+ */
+typedef struct txmod_plugin {
+ /* generic fields */
+ plugin_header h;
+ /**
+ * Allocate a formatted node in the FORWARD PARENT-FIRST context.
+ * It can check and re-allocate locality of the formatted node in
+ * the tree.
+ * Was allocate_znode_loaded().
+ */
+ int (*forward_alloc_formatted)(znode *node, const coord_t *parent_coord,
+ flush_pos_t *pos);
+ /**
+ * Check if a formatted node should be re-allocated in the
+ * REVERSE PARENT-FIRST context.
+ * If it should be re-allocated, then return 1. Otherwise, return 0.
+ * Was reverse_relocate_test().
+ */
+ int (*reverse_should_realloc_formatted)(jnode * node,
+ const coord_t *parent_coord,
+ flush_pos_t *pos);
+ /**
+ * allocate blocks in the FORWARD PARENT-FIRST context
+ * for unformatted nodes.
+ *
+	 * This is called by handle_pos_on_twig to process the extent unit
+	 * that flush_pos->coord is set to. It prepares for flushing a
+	 * sequence of not-flushprepped nodes (a slum). It assumes that the
+	 * slum starts at the flush_pos->pos_in_unit position within the extent
+ */
+ int (*forward_alloc_unformatted)(flush_pos_t *flush_pos); //was reiser4_alloc_extent
+ /**
+	 * allocate blocks for unformatted nodes in squeeze_right_twig().
+ * @coord is set to extent unit
+ */
+ squeeze_result (*squeeze_alloc_unformatted)(znode *left,
+ const coord_t *coord,
+ flush_pos_t *flush_pos,
+ reiser4_key *stop_key); // was_squalloc_extent
+} txmod_plugin;
+
+/*
+ * operations on an array of abstract buckets
+ */
+struct bucket_ops {
+ /* Get capacity of a bucket with serial number @idx
+ in the array @buckets */
+ u64 (*cap_at)(bucket_t *buckets, u64 idx);
+ /* Get apx of specified @bucket */
+ void *(*apx_of)(bucket_t bucket);
+ /* Get apx of a bucket with serial number @idx
+ in the array @buckets */
+ void *(*apx_at)(bucket_t *buckets, u64 idx);
+ /* Set apx @apx of a bucket with serial number @idx
+ in the array @buckets*/
+ void (*apx_set_at)(bucket_t *buckets, u64 idx, void *apx);
+ /* Get a pointer to apx length of a bucket with
+ serial number @idx in the array @buckets */
+ u64 *(*apx_lenp_at)(bucket_t *buckets, u64 idx);
+ /* translate bucket index in the array of abstract buckets
+ to bucket internal ID */
+ u64 (*idx2id)(u32 idx);
+ /* translate bucket internal ID to bucket index in the array
+ of abstract buckets */
+ u32 (*id2idx)(u64 id);
+ /* create array of abstract buckets */
+ bucket_t *(*create_buckets)(void);
+ /* release array of abstract buckets */
+ void (*free_buckets)(bucket_t *vec);
+ /* insert a bucket @new into array of abstract buckets @vec
+ at position pos */
+ bucket_t *(*insert_bucket)(bucket_t *vec, bucket_t new, u32 numb, u32 pos);
+ /* remove a bucket located at position @pos in the array of
+ abstract buckets @vec */
+ bucket_t *(*remove_bucket)(bucket_t *vec, u32 numb, u32 pos);
+ /* return space currently occupied in the abstract array of buckets */
+ u64 (*space_occupied)(void);
+};
+
+struct dist_regular_ops {
+ /* initialize distribution context */
+ int (*init)(reiser4_dcx *rdcx, void **tab, int nums_bits);
+
+ /* release distribution context */
+ void (*done)(void **tab);
+
+ /* Mapping. For each word @str of length @len it calculates and
+ returns internal ID of a bucket from a set of buckets, which
+ possess abstract capacities. The set of buckets is defined by
+	   a configuration pointed out by @rdcx. While calculating, that
+	   configuration should be protected by some means (e.g. by rcu)
+	   from being destroyed by operations like adding/removing a
+	   bucket, etc. */
+ u64 (*lookup)(reiser4_dcx *rdcx, const struct inode *inode,
+ const char *str, int len, u32 seed, void *tab);
+
+ void (*replace)(reiser4_dcx *rdcx, void **target);
+ void (*free)(void *tab);
+};
+
+struct dist_volume_ops {
+ /* Initialize operation context */
+ int (*init)(void **tab, u64 num_buckets,
+ int num_sgs_bits, reiser4_dcx *rdcx);
+ /* Release operation context */
+ void (*done)(reiser4_dcx *rdcx);
+ /* Increase array capacity.
+ If @new is not NULL, then insert bucket @new at the position
+ @target_pos in the array. Otherwise, increase capacity of the
+ bucket located at that position */
+ int (*inc)(reiser4_dcx *rdcx, const void *tab,
+ u64 target_pos, bucket_t new);
+ /* Decrease array capacity.
+ If @old is not NULL, then remove bucket @old.
+ Otherwise, decrease capacity of the bucket located at position
+ @target_pos */
+ int (*dec)(reiser4_dcx *rdcx, const void *tab,
+ u64 target_pos, bucket_t old);
+ /* Increase current limit for number of buckets in array */
+ int (*spl)(reiser4_dcx *rdcx, const void *tab, u32 fact_bits);
+ /* Pack configuration for its storing on disk */
+ void (*pack)(reiser4_dcx *rdcx, char *to, u64 src_off, u64 count);
+ /* Extract configuration from disk */
+ void (*unpack)(reiser4_dcx *rdcx, void *tab,
+ char *from, u64 dst_off, u64 count);
+ /* Print configuration */
+ void (*dump)(reiser4_dcx *rdcx, void *tab,
+ char *to, u64 offset, u32 size);
+};
+
+typedef struct distribution_plugin {
+ /* generic fields */
+ plugin_header h;
+ u32 seg_bits; /* logarithm of segment size */
+ struct dist_regular_ops r;
+ struct dist_volume_ops v;
+} distribution_plugin;
+
+typedef struct volume_plugin {
+ /* generic fields */
+ plugin_header h;
+
+ /* Return meta-data brick internal ID */
+ u64 (*meta_subvol_id)(void);
+
+	/* Assign a target brick where a chunk of data, defined by @inode
+	   and @offset, should be stored. Returns the internal ID of the
+	   target brick in the volume. Defines the regular data distribution
+	   policy on the logical volume with configuration @conf */
+ u64 (*calc_brick)(lv_conf *conf, const struct inode *inode,
+ loff_t offset);
+ /* Find out, on which brick an extent of data blocks, defined by
+ @coord, is stored. Return internal ID of the found brick in the
+ volume */
+ u64 (*find_brick)(const coord_t *coord);
+	/* Load the portion of the volume configuration contained
+	   in brick @subv. Normally called at mount time */
+	int (*load_volume)(reiser4_subvol *subv);
+	/* Release resources associated with logical volume @vol.
+	   Normally called at unmount time */
+ void (*done_volume)(reiser4_volume *vol);
+ /* Init logical volume @vol after loading its system info
+ from all its bricks */
+ int (*init_volume)(struct super_block *sb, reiser4_volume *vol);
+ /* Change data capacity of @brick to new @value */
+ int (*resize_brick)(reiser4_volume *vol, reiser4_subvol *brick,
+ long long value, int *need_balance);
+ /* Add @new brick to logical volume @vol */
+ int (*add_brick)(reiser4_volume *vol, reiser4_subvol *new);
+
+ /* Start brick removal. Build a new volume configuration, which
+ doesn't include @victim and move all data from the @victim to
+ other bricks of the volume @vol */
+ int (*remove_brick)(reiser4_volume *vol, reiser4_subvol *victim);
+
+ /* End brick removal. Release resources associated with the brick
+ @victim scheduled for removal. Should be called after successful
+ volume rebalancing, which moves out all data from @victim to
+ other bricks of the volume @vol */
+ int (*remove_brick_tail)(reiser4_volume *vol, reiser4_subvol *victim);
+
+ /* Print brick info */
+ int (*print_brick)(struct super_block *sb,
+ struct reiser4_vol_op_args *args);
+ /* Print volume info */
+ int (*print_volume)(struct super_block *sb,
+ struct reiser4_vol_op_args *args);
+ /* Increase current limit for number of bricks in a volume */
+ int (*scale_volume)(struct super_block *sb, unsigned factor_bits);
+ /*
+ * Migrate all data blocks of a regular file to a brick with
+	 * serial number @dst_idx (as it is visible to the user) */
+ int (*migrate_file)(struct inode *inode, u64 dst_idx);
+ /*
+ * Migrate data blocks of a logical volume in accordance with
+ * a distribution policy defined by volume configuration and
+ * control @flags
+ */
+ int (*balance_volume)(struct super_block *super, u32 flags);
+ struct bucket_ops bucket_ops;
+} volume_plugin;
+
+typedef struct hash_plugin {
+ /* generic fields */
+ plugin_header h;
+ /* computes hash of the given name */
+ __u64(*hash) (const unsigned char *name, int len);
+} hash_plugin;
+
+typedef struct cipher_plugin {
+ /* generic fields */
+ plugin_header h;
+ struct crypto_blkcipher * (*alloc) (void);
+ void (*free) (struct crypto_blkcipher *tfm);
+ /* Offset translator. For each offset this returns (k * offset), where
+ k (k >= 1) is an expansion factor of the cipher algorithm.
+ For all symmetric algorithms k == 1. For asymmetric algorithms (which
+ inflate data) offset translation guarantees that all disk cluster's
+	   units will have keys smaller than the next cluster's ones.
+ */
+ loff_t(*scale) (struct inode *inode, size_t blocksize, loff_t src);
+	/* Cipher algorithms can accept data only in chunks of cipher block
+	   size. This method aligns any flow up to cipher block size when
+ we pass it to cipher algorithm. To align means to append padding of
+ special format specific to the cipher algorithm */
+ int (*align_stream) (__u8 *tail, int clust_size, int blocksize);
+ /* low-level key manager (check, install, etc..) */
+ int (*setkey) (struct crypto_tfm *tfm, const __u8 *key,
+ unsigned int keylen);
+ /* main text processing procedures */
+ void (*encrypt) (__u32 *expkey, __u8 *dst, const __u8 *src);
+ void (*decrypt) (__u32 *expkey, __u8 *dst, const __u8 *src);
+} cipher_plugin;
+
+typedef struct digest_plugin {
+ /* generic fields */
+ plugin_header h;
+ /* fingerprint size in bytes */
+ int fipsize;
+ struct crypto_hash * (*alloc) (void);
+ void (*free) (struct crypto_hash *tfm);
+} digest_plugin;
+
+typedef struct compression_plugin {
+ /* generic fields */
+ plugin_header h;
+ int (*init) (void);
+	/* the maximum number of bytes by which the size of the "compressed"
+	 * data can exceed that of the uncompressed data. */
+ int (*overrun) (unsigned src_len);
+ coa_t(*alloc) (tfm_action act);
+ void (*free) (coa_t coa, tfm_action act);
+ /* minimal size of the flow we still try to compress */
+ int (*min_size_deflate) (void);
+ __u32(*checksum) (char *data, __u32 length);
+ /* main transform procedures */
+ void (*compress) (coa_t coa, __u8 *src_first, size_t src_len,
+ __u8 *dst_first, size_t *dst_len);
+ void (*decompress) (coa_t coa, __u8 *src_first, size_t src_len,
+ __u8 *dst_first, size_t *dst_len);
+} compression_plugin;
+
+typedef struct compression_mode_plugin {
+ /* generic fields */
+ plugin_header h;
+ /* this is called when estimating compressibility
+ of a logical cluster by its content */
+ int (*should_deflate) (struct inode *inode, cloff_t index);
+ /* this is called when results of compression should be saved */
+ int (*accept_hook) (struct inode *inode, cloff_t index);
+ /* this is called when results of compression should be discarded */
+ int (*discard_hook) (struct inode *inode, cloff_t index);
+} compression_mode_plugin;
+
+typedef struct cluster_plugin {
+ /* generic fields */
+ plugin_header h;
+ int shift;
+} cluster_plugin;
+
+typedef struct sd_ext_plugin {
+ /* generic fields */
+ plugin_header h;
+ int (*present) (struct inode *inode, char **area, int *len);
+ int (*absent) (struct inode *inode);
+ int (*save_len) (struct inode *inode);
+ int (*save) (struct inode *inode, char **area);
+ /* alignment requirement for this stat-data part */
+ int alignment;
+} sd_ext_plugin;
+
+/* this plugin contains methods to allocate objectid for newly created files,
+ to deallocate objectid when file gets removed, to report number of used and
+ free objectids */
+typedef struct oid_allocator_plugin {
+ /* generic fields */
+ plugin_header h;
+ int (*init_oid_allocator) (reiser4_oid_allocator * map, __u64 nr_files,
+ __u64 oids);
+ /* used to report statfs->f_files */
+ __u64(*oids_used) (reiser4_oid_allocator * map);
+ /* get next oid to use */
+ __u64(*next_oid) (reiser4_oid_allocator * map);
+ /* used to report statfs->f_ffree */
+ __u64(*oids_free) (reiser4_oid_allocator * map);
+ /* allocate new objectid */
+ int (*allocate_oid) (reiser4_oid_allocator * map, oid_t *);
+ /* release objectid */
+ int (*release_oid) (reiser4_oid_allocator * map, oid_t);
+ /* how many pages to reserve in transaction for allocation of new
+ objectid */
+ int (*oid_reserve_allocate) (reiser4_oid_allocator * map);
+ /* how many pages to reserve in transaction for freeing of an
+ objectid */
+ int (*oid_reserve_release) (reiser4_oid_allocator * map);
+ void (*print_info) (const char *, reiser4_oid_allocator *);
+} oid_allocator_plugin;
+
+/* disk layout plugin: this specifies super block, journal, bitmap (if there
+ are any) locations, etc */
+typedef struct disk_format_plugin {
+ /* generic fields */
+ plugin_header h;
+ /* Read format super-block from disk, find internal subvolume ID
+ and store it in @subv_id */
+ int (*extract_subvol_id)(struct block_device *bdev, u64 *subv_id);
+ /* replay journal, initialize super_info_data, etc */
+ int (*init_format) (struct super_block *, reiser4_subvol *);
+ /* key of root directory stat data */
+ const reiser4_key * (*root_dir_key) (const struct super_block *);
+ int (*release_format) (struct super_block *, reiser4_subvol *);
+ jnode * (*log_super) (struct super_block *, reiser4_subvol *);
+ int (*check_open) (const struct inode *object);
+ /*
+	 * Decide whether the minor disk format version number in the format
+	 * super-block should be upgraded.
+	 * If not, return 0. If yes, put the format super-block into
+	 * a transaction and return 1. In this case the caller should make
+	 * additional efforts to commit that transaction (if needed).
+	 * On errors return <0.
+ */
+ int (*version_update) (struct super_block *, reiser4_subvol *);
+} disk_format_plugin;
+
+struct jnode_plugin {
+ /* generic fields */
+ plugin_header h;
+ int (*init) (jnode *node);
+ /* verify validness of node's content */
+ int (*parse) (jnode *node);
+ struct address_space *(*mapping) (const jnode *node);
+ unsigned long (*index) (const jnode *node);
+};
+
+/* plugin instance. */
+/* */
+/* This is "wrapper" union for all types of plugins. Most of the code uses */
+/* plugins of particular type (file_plugin, dir_plugin, etc.) rather than */
+/* operates with pointers to reiser4_plugin. This union is only used in */
+/* some generic code in plugin/plugin.c that operates on all */
+/* plugins. Technically speaking purpose of this union is to add type */
+/* safety to said generic code: each plugin type (file_plugin, for */
+/* example), contains plugin_header as its first member. This first member  */
+/* is located at the same place in memory as .h member of */
+/* reiser4_plugin. Generic code, obtains pointer to reiser4_plugin and */
+/* looks in the .h which is header of plugin type located in union. This */
+/* allows type-casts to be avoided. */
+union reiser4_plugin {
+ /* generic fields */
+ plugin_header h;
+ /* file plugin */
+ file_plugin file;
+ /* directory plugin */
+ dir_plugin dir;
+ /* hash plugin, used by directory plugin */
+ hash_plugin hash;
+ /* fibration plugin used by directory plugin */
+ fibration_plugin fibration;
+ /* cipher transform plugin, used by file plugin */
+ cipher_plugin cipher;
+ /* digest transform plugin, used by file plugin */
+ digest_plugin digest;
+ /* compression transform plugin, used by file plugin */
+ compression_plugin compression;
+ /* tail plugin, used by file plugin */
+ formatting_plugin formatting;
+ /* permission plugin */
+ perm_plugin perm;
+ /* node plugin */
+ node_plugin node;
+ /* item plugin */
+ item_plugin item;
+ /* stat-data extension plugin */
+ sd_ext_plugin sd_ext;
+ /* disk layout plugin */
+ disk_format_plugin format;
+ /* object id allocator plugin */
+ oid_allocator_plugin oid_allocator;
+ /* plugin for different jnode types */
+ jnode_plugin jnode;
+ /* compression mode plugin, used by object plugin */
+ compression_mode_plugin compression_mode;
+ /* cluster plugin, used by object plugin */
+ cluster_plugin clust;
+ /* transaction mode plugin */
+ txmod_plugin txmod;
+ /* distribution plugin */
+ distribution_plugin distribution;
+ /* volume plugin */
+ volume_plugin volume;
+ /* place-holder for new plugin types that can be registered
+ dynamically, and used by other dynamically loaded plugins. */
+ void *generic;
+};
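+
+/*
+ * For illustration: since plugin_header is the first member of every plugin
+ * type, a pointer to a concrete plugin can be viewed as a pointer to
+ * reiser4_plugin, roughly:
+ *
+ *	file_plugin *fplug = file_plugin_by_id(UNIX_FILE_PLUGIN_ID);
+ *	reiser4_plugin *plug = (reiser4_plugin *)fplug;
+ *
+ * and plug->h then refers to the same plugin_header as fplug->h.
+ */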
+
+struct reiser4_plugin_ops {
+ /* called when plugin is initialized */
+ int (*init) (reiser4_plugin * plugin);
+ /* called when plugin is unloaded */
+ int (*done) (reiser4_plugin * plugin);
+ /* load given plugin from disk */
+ int (*load) (struct inode *inode,
+ reiser4_plugin * plugin, char **area, int *len);
+	/* how much space is required to store this plugin's state
+ in stat-data */
+ int (*save_len) (struct inode *inode, reiser4_plugin * plugin);
+ /* save persistent plugin-data to disk */
+ int (*save) (struct inode *inode, reiser4_plugin * plugin,
+ char **area);
+ /* alignment requirement for on-disk state of this plugin
+ in number of bytes */
+ int alignment;
+ /* install itself into given inode. This can return error
+ (e.g., you cannot change hash of non-empty directory). */
+ int (*change) (struct inode *inode, reiser4_plugin * plugin,
+ pset_member memb);
+	/* install itself into given inode as inherited from @parent.
+	   This can return error. */
+ int (*inherit) (struct inode *inode, struct inode *parent,
+ reiser4_plugin * plugin);
+};
+
+/* functions implemented in fs/reiser4/plugin/plugin.c */
+
+/* stores plugin reference in reiser4-specific part of inode */
+extern int set_object_plugin(struct inode *inode, reiser4_plugin_id id);
+extern int init_plugins(void);
+
+/* builtin plugins */
+
+/* builtin hash-plugins */
+
+typedef enum {
+ RUPASOV_HASH_ID,
+ R5_HASH_ID,
+ TEA_HASH_ID,
+ FNV1_HASH_ID,
+ DEGENERATE_HASH_ID,
+ LAST_HASH_ID
+} reiser4_hash_id;
+
+/* builtin cipher plugins */
+
+typedef enum {
+ NONE_CIPHER_ID,
+ LAST_CIPHER_ID
+} reiser4_cipher_id;
+
+/* builtin digest plugins */
+
+typedef enum {
+ SHA256_32_DIGEST_ID,
+ LAST_DIGEST_ID
+} reiser4_digest_id;
+
+/* builtin compression mode plugins */
+typedef enum {
+ NONE_COMPRESSION_MODE_ID,
+ LATTD_COMPRESSION_MODE_ID,
+ ULTIM_COMPRESSION_MODE_ID,
+ FORCE_COMPRESSION_MODE_ID,
+ CONVX_COMPRESSION_MODE_ID,
+ LAST_COMPRESSION_MODE_ID
+} reiser4_compression_mode_id;
+
+/* builtin cluster plugins */
+typedef enum {
+ CLUSTER_64K_ID,
+ CLUSTER_32K_ID,
+ CLUSTER_16K_ID,
+ CLUSTER_8K_ID,
+ CLUSTER_4K_ID,
+ LAST_CLUSTER_ID
+} reiser4_cluster_id;
+
+/* builtin tail packing policies */
+typedef enum {
+ NEVER_TAILS_FORMATTING_ID,
+ ALWAYS_TAILS_FORMATTING_ID,
+ SMALL_FILE_FORMATTING_ID,
+ LAST_TAIL_FORMATTING_ID
+} reiser4_formatting_id;
+
+/* builtin transaction models */
+typedef enum {
+ HYBRID_TXMOD_ID,
+ JOURNAL_TXMOD_ID,
+ WA_TXMOD_ID,
+ LAST_TXMOD_ID
+} reiser4_txmod_id;
+
+/* builtin distribution plugins */
+typedef enum {
+ TRIV_DISTRIB_ID, /* for simple volumes */
+ FSX32M_DISTRIB_ID, /* builtin distribution of Eduard Shishkin */
+ LAST_DISTRIB_ID
+} reiser4_distribution_id;
+
+/* builtin volume plugins */
+typedef enum {
+ SIMPLE_VOLUME_ID, /* for volumes 4.X.Y and simple volumes 5.X.Y */
+ ASYM_VOLUME_ID, /* for logical volumes 5.X.Y */
+ LAST_VOLUME_ID
+} reiser4_volume_id;
+
+/* data type used to pack parameters that we pass to vfs object creation
+ function create_object() */
+struct reiser4_object_create_data {
+ /* plugin to control created object */
+ reiser4_file_id id;
+ /* mode of regular file, directory or special file */
+/* what happens if some other sort of perm plugin is in use? */
+ umode_t mode;
+ /* rdev of special file */
+ dev_t rdev;
+ /* symlink target */
+ const char *name;
+ /* add here something for non-standard objects you invent, like
+ query for interpolation file etc. */
+
+ struct reiser4_crypto_info *crypto;
+
+ struct inode *parent;
+ struct dentry *dentry;
+};
+
+/* description of directory entry being created/destroyed/sought for
+
+ It is passed down to the directory plugin and farther to the
+   directory item plugin methods. Creation of a new directory entry is done in
+   several stages: first we search for an entry with the same name, then
+   create a new one. reiser4_dir_entry_desc is used to store some information
+ collected at some stage of this process and required later: key of
+ item that we want to insert/delete and pointer to an object that will
+ be bound by the new directory entry. Probably some more fields will
+ be added there.
+
+*/
+struct reiser4_dir_entry_desc {
+ /* key of directory entry */
+ reiser4_key key;
+ /* object bound by this entry. */
+ struct inode *obj;
+};
+
+#define MAX_PLUGIN_TYPE_LABEL_LEN 32
+#define MAX_PLUGIN_PLUG_LABEL_LEN 32
+
+#define PLUGIN_BY_ID(TYPE, ID, FIELD) \
+static inline TYPE *TYPE ## _by_id(reiser4_plugin_id id) \
+{ \
+ reiser4_plugin *plugin = plugin_by_id(ID, id); \
+ return plugin ? &plugin->FIELD : NULL; \
+} \
+static inline TYPE *TYPE ## _by_disk_id(d16 *id) \
+{ \
+ reiser4_plugin *plugin = plugin_by_disk_id(ID, id); \
+ return plugin ? &plugin->FIELD : NULL; \
+} \
+static inline TYPE *TYPE ## _by_unsafe_id(reiser4_plugin_id id) \
+{ \
+ reiser4_plugin *plugin = plugin_by_unsafe_id(ID, id); \
+ return plugin ? &plugin->FIELD : NULL; \
+} \
+static inline reiser4_plugin* TYPE ## _to_plugin(TYPE* plugin) \
+{ \
+ return (reiser4_plugin *) plugin; \
+} \
+static inline reiser4_plugin_id TYPE ## _id(TYPE* plugin) \
+{ \
+ return TYPE ## _to_plugin(plugin)->h.id; \
+} \
+typedef struct { int foo; } TYPE ## _plugin_dummy
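+
+/*
+ * For illustration, PLUGIN_BY_ID(hash_plugin, REISER4_HASH_PLUGIN_TYPE, hash)
+ * below generates, among other helpers, roughly:
+ *
+ *	static inline hash_plugin *hash_plugin_by_id(reiser4_plugin_id id)
+ *	{
+ *		reiser4_plugin *plugin = plugin_by_id(REISER4_HASH_PLUGIN_TYPE, id);
+ *		return plugin ? &plugin->hash : NULL;
+ *	}
+ *
+ * so callers can look up a typed plugin by its id without explicit casts.
+ */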
+
+
+static inline unsigned get_release_number_principal(void)
+{
+ return REISER4_VERSION_PRINCIPAL;
+}
+
+/**
+ * Guess principal format number by major release number
+ */
+static inline int get_format_number_principal(int major)
+{
+ switch (major) {
+ case 0:
+ case 1:
+ return 4 + major;
+ default:
+ impossible("edward-2429", "Unsupported major release number");
+ }
+ return 0;
+}
+
+static inline int get_release_number_major(void)
+{
+ return LAST_FORMAT_ID - 1;
+}
+
+static inline int get_release_number_minor(void)
+{
+ return PLUGIN_LIBRARY_VERSION;
+}
+
+PLUGIN_BY_ID(item_plugin, REISER4_ITEM_PLUGIN_TYPE, item);
+PLUGIN_BY_ID(file_plugin, REISER4_FILE_PLUGIN_TYPE, file);
+PLUGIN_BY_ID(dir_plugin, REISER4_DIR_PLUGIN_TYPE, dir);
+PLUGIN_BY_ID(node_plugin, REISER4_NODE_PLUGIN_TYPE, node);
+PLUGIN_BY_ID(sd_ext_plugin, REISER4_SD_EXT_PLUGIN_TYPE, sd_ext);
+PLUGIN_BY_ID(perm_plugin, REISER4_PERM_PLUGIN_TYPE, perm);
+PLUGIN_BY_ID(hash_plugin, REISER4_HASH_PLUGIN_TYPE, hash);
+PLUGIN_BY_ID(fibration_plugin, REISER4_FIBRATION_PLUGIN_TYPE, fibration);
+PLUGIN_BY_ID(cipher_plugin, REISER4_CIPHER_PLUGIN_TYPE, cipher);
+PLUGIN_BY_ID(digest_plugin, REISER4_DIGEST_PLUGIN_TYPE, digest);
+PLUGIN_BY_ID(compression_plugin, REISER4_COMPRESSION_PLUGIN_TYPE, compression);
+PLUGIN_BY_ID(formatting_plugin, REISER4_FORMATTING_PLUGIN_TYPE, formatting);
+PLUGIN_BY_ID(disk_format_plugin, REISER4_FORMAT_PLUGIN_TYPE, format);
+PLUGIN_BY_ID(jnode_plugin, REISER4_JNODE_PLUGIN_TYPE, jnode);
+PLUGIN_BY_ID(compression_mode_plugin, REISER4_COMPRESSION_MODE_PLUGIN_TYPE,
+ compression_mode);
+PLUGIN_BY_ID(cluster_plugin, REISER4_CLUSTER_PLUGIN_TYPE, clust);
+PLUGIN_BY_ID(txmod_plugin, REISER4_TXMOD_PLUGIN_TYPE, txmod);
+PLUGIN_BY_ID(distribution_plugin, REISER4_DISTRIBUTION_PLUGIN_TYPE,
+ distribution);
+PLUGIN_BY_ID(volume_plugin, REISER4_VOLUME_PLUGIN_TYPE, volume);
+
+extern int save_plugin_id(reiser4_plugin * plugin, d16 * area);
+
+extern struct list_head *get_plugin_list(reiser4_plugin_type type_id);
+
+#define for_all_plugins(ptype, plugin) \
+for (plugin = list_entry(get_plugin_list(ptype)->next, reiser4_plugin, h.linkage); \
+ get_plugin_list(ptype) != &plugin->h.linkage; \
+ plugin = list_entry(plugin->h.linkage.next, reiser4_plugin, h.linkage))
+
+
+extern int grab_plugin_pset(struct inode *self, struct inode *ancestor,
+ pset_member memb);
+extern int force_plugin_pset(struct inode *self, pset_member memb,
+ reiser4_plugin *plug);
+extern int finish_pset(struct inode *inode);
+
+/* defined in fs/reiser4/plugin/object.c */
+extern file_plugin file_plugins[LAST_FILE_PLUGIN_ID];
+/* defined in fs/reiser4/plugin/object.c */
+extern dir_plugin dir_plugins[LAST_DIR_ID];
+/* defined in fs/reiser4/plugin/item/static_stat.c */
+extern sd_ext_plugin sd_ext_plugins[LAST_SD_EXTENSION];
+/* defined in fs/reiser4/plugin/hash.c */
+extern hash_plugin hash_plugins[LAST_HASH_ID];
+/* defined in fs/reiser4/plugin/fibration.c */
+extern fibration_plugin fibration_plugins[LAST_FIBRATION_ID];
+/* defined in fs/reiser4/plugin/txmod.c */
+extern txmod_plugin txmod_plugins[LAST_TXMOD_ID];
+/* defined in fs/reiser4/plugin/distribution.c */
+extern distribution_plugin distribution_plugins[LAST_DISTRIB_ID];
+/* defined in fs/reiser4/plugin/volume.c */
+extern volume_plugin volume_plugins[LAST_VOLUME_ID];
+/* defined in fs/reiser4/plugin/crypt.c */
+extern cipher_plugin cipher_plugins[LAST_CIPHER_ID];
+/* defined in fs/reiser4/plugin/digest.c */
+extern digest_plugin digest_plugins[LAST_DIGEST_ID];
+/* defined in fs/reiser4/plugin/compress/compress.c */
+extern compression_plugin compression_plugins[LAST_COMPRESSION_ID];
+/* defined in fs/reiser4/plugin/compress/compression_mode.c */
+extern compression_mode_plugin
+compression_mode_plugins[LAST_COMPRESSION_MODE_ID];
+/* defined in fs/reiser4/plugin/cluster.c */
+extern cluster_plugin cluster_plugins[LAST_CLUSTER_ID];
+/* defined in fs/reiser4/plugin/tail.c */
+extern formatting_plugin formatting_plugins[LAST_TAIL_FORMATTING_ID];
+/* defined in fs/reiser4/plugin/security/security.c */
+extern perm_plugin perm_plugins[LAST_PERM_ID];
+/* defined in fs/reiser4/plugin/item/item.c */
+extern item_plugin item_plugins[LAST_ITEM_ID];
+/* defined in fs/reiser4/plugin/node/node.c */
+extern node_plugin node_plugins[LAST_NODE_ID];
+/* defined in fs/reiser4/plugin/disk_format/disk_format.c */
+extern disk_format_plugin format_plugins[LAST_FORMAT_ID];
+
+/* __FS_REISER4_PLUGIN_TYPES_H__ */
+#endif
+
+/* Make Linus happy.
+ Local variables:
+ c-indentation-style: "K&R"
+ mode-name: "LC"
+ c-basic-offset: 8
+ tab-width: 8
+ fill-column: 120
+ End:
+*/
diff -urN --no-dereference linux-5.10.2.orig/fs/reiser4/plugin/plugin_header.h linux-5.10.2/fs/reiser4/plugin/plugin_header.h
--- linux-5.10.2.orig/fs/reiser4/plugin/plugin_header.h 1970-01-01 01:00:00.000000000 +0100
+++ linux-5.10.2/fs/reiser4/plugin/plugin_header.h 2020-12-23 16:07:46.131813319 +0100
@@ -0,0 +1,151 @@
+/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
+
+/* plugin header. Data structures required by all plugin types. */
+
+#if !defined(__PLUGIN_HEADER_H__)
+#define __PLUGIN_HEADER_H__
+
+/* plugin data-types and constants */
+
+#include "../debug.h"
+#include "../dformat.h"
+
+/* The list of Reiser4 interfaces */
+typedef enum {
+ REISER4_FILE_PLUGIN_TYPE, /* manage VFS objects */
+ REISER4_DIR_PLUGIN_TYPE, /* manage directories */
+ REISER4_ITEM_PLUGIN_TYPE, /* manage items */
+ REISER4_NODE_PLUGIN_TYPE, /* manage formatted nodes */
+ REISER4_HASH_PLUGIN_TYPE, /* hash methods */
+ REISER4_FIBRATION_PLUGIN_TYPE, /* directory fibrations */
+ REISER4_FORMATTING_PLUGIN_TYPE, /* dispatching policy */
+ REISER4_PERM_PLUGIN_TYPE, /* stub (vacancy) */
+ REISER4_SD_EXT_PLUGIN_TYPE, /* manage stat-data extensions */
+ REISER4_FORMAT_PLUGIN_TYPE, /* disk format specifications */
+ REISER4_JNODE_PLUGIN_TYPE, /* manage in-memory headers */
+ REISER4_CIPHER_PLUGIN_TYPE, /* cipher transform methods */
+ REISER4_DIGEST_PLUGIN_TYPE, /* digest transform methods */
+ REISER4_COMPRESSION_PLUGIN_TYPE, /* compression methods */
+ REISER4_COMPRESSION_MODE_PLUGIN_TYPE, /* dispatching policies */
+ REISER4_CLUSTER_PLUGIN_TYPE, /* manage logical clusters */
+ REISER4_TXMOD_PLUGIN_TYPE, /* transaction models */
+ REISER4_DISTRIBUTION_PLUGIN_TYPE, /* distribution algorithm */
+ REISER4_VOLUME_PLUGIN_TYPE, /* volume types */
+ REISER4_PLUGIN_TYPES
+} reiser4_plugin_type;
+
+/* Supported plugin groups */
+typedef enum {
+ REISER4_DIRECTORY_FILE,
+ REISER4_REGULAR_FILE,
+ REISER4_SYMLINK_FILE,
+ REISER4_SPECIAL_FILE,
+} file_plugin_group;
+
+struct reiser4_plugin_ops;
+/* generic plugin operations, supported by each
+ plugin type. */
+typedef struct reiser4_plugin_ops reiser4_plugin_ops;
+
+/* the common part of all plugin instances. */
+typedef struct plugin_header {
+ /* plugin type */
+ reiser4_plugin_type type_id;
+ /* id of this plugin */
+ reiser4_plugin_id id;
+ /* bitmask of groups the plugin belongs to. */
+ reiser4_plugin_groups groups;
+ /* plugin operations */
+ reiser4_plugin_ops *pops;
+/* NIKITA-FIXME-HANS: usage of and access to label and desc is not commented and
+ * defined. */
+ /* short label of this plugin */
+ const char *label;
+ /* descriptive string.. */
+ const char *desc;
+ /* list linkage */
+ struct list_head linkage;
+} plugin_header;
+
+#define plugin_of_group(plug, group) (plug->h.groups & (1 << group))
+
+/* PRIVATE INTERFACES */
+/* NIKITA-FIXME-HANS: what is this for and why does it duplicate what is in
+ * plugin_header? */
+/* plugin type representation. */
+struct reiser4_plugin_type_data {
+ /* internal plugin type identifier. Should coincide with
+ index of this item in plugins[] array. */
+ reiser4_plugin_type type_id;
+ /* short symbolic label of this plugin type. Should be no longer
+ than MAX_PLUGIN_TYPE_LABEL_LEN characters including '\0'. */
+ const char *label;
+ /* plugin type description longer than .label */
+ const char *desc;
+
+/* NIKITA-FIXME-HANS: define built-in */
+ /* number of built-in plugin instances of this type */
+ int builtin_num;
+ /* array of built-in plugins */
+ void *builtin;
+ struct list_head plugins_list;
+ size_t size;
+};
+
+extern struct reiser4_plugin_type_data plugins[REISER4_PLUGIN_TYPES];
+
+int is_plugin_type_valid(reiser4_plugin_type type);
+int is_plugin_id_valid(reiser4_plugin_type type, reiser4_plugin_id id);
+
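+/* built-in plugin instances of each type are stored in a contiguous array
+   (plugins[type].builtin) with element size plugins[type].size; this helper
+   returns the i-th instance of the given type */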
+static inline reiser4_plugin *plugin_at(struct reiser4_plugin_type_data *ptype,
+ int i)
+{
+ char *builtin;
+
+ builtin = ptype->builtin;
+ return (reiser4_plugin *) (builtin + i * ptype->size);
+}
+
+/* return plugin by its @type_id and @id */
+static inline reiser4_plugin *plugin_by_id(reiser4_plugin_type type,
+ reiser4_plugin_id id)
+{
+ assert("nikita-1651", is_plugin_type_valid(type));
+ assert("nikita-1652", is_plugin_id_valid(type, id));
+ return plugin_at(&plugins[type], id);
+}
+
+extern reiser4_plugin *plugin_by_unsafe_id(reiser4_plugin_type type_id,
+ reiser4_plugin_id id);
+
+/**
+ * plugin_by_disk_id - get reiser4_plugin
+ * @type_id: plugin type id
+ * @plugin_id: plugin id in disk format
+ *
+ * Returns reiser4_plugin by plugin type id and plugin id.
+ */
+static inline reiser4_plugin *plugin_by_disk_id(reiser4_plugin_type type_id,
+ __le16 *plugin_id)
+{
+ /*
+ * what we should do properly is to maintain within each file-system a
+ * dictionary that maps on-disk plugin ids to "universal" ids. This
+ * dictionary will be resolved on mount time, so that this function
+ * will perform just one additional array lookup.
+ */
+ return plugin_by_unsafe_id(type_id, le16_to_cpu(*plugin_id));
+}
+
+/* __PLUGIN_HEADER_H__ */
+#endif
+
+/*
+ * Local variables:
+ * c-indentation-style: "K&R"
+ * mode-name: "LC"
+ * c-basic-offset: 8
+ * tab-width: 8
+ * fill-column: 79
+ * End:
+ */
diff -urN --no-dereference linux-5.10.2.orig/fs/reiser4/plugin/plugin_set.c linux-5.10.2/fs/reiser4/plugin/plugin_set.c
--- linux-5.10.2.orig/fs/reiser4/plugin/plugin_set.c 1970-01-01 01:00:00.000000000 +0100
+++ linux-5.10.2/fs/reiser4/plugin/plugin_set.c 2020-12-23 16:07:46.131813319 +0100
@@ -0,0 +1,387 @@
+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
+ * reiser4/README */
+/* This file contains Reiser4 plugin set operations */
+
+/* plugin sets
+ *
+ * Each file in reiser4 is controlled by a whole set of plugins (file plugin,
+ * directory plugin, hash plugin, tail policy plugin, security plugin, etc.)
+ * assigned (inherited, deduced from mode bits, etc.) at creation time. This
+ * set of plugins (so called pset) is described by structure plugin_set (see
+ * plugin/plugin_set.h), which contains pointers to all required plugins.
+ *
+ * Children can inherit some pset members from their parent, however sometimes
+ * it is useful to specify members different from parent ones. Since object's
+ * pset can not be easily changed without fatal consequences, we use for this
+ * purpose another special plugin table (so called hset, or heir set) described
+ * by the same structure.
+ *
+ * An inode only stores pointers to its pset and hset. Different inodes with
+ * the same set of pset (hset) members point to the same pset (hset). This is
+ * achieved by storing psets and hsets in a global hash table. Races are
+ * avoided by the simple (and so far efficient) solution of never recycling
+ * psets, even when the last inode pointing to one is destroyed.
+ */
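+/*
+ * For illustration: if two inodes use exactly the same plugins (same file
+ * plugin, same hash plugin, and so on), their pset pointers are equal, so
+ * comparing psets reduces to a pointer comparison and memory consumption is
+ * bounded by the number of distinct plugin combinations actually in use.
+ */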
+
+#include "../debug.h"
+#include "../super.h"
+#include "plugin_set.h"
+
+#include <linux/slab.h>
+#include <linux/stddef.h>
+
+/* slab for plugin sets */
+static struct kmem_cache *plugin_set_slab;
+
+static spinlock_t plugin_set_lock[8] __cacheline_aligned_in_smp = {
+ __SPIN_LOCK_UNLOCKED(plugin_set_lock[0]),
+ __SPIN_LOCK_UNLOCKED(plugin_set_lock[1]),
+ __SPIN_LOCK_UNLOCKED(plugin_set_lock[2]),
+ __SPIN_LOCK_UNLOCKED(plugin_set_lock[3]),
+ __SPIN_LOCK_UNLOCKED(plugin_set_lock[4]),
+ __SPIN_LOCK_UNLOCKED(plugin_set_lock[5]),
+ __SPIN_LOCK_UNLOCKED(plugin_set_lock[6]),
+ __SPIN_LOCK_UNLOCKED(plugin_set_lock[7])
+};
+
+/* hash table support */
+
+#define PS_TABLE_SIZE (32)
+
+static inline plugin_set *cast_to(const unsigned long *a)
+{
+ return container_of(a, plugin_set, hashval);
+}
+
+static inline int pseq(const unsigned long *a1, const unsigned long *a2)
+{
+ plugin_set *set1;
+ plugin_set *set2;
+
+ /* make sure fields are not missed in the code below */
+ static_assert(sizeof *set1 ==
+ sizeof set1->hashval +
+ sizeof set1->link +
+ sizeof set1->file +
+ sizeof set1->dir +
+ sizeof set1->perm +
+ sizeof set1->formatting +
+ sizeof set1->hash +
+ sizeof set1->fibration +
+ sizeof set1->sd +
+ sizeof set1->dir_item +
+ sizeof set1->cipher +
+ sizeof set1->digest +
+ sizeof set1->compression +
+ sizeof set1->compression_mode +
+ sizeof set1->cluster +
+ sizeof set1->create);
+
+ set1 = cast_to(a1);
+ set2 = cast_to(a2);
+ return
+ set1->hashval == set2->hashval &&
+ set1->file == set2->file &&
+ set1->dir == set2->dir &&
+ set1->perm == set2->perm &&
+ set1->formatting == set2->formatting &&
+ set1->hash == set2->hash &&
+ set1->fibration == set2->fibration &&
+ set1->sd == set2->sd &&
+ set1->dir_item == set2->dir_item &&
+ set1->cipher == set2->cipher &&
+ set1->digest == set2->digest &&
+ set1->compression == set2->compression &&
+ set1->compression_mode == set2->compression_mode &&
+ set1->cluster == set2->cluster &&
+ set1->create == set2->create;
+}
+
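+/* a pset is hashed by summing its plugin pointer values (shifted right to
+   drop alignment zeroes) and folding the result into PS_TABLE_SIZE buckets */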
+#define HASH_FIELD(hash, set, field) \
+({ \
+ (hash) += (unsigned long)(set)->field >> 2; \
+})
+
+static inline unsigned long calculate_hash(const plugin_set * set)
+{
+ unsigned long result;
+
+ result = 0;
+ HASH_FIELD(result, set, file);
+ HASH_FIELD(result, set, dir);
+ HASH_FIELD(result, set, perm);
+ HASH_FIELD(result, set, formatting);
+ HASH_FIELD(result, set, hash);
+ HASH_FIELD(result, set, fibration);
+ HASH_FIELD(result, set, sd);
+ HASH_FIELD(result, set, dir_item);
+ HASH_FIELD(result, set, cipher);
+ HASH_FIELD(result, set, digest);
+ HASH_FIELD(result, set, compression);
+ HASH_FIELD(result, set, compression_mode);
+ HASH_FIELD(result, set, cluster);
+ HASH_FIELD(result, set, create);
+ return result & (PS_TABLE_SIZE - 1);
+}
+
+static inline unsigned long
+pshash(ps_hash_table * table, const unsigned long *a)
+{
+ return *a;
+}
+
+/* The hash table definition */
+#define KMALLOC(size) kmalloc((size), reiser4_ctx_gfp_mask_get())
+#define KFREE(ptr, size) kfree(ptr)
+TYPE_SAFE_HASH_DEFINE(ps, plugin_set, unsigned long, hashval, link, pshash,
+ pseq);
+#undef KFREE
+#undef KMALLOC
+
+static ps_hash_table ps_table;
+static plugin_set empty_set = {
+ .hashval = 0,
+ .file = NULL,
+ .dir = NULL,
+ .perm = NULL,
+ .formatting = NULL,
+ .hash = NULL,
+ .fibration = NULL,
+ .sd = NULL,
+ .dir_item = NULL,
+ .cipher = NULL,
+ .digest = NULL,
+ .compression = NULL,
+ .compression_mode = NULL,
+ .cluster = NULL,
+ .create = NULL,
+ .link = {NULL}
+};
+
+plugin_set *plugin_set_get_empty(void)
+{
+ return &empty_set;
+}
+
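+/* psets are never recycled (see the comment at the top of this file), so
+   there is nothing to release here */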
+void plugin_set_put(plugin_set * set)
+{
+}
+
+static inline unsigned long *pset_field(plugin_set * set, int offset)
+{
+ return (unsigned long *)(((char *)set) + offset);
+}
+
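+/*
+ * Replace one member (located at byte @offset) of the pset pointed to by
+ * @set with @val: build a modified replica, look it up in the global hash
+ * table under rcu, and either reuse an existing identical pset or insert a
+ * freshly allocated copy under the per-bucket spin lock, re-checking for a
+ * twin to handle races. The original pset is left untouched.
+ */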
+static int plugin_set_field(plugin_set ** set, const unsigned long val,
+ const int offset)
+{
+ unsigned long *spot;
+ spinlock_t *lock;
+ plugin_set replica;
+ plugin_set *twin;
+ plugin_set *psal;
+ plugin_set *orig;
+
+ assert("nikita-2902", set != NULL);
+ assert("nikita-2904", *set != NULL);
+
+ spot = pset_field(*set, offset);
+ if (unlikely(*spot == val))
+ return 0;
+
+ replica = *(orig = *set);
+ *pset_field(&replica, offset) = val;
+ replica.hashval = calculate_hash(&replica);
+ rcu_read_lock();
+ twin = ps_hash_find(&ps_table, &replica.hashval);
+ if (unlikely(twin == NULL)) {
+ rcu_read_unlock();
+ psal = kmem_cache_alloc(plugin_set_slab,
+ reiser4_ctx_gfp_mask_get());
+ if (psal == NULL)
+ return RETERR(-ENOMEM);
+ *psal = replica;
+ lock = &plugin_set_lock[replica.hashval & 7];
+ spin_lock(lock);
+ twin = ps_hash_find(&ps_table, &replica.hashval);
+ if (likely(twin == NULL)) {
+ *set = psal;
+ ps_hash_insert_rcu(&ps_table, psal);
+ } else {
+ *set = twin;
+ kmem_cache_free(plugin_set_slab, psal);
+ }
+ spin_unlock(lock);
+ } else {
+ rcu_read_unlock();
+ *set = twin;
+ }
+ return 0;
+}
+
+static struct {
+ int offset;
+ reiser4_plugin_groups groups;
+ reiser4_plugin_type type;
+} pset_descr[PSET_LAST] = {
+ [PSET_FILE] = {
+ .offset = offsetof(plugin_set, file),
+ .type = REISER4_FILE_PLUGIN_TYPE,
+ .groups = 0
+ },
+ [PSET_DIR] = {
+ .offset = offsetof(plugin_set, dir),
+ .type = REISER4_DIR_PLUGIN_TYPE,
+ .groups = 0
+ },
+ [PSET_PERM] = {
+ .offset = offsetof(plugin_set, perm),
+ .type = REISER4_PERM_PLUGIN_TYPE,
+ .groups = 0
+ },
+ [PSET_FORMATTING] = {
+ .offset = offsetof(plugin_set, formatting),
+ .type = REISER4_FORMATTING_PLUGIN_TYPE,
+ .groups = 0
+ },
+ [PSET_HASH] = {
+ .offset = offsetof(plugin_set, hash),
+ .type = REISER4_HASH_PLUGIN_TYPE,
+ .groups = 0
+ },
+ [PSET_FIBRATION] = {
+ .offset = offsetof(plugin_set, fibration),
+ .type = REISER4_FIBRATION_PLUGIN_TYPE,
+ .groups = 0
+ },
+ [PSET_SD] = {
+ .offset = offsetof(plugin_set, sd),
+ .type = REISER4_ITEM_PLUGIN_TYPE,
+ .groups = (1 << STAT_DATA_ITEM_TYPE)
+ },
+ [PSET_DIR_ITEM] = {
+ .offset = offsetof(plugin_set, dir_item),
+ .type = REISER4_ITEM_PLUGIN_TYPE,
+ .groups = (1 << DIR_ENTRY_ITEM_TYPE)
+ },
+ [PSET_CIPHER] = {
+ .offset = offsetof(plugin_set, cipher),
+ .type = REISER4_CIPHER_PLUGIN_TYPE,
+ .groups = 0
+ },
+ [PSET_DIGEST] = {
+ .offset = offsetof(plugin_set, digest),
+ .type = REISER4_DIGEST_PLUGIN_TYPE,
+ .groups = 0
+ },
+ [PSET_COMPRESSION] = {
+ .offset = offsetof(plugin_set, compression),
+ .type = REISER4_COMPRESSION_PLUGIN_TYPE,
+ .groups = 0
+ },
+ [PSET_COMPRESSION_MODE] = {
+ .offset = offsetof(plugin_set, compression_mode),
+ .type = REISER4_COMPRESSION_MODE_PLUGIN_TYPE,
+ .groups = 0
+ },
+ [PSET_CLUSTER] = {
+ .offset = offsetof(plugin_set, cluster),
+ .type = REISER4_CLUSTER_PLUGIN_TYPE,
+ .groups = 0
+ },
+ [PSET_CREATE] = {
+ .offset = offsetof(plugin_set, create),
+ .type = REISER4_FILE_PLUGIN_TYPE,
+ .groups = (1 << REISER4_REGULAR_FILE)
+ }
+};
+
+#define DEFINE_PSET_OPS(PREFIX) \
+ reiser4_plugin_type PREFIX##_member_to_type_unsafe(pset_member memb) \
+{ \
+ if (memb > PSET_LAST) \
+ return REISER4_PLUGIN_TYPES; \
+ return pset_descr[memb].type; \
+} \
+ \
+int PREFIX##_set_unsafe(plugin_set ** set, pset_member memb, \
+ reiser4_plugin * plugin) \
+{ \
+ assert("nikita-3492", set != NULL); \
+ assert("nikita-3493", *set != NULL); \
+ assert("nikita-3494", plugin != NULL); \
+ assert("nikita-3495", 0 <= memb && memb < PSET_LAST); \
+ assert("nikita-3496", plugin->h.type_id == pset_descr[memb].type); \
+ \
+ if (pset_descr[memb].groups) \
+ if (!(pset_descr[memb].groups & plugin->h.groups)) \
+ return -EINVAL; \
+ \
+ return plugin_set_field(set, \
+ (unsigned long)plugin, pset_descr[memb].offset); \
+} \
+ \
+reiser4_plugin *PREFIX##_get(plugin_set * set, pset_member memb) \
+{ \
+ assert("nikita-3497", set != NULL); \
+ assert("nikita-3498", 0 <= memb && memb < PSET_LAST); \
+ \
+ return *(reiser4_plugin **) (((char *)set) + pset_descr[memb].offset); \
+}
+
+DEFINE_PSET_OPS(aset);
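+/* the macro invocation above instantiates aset_member_to_type_unsafe(),
+   aset_set_unsafe() and aset_get(), which are declared in plugin_set.h */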
+
+int set_plugin(plugin_set ** set, pset_member memb, reiser4_plugin * plugin)
+{
+ return plugin_set_field(set,
+ (unsigned long)plugin, pset_descr[memb].offset);
+}
+
+/**
+ * init_plugin_set - create plugin set cache and hash table
+ *
+ * Initializes slab cache of plugin_set-s and their hash table. It is part of
+ * reiser4 module initialization.
+ */
+int init_plugin_set(void)
+{
+ int result;
+
+ result = ps_hash_init(&ps_table, PS_TABLE_SIZE);
+ if (result == 0) {
+ plugin_set_slab = kmem_cache_create("plugin_set",
+ sizeof(plugin_set), 0,
+ SLAB_HWCACHE_ALIGN,
+ NULL);
+ if (plugin_set_slab == NULL)
+ result = RETERR(-ENOMEM);
+ }
+ return result;
+}
+
+/**
+ * done_plugin_set - delete plugin_set cache and plugin_set hash table
+ *
+ * This is called on reiser4 module unloading or system shutdown.
+ */
+void done_plugin_set(void)
+{
+ plugin_set *cur, *next;
+
+ for_all_in_htable(&ps_table, ps, cur, next) {
+ ps_hash_remove(&ps_table, cur);
+ kmem_cache_free(plugin_set_slab, cur);
+ }
+ destroy_reiser4_cache(&plugin_set_slab);
+ ps_hash_done(&ps_table);
+}
+
+/*
+ * Local variables:
+ * c-indentation-style: "K&R"
+ * mode-name: "LC"
+ * c-basic-offset: 8
+ * tab-width: 8
+ * fill-column: 120
+ * End:
+ */
diff -urN --no-dereference linux-5.10.2.orig/fs/reiser4/plugin/plugin_set.h linux-5.10.2/fs/reiser4/plugin/plugin_set.h
--- linux-5.10.2.orig/fs/reiser4/plugin/plugin_set.h 1970-01-01 01:00:00.000000000 +0100
+++ linux-5.10.2/fs/reiser4/plugin/plugin_set.h 2020-12-23 16:07:46.131813319 +0100
@@ -0,0 +1,78 @@
+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
+ * reiser4/README */
+
+/* Reiser4 plugin set definition.
+ See fs/reiser4/plugin/plugin_set.c for details */
+
+#if !defined(__PLUGIN_SET_H__)
+#define __PLUGIN_SET_H__
+
+#include "../type_safe_hash.h"
+#include "plugin.h"
+
+#include <linux/rcupdate.h>
+
+struct plugin_set;
+typedef struct plugin_set plugin_set;
+
+TYPE_SAFE_HASH_DECLARE(ps, plugin_set);
+
+struct plugin_set {
+ unsigned long hashval;
+ /* plugin of file */
+ file_plugin *file;
+ /* plugin of dir */
+ dir_plugin *dir;
+ /* perm plugin for this file */
+ perm_plugin *perm;
+ /* tail policy plugin. Only meaningful for regular files */
+ formatting_plugin *formatting;
+ /* hash plugin. Only meaningful for directories. */
+ hash_plugin *hash;
+ /* fibration plugin. Only meaningful for directories. */
+ fibration_plugin *fibration;
+ /* plugin of stat-data */
+ item_plugin *sd;
+ /* plugin of items a directory is built of */
+ item_plugin *dir_item;
+ /* cipher plugin */
+ cipher_plugin *cipher;
+ /* digest plugin */
+ digest_plugin *digest;
+ /* compression plugin */
+ compression_plugin *compression;
+ /* compression mode plugin */
+ compression_mode_plugin *compression_mode;
+ /* cluster plugin */
+ cluster_plugin *cluster;
+ /* this specifies file plugin of regular children.
+ only meaningful for directories */
+ file_plugin *create;
+ ps_hash_link link;
+};
+
+extern plugin_set *plugin_set_get_empty(void);
+extern void plugin_set_put(plugin_set * set);
+
+extern int init_plugin_set(void);
+extern void done_plugin_set(void);
+
+extern reiser4_plugin *aset_get(plugin_set * set, pset_member memb);
+extern int set_plugin(plugin_set ** set, pset_member memb,
+ reiser4_plugin * plugin);
+extern int aset_set_unsafe(plugin_set ** set, pset_member memb,
+ reiser4_plugin * plugin);
+extern reiser4_plugin_type aset_member_to_type_unsafe(pset_member memb);
+
+/* __PLUGIN_SET_H__ */
+#endif
+
+/* Make Linus happy.
+ Local variables:
+ c-indentation-style: "K&R"
+ mode-name: "LC"
+ c-basic-offset: 8
+ tab-width: 8
+ fill-column: 120
+ End:
+*/
diff -urN --no-dereference linux-5.10.2.orig/fs/reiser4/plugin/regular.c linux-5.10.2/fs/reiser4/plugin/regular.c
--- linux-5.10.2.orig/fs/reiser4/plugin/regular.c 1970-01-01 01:00:00.000000000 +0100
+++ linux-5.10.2/fs/reiser4/plugin/regular.c 2020-12-23 16:07:46.131813319 +0100
@@ -0,0 +1,55 @@
+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
+
+/* Contains Reiser4 regular plugins which:
+ . specify a set of reiser4 regular object plugins,
+   . are used by the directory plugin to create entries powered by the
+     specified regular plugins */
+
+#include "plugin.h"
+
+regular_plugin regular_plugins[LAST_REGULAR_ID] = {
+ [UF_REGULAR_ID] = {
+ .h = {
+ .type_id = REISER4_REGULAR_PLUGIN_TYPE,
+ .id = UF_REGULAR_ID,
+ .pops = NULL,
+ .label = "unixfile",
+ .desc = "Unix file regular plugin",
+ .linkage = {NULL, NULL}
+ },
+ .id = UNIX_FILE_PLUGIN_ID
+ },
+ [CRC_REGULAR_ID] = {
+ .h = {
+ .type_id = REISER4_REGULAR_PLUGIN_TYPE,
+ .id = CRC_REGULAR_ID,
+ .pops = NULL,
+ .label = "cryptcompress",
+ .desc = "Cryptcompress regular plugin",
+ .linkage = {NULL, NULL}
+ },
+ .id = CRC_FILE_PLUGIN_ID
+ },
+ [SF_REGULAR_ID] = {
+ .h = {
+ .type_id = REISER4_REGULAR_PLUGIN_TYPE,
+ .id = SF_REGULAR_ID,
+ .pops = NULL,
+ .label = "striped-file",
+ .desc = "Striped regular plugin",
+ .linkage = {NULL, NULL}
+ },
+ .id = SF_FILE_PLUGIN_ID
+ }
+};
+
+/*
+ Local variables:
+ c-indentation-style: "K&R"
+ mode-name: "LC"
+ c-basic-offset: 8
+ tab-width: 8
+ fill-column: 120
+ scroll-step: 1
+ End:
+*/
diff -urN --no-dereference linux-5.10.2.orig/fs/reiser4/plugin/security/Makefile linux-5.10.2/fs/reiser4/plugin/security/Makefile
--- linux-5.10.2.orig/fs/reiser4/plugin/security/Makefile 1970-01-01 01:00:00.000000000 +0100
+++ linux-5.10.2/fs/reiser4/plugin/security/Makefile 2020-12-23 16:07:46.131813319 +0100
@@ -0,0 +1,4 @@
+obj-$(CONFIG_REISER4_FS) += security_plugins.o
+
+security_plugins-objs := \
+ perm.o
diff -urN --no-dereference linux-5.10.2.orig/fs/reiser4/plugin/security/perm.c linux-5.10.2/fs/reiser4/plugin/security/perm.c
--- linux-5.10.2.orig/fs/reiser4/plugin/security/perm.c 1970-01-01 01:00:00.000000000 +0100
+++ linux-5.10.2/fs/reiser4/plugin/security/perm.c 2020-12-23 16:07:46.131813319 +0100
@@ -0,0 +1,33 @@
+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
+
+/*
+ * This file contains implementation of permission plugins.
+ * See the comments in perm.h
+ */
+
+#include "../plugin.h"
+#include "../plugin_header.h"
+#include "../../debug.h"
+
+perm_plugin perm_plugins[LAST_PERM_ID] = {
+ [NULL_PERM_ID] = {
+ .h = {
+ .type_id = REISER4_PERM_PLUGIN_TYPE,
+ .id = NULL_PERM_ID,
+ .pops = NULL,
+ .label = "null",
+ .desc = "stub permission plugin",
+ .linkage = {NULL, NULL}
+ }
+ }
+};
+
+/*
+ * Local variables:
+ * c-indentation-style: "K&R"
+ * mode-name: "LC"
+ * c-basic-offset: 8
+ * tab-width: 8
+ * fill-column: 79
+ * End:
+ */
diff -urN --no-dereference linux-5.10.2.orig/fs/reiser4/plugin/security/perm.h linux-5.10.2/fs/reiser4/plugin/security/perm.h
--- linux-5.10.2.orig/fs/reiser4/plugin/security/perm.h 1970-01-01 01:00:00.000000000 +0100
+++ linux-5.10.2/fs/reiser4/plugin/security/perm.h 2020-12-23 16:07:46.131813319 +0100
@@ -0,0 +1,38 @@
+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
+
+/* Perm (short for "permissions") plugins common stuff. */
+
+#if !defined( __REISER4_PERM_H__ )
+#define __REISER4_PERM_H__
+
+#include "../../forward.h"
+#include "../plugin_header.h"
+
+#include <linux/types.h>
+
+/* Definition of permission plugin */
+/* NIKITA-FIXME-HANS: define what this is targeted for.
+ It does not seem to be intended for use with sys_reiser4. Explain. */
+
+/* NOTE-EDWARD: This seems to be intended for deprecated sys_reiser4.
+   Consider it a temporary "seam" and a reserved pset member.
+   If you have something useful to add, then rename this plugin and add it here */
+typedef struct perm_plugin {
+ /* generic plugin fields */
+ plugin_header h;
+} perm_plugin;
+
+typedef enum { NULL_PERM_ID, LAST_PERM_ID } reiser4_perm_id;
+
+/* __REISER4_PERM_H__ */
+#endif
+
+/* Make Linus happy.
+ Local variables:
+ c-indentation-style: "K&R"
+ mode-name: "LC"
+ c-basic-offset: 8
+ tab-width: 8
+ fill-column: 120
+ End:
+*/
diff -urN --no-dereference linux-5.10.2.orig/fs/reiser4/plugin/space/bitmap.c linux-5.10.2/fs/reiser4/plugin/space/bitmap.c
--- linux-5.10.2.orig/fs/reiser4/plugin/space/bitmap.c 1970-01-01 01:00:00.000000000 +0100
+++ linux-5.10.2/fs/reiser4/plugin/space/bitmap.c 2020-12-23 16:07:46.132813334 +0100
@@ -0,0 +1,1649 @@
+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
+
+#include "../../debug.h"
+#include "../../dformat.h"
+#include "../../txnmgr.h"
+#include "../../jnode.h"
+#include "../../block_alloc.h"
+#include "../../tree.h"
+#include "../../super.h"
+#include "../plugin.h"
+#include "space_allocator.h"
+#include "bitmap.h"
+
+#include <linux/types.h>
+#include <linux/fs.h> /* for struct super_block */
+#include <linux/mutex.h>
+#include <asm/div64.h>
+
+/* Proposed (but discarded) optimization: dynamic loading/unloading of bitmap
+ * blocks
+
+ A useful optimization of reiser4 bitmap handling would be dynamic bitmap
+ blocks loading/unloading which is different from v3.x where all bitmap
+ blocks are loaded at mount time.
+
+   To implement bitmap block unloading we need to count bitmap block usage
+   and detect currently unused blocks, allowing them to be unloaded. It is not
+   a simple task since we allow several threads to modify one bitmap block
+   simultaneously.
+
+   Briefly speaking, the following scheme is proposed: we keep a counter in a
+   special variable associated with each bitmap block, counting block
+   alloc/dealloc operations on that bitmap block. With the deferred block
+   deallocation feature of reiser4 all those operations will be represented in
+   atom dirty/deleted lists as jnodes for freshly allocated or deleted
+   nodes.
+
+   So, we increment the usage counter for each new node allocated or deleted,
+   and decrement it at atom commit once for each node from the atom's
+   dirty/deleted lists. Of course, deletion of freshly allocated nodes and
+   node reuse from the atom's deleted list (if we do so) should also
+   decrement the bitmap usage counter.
+
+   This scheme seems workable, but such reference counting is not easy to
+   debug. I think we should agree with Hans and not implement it in v4.0.
+   Current code implements "on-demand" bitmap block loading only.
+
+ For simplicity all bitmap nodes (both commit and working bitmap blocks) are
+ loaded into memory on fs mount time or each bitmap nodes are loaded at the
+ first access to it, the "dont_load_bitmap" mount option controls whether
+ bimtap nodes should be loaded at mount time. Dynamic unloading of bitmap
+ nodes currently is not supported. */
+
+#define CHECKSUM_SIZE 4
+
+#define BYTES_PER_LONG (sizeof(long))
+
+#if BITS_PER_LONG == 64
+# define LONG_INT_SHIFT (6)
+#else
+# define LONG_INT_SHIFT (5)
+#endif
+
+#define LONG_INT_MASK (BITS_PER_LONG - 1UL)
+
+typedef unsigned long ulong_t;
+
+#define bmap_size(blocksize) ((blocksize) - CHECKSUM_SIZE)
+#define bmap_bit_count(blocksize) (bmap_size(blocksize) << 3)
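+
+/* Illustrative sketch, assuming a 4096-byte block size: each bitmap block
+   reserves CHECKSUM_SIZE bytes for its adler32 checksum, so
+
+	bmap_size(4096)      == 4096 - 4 == 4092 bytes of bitmap data,
+	bmap_bit_count(4096) == 4092 * 8 == 32736 bits,
+
+   i.e. one bitmap block tracks the allocation state of 32736 fs blocks. */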
+
+/* Block allocation/deallocation are done through special bitmap objects which
+ are allocated in an array at fs mount. */
+struct bitmap_node {
+ struct mutex mutex; /* long term lock object */
+
+ jnode *wjnode; /* j-nodes for WORKING ... */
+ jnode *cjnode; /* ... and COMMIT bitmap blocks */
+
+ bmap_off_t first_zero_bit; /* for skip_busy option implementation */
+
+ atomic_t loaded; /* a flag which shows that bnode is loaded
+ * already */
+};
+
+static inline char *bnode_working_data(struct bitmap_node *bnode)
+{
+ char *data;
+
+ data = jdata(bnode->wjnode);
+ assert("zam-429", data != NULL);
+
+ return data + CHECKSUM_SIZE;
+}
+
+static inline char *bnode_commit_data(const struct bitmap_node *bnode)
+{
+ char *data;
+
+ data = jdata(bnode->cjnode);
+ assert("zam-430", data != NULL);
+
+ return data + CHECKSUM_SIZE;
+}
+
+static inline __u32 bnode_commit_crc(const struct bitmap_node *bnode)
+{
+ char *data;
+
+ data = jdata(bnode->cjnode);
+ assert("vpf-261", data != NULL);
+
+ return le32_to_cpu(get_unaligned((d32 *)data));
+}
+
+static inline void bnode_set_commit_crc(struct bitmap_node *bnode, __u32 crc)
+{
+ char *data;
+
+ data = jdata(bnode->cjnode);
+ assert("vpf-261", data != NULL);
+
+ put_unaligned(cpu_to_le32(crc), (d32 *)data);
+}
+
+/* ZAM-FIXME-HANS: is the idea that this might be a union someday? having
+ * written the code, does this added abstraction still have */
+/* ANSWER(Zam): No, the abstraction is at the level above (the exact place is
+ * the reiser4_space_allocator structure) */
+/* ZAM-FIXME-HANS: I don't understand your english in comment above. */
+/* FIXME-HANS(Zam): I don't understand questions like "might be a union
+ * someday?". What are they about? If there is a reason to have a union, it
+ * should be a union; if not, it should not be a union. "..might be someday"
+ * means no reason. */
+struct bitmap_allocator_data {
+ /* an array for bitmap blocks direct access */
+ struct bitmap_node *bitmap;
+};
+
+#define get_barray(subvol) \
+(((struct bitmap_allocator_data *)(subvol->space_allocator.u.generic))->bitmap)
+
+#define get_bnode(subvol, i) (get_barray(subvol) + i)
+
+/* allocate and initialize jnode with JNODE_BITMAP type */
+static jnode *bnew(reiser4_subvol *subvol)
+{
+ jnode *jal = jalloc();
+
+ if (jal)
+ jnode_init(jal, subvol, JNODE_BITMAP);
+ return jal;
+}
+
+/* this file contains:
+ - bitmap based implementation of space allocation plugin
+ - all the helper functions like set bit, find_first_zero_bit, etc */
+
+/* Audited by: green(2002.06.12) */
+static int find_next_zero_bit_in_word(ulong_t word, int start_bit)
+{
+ ulong_t mask = 1UL << start_bit;
+ int i = start_bit;
+
+ while ((word & mask) != 0) {
+ mask <<= 1;
+ if (++i >= BITS_PER_LONG)
+ break;
+ }
+
+ return i;
+}
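+
+/* A minimal usage sketch: for word == 0xb (binary ...1011) and start_bit == 0
+   the loop skips the set bits 0 and 1 and returns 2, the first zero bit.
+   If every bit at or above @start_bit is set, BITS_PER_LONG is returned. */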
+
+#include <linux/bitops.h>
+
+#if BITS_PER_LONG == 64
+
+#define OFF(addr) (((ulong_t)(addr) & (BYTES_PER_LONG - 1)) << 3)
+#define BASE(addr) ((ulong_t*) ((ulong_t)(addr) & ~(BYTES_PER_LONG - 1)))
+
+static inline void reiser4_set_bit(int nr, void *addr)
+{
+ __test_and_set_bit_le(nr + OFF(addr), BASE(addr));
+}
+
+static inline void reiser4_clear_bit(int nr, void *addr)
+{
+ __test_and_clear_bit_le(nr + OFF(addr), BASE(addr));
+}
+
+static inline int reiser4_test_bit(int nr, void *addr)
+{
+ return test_bit_le(nr + OFF(addr), BASE(addr));
+}
+static inline int reiser4_find_next_zero_bit(void *addr, int maxoffset,
+ int offset)
+{
+ int off = OFF(addr);
+
+ return find_next_zero_bit_le(BASE(addr), maxoffset + off,
+ offset + off) - off;
+}
+
+#else
+
+#define reiser4_set_bit(nr, addr) __test_and_set_bit_le(nr, addr)
+#define reiser4_clear_bit(nr, addr) __test_and_clear_bit_le(nr, addr)
+#define reiser4_test_bit(nr, addr) test_bit_le(nr, addr)
+
+#define reiser4_find_next_zero_bit(addr, maxoffset, offset) \
+find_next_zero_bit_le(addr, maxoffset, offset)
+#endif
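+
+/* A short sketch of the 64-bit case above, with hypothetical values: the
+   little-endian bit helpers expect a long-aligned base address, so for a
+   buffer at, say, address 0x1003 the macros yield
+
+	BASE(0x1003) == 0x1000	(address rounded down to a long boundary)
+	OFF(0x1003)  == 3 * 8	(the dropped bytes re-expressed as bits)
+
+   and every bit number passed to the helpers is shifted by OFF() so that
+   the same bit in memory is addressed through the aligned base. */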
+
+/* Search for a set bit in the bit array [@start_offset, @max_offset[, offsets
+ * are counted from @addr, return the offset of the first bit if it is found,
+ * @maxoffset otherwise. */
+static bmap_off_t __reiser4_find_next_set_bit(void *addr, bmap_off_t max_offset,
+ bmap_off_t start_offset)
+{
+ ulong_t *base = addr;
+	/* start_offset is in bits, convert it to a word index within bitmap. */
+	int word_nr = start_offset >> LONG_INT_SHIFT;
+	/* bit number within the word. */
+ int bit_nr = start_offset & LONG_INT_MASK;
+ int max_word_nr = (max_offset - 1) >> LONG_INT_SHIFT;
+
+ assert("zam-387", max_offset != 0);
+
+ /* Unaligned @start_offset case. */
+ if (bit_nr != 0) {
+ bmap_nr_t nr;
+
+ nr = find_next_zero_bit_in_word(~(base[word_nr]), bit_nr);
+
+ if (nr < BITS_PER_LONG)
+ return (word_nr << LONG_INT_SHIFT) + nr;
+
+ ++word_nr;
+ }
+
+	/* Fast scan through aligned words. */
+ while (word_nr <= max_word_nr) {
+ if (base[word_nr] != 0) {
+ return (word_nr << LONG_INT_SHIFT)
+ + find_next_zero_bit_in_word(~(base[word_nr]), 0);
+ }
+
+ ++word_nr;
+ }
+
+ return max_offset;
+}
+
+#if BITS_PER_LONG == 64
+
+static bmap_off_t reiser4_find_next_set_bit(void *addr, bmap_off_t max_offset,
+ bmap_off_t start_offset)
+{
+ bmap_off_t off = OFF(addr);
+
+ return __reiser4_find_next_set_bit(BASE(addr), max_offset + off,
+ start_offset + off) - off;
+}
+
+#else
+#define reiser4_find_next_set_bit(addr, max_offset, start_offset) \
+ __reiser4_find_next_set_bit(addr, max_offset, start_offset)
+#endif
+
+/* Search a single word backward from @start_bit for a set bit; return its
+   number, or BITS_PER_LONG if no set bit is found at or below @start_bit. */
+static int find_last_set_bit_in_word(ulong_t word, int start_bit)
+{
+ ulong_t bit_mask;
+ int nr = start_bit;
+
+ assert("zam-965", start_bit < BITS_PER_LONG);
+ assert("zam-966", start_bit >= 0);
+
+ bit_mask = (1UL << nr);
+
+ while (bit_mask != 0) {
+ if (bit_mask & word)
+ return nr;
+ bit_mask >>= 1;
+ nr--;
+ }
+ return BITS_PER_LONG;
+}
+
+/* Search the bitmap for a set bit in backward direction, from the end to the
+ * beginning of the given region
+ *
+ * @result: resulting offset of the last set bit
+ * @addr: base memory address,
+ * @low_off: low end of the search region; the edge bit is included,
+ * @high_off: high end of the search region; the edge bit is included,
+ *
+ * @return: 0 - set bit was found, -1 otherwise.
+ */
+static int
+reiser4_find_last_set_bit(bmap_off_t * result, void *addr, bmap_off_t low_off,
+ bmap_off_t high_off)
+{
+ ulong_t *base = addr;
+ int last_word;
+ int first_word;
+ int last_bit;
+ int nr;
+
+ assert("zam-962", high_off >= low_off);
+
+ last_word = high_off >> LONG_INT_SHIFT;
+ last_bit = high_off & LONG_INT_MASK;
+ first_word = low_off >> LONG_INT_SHIFT;
+
+ if (last_bit < BITS_PER_LONG) {
+ nr = find_last_set_bit_in_word(base[last_word], last_bit);
+ if (nr < BITS_PER_LONG) {
+ *result = (last_word << LONG_INT_SHIFT) + nr;
+ return 0;
+ }
+ --last_word;
+ }
+ while (last_word >= first_word) {
+ if (base[last_word] != 0x0) {
+ last_bit =
+ find_last_set_bit_in_word(base[last_word],
+ BITS_PER_LONG - 1);
+ assert("zam-972", last_bit < BITS_PER_LONG);
+ *result = (last_word << LONG_INT_SHIFT) + last_bit;
+ return 0;
+ }
+ --last_word;
+ }
+
+ return -1; /* set bit not found */
+}
+
+/* Search the bitmap for a clear bit in backward direction, from the end to
+ * the beginning of the given region */
+static int
+reiser4_find_last_zero_bit(bmap_off_t * result, void *addr, bmap_off_t low_off,
+ bmap_off_t high_off)
+{
+ ulong_t *base = addr;
+ int last_word;
+ int first_word;
+ int last_bit;
+ int nr;
+
+ last_word = high_off >> LONG_INT_SHIFT;
+ last_bit = high_off & LONG_INT_MASK;
+ first_word = low_off >> LONG_INT_SHIFT;
+
+ if (last_bit < BITS_PER_LONG) {
+ nr = find_last_set_bit_in_word(~base[last_word], last_bit);
+ if (nr < BITS_PER_LONG) {
+ *result = (last_word << LONG_INT_SHIFT) + nr;
+ return 0;
+ }
+ --last_word;
+ }
+ while (last_word >= first_word) {
+ if (base[last_word] != (ulong_t) (-1)) {
+ *result = (last_word << LONG_INT_SHIFT) +
+ find_last_set_bit_in_word(~base[last_word],
+ BITS_PER_LONG - 1);
+ return 0;
+ }
+ --last_word;
+ }
+
+ return -1; /* zero bit not found */
+}
+
+/* Audited by: green(2002.06.12) */
+static void reiser4_clear_bits(char *addr, bmap_off_t start, bmap_off_t end)
+{
+ int first_byte;
+ int last_byte;
+
+ unsigned char first_byte_mask = 0xFF;
+ unsigned char last_byte_mask = 0xFF;
+
+ assert("zam-410", start < end);
+
+ first_byte = start >> 3;
+ last_byte = (end - 1) >> 3;
+
+ if (last_byte > first_byte + 1)
+ memset(addr + first_byte + 1, 0,
+ (size_t) (last_byte - first_byte - 1));
+
+ first_byte_mask >>= 8 - (start & 0x7);
+ last_byte_mask <<= ((end - 1) & 0x7) + 1;
+
+ if (first_byte == last_byte) {
+ addr[first_byte] &= (first_byte_mask | last_byte_mask);
+ } else {
+ addr[first_byte] &= first_byte_mask;
+ addr[last_byte] &= last_byte_mask;
+ }
+}
+
+/* Audited by: green(2002.06.12) */
+/* ZAM-FIXME-HANS: comment this */
+static void reiser4_set_bits(char *addr, bmap_off_t start, bmap_off_t end)
+{
+ int first_byte;
+ int last_byte;
+
+ unsigned char first_byte_mask = 0xFF;
+ unsigned char last_byte_mask = 0xFF;
+
+ assert("zam-386", start < end);
+
+ first_byte = start >> 3;
+ last_byte = (end - 1) >> 3;
+
+ if (last_byte > first_byte + 1)
+ memset(addr + first_byte + 1, 0xFF,
+ (size_t) (last_byte - first_byte - 1));
+
+ first_byte_mask <<= start & 0x7;
+ last_byte_mask >>= 7 - ((end - 1) & 0x7);
+
+ if (first_byte == last_byte) {
+ addr[first_byte] |= (first_byte_mask & last_byte_mask);
+ } else {
+ addr[first_byte] |= first_byte_mask;
+ addr[last_byte] |= last_byte_mask;
+ }
+}
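+
+/* Illustrative sketch: reiser4_set_bits(addr, 3, 11) computes first_byte == 0,
+   last_byte == 1, first_byte_mask == 0xF8 (0xFF << 3) and last_byte_mask ==
+   0x07 (0xFF >> 5), so bits 3..7 of addr[0] and bits 0..2 of addr[1] are set,
+   which is exactly the half-open range [3, 11). Whole bytes lying strictly
+   between first_byte and last_byte are filled with memset(). */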
+
+#define ADLER_BASE 65521
+#define ADLER_NMAX 5552
+
+/* Calculates the adler32 checksum for the data pointed to by `data` of
+   length `len`. This function was originally taken from zlib, version 1.1.3,
+ July 9th, 1998.
+
+ Copyright (C) 1995-1998 Jean-loup Gailly and Mark Adler
+
+ This software is provided 'as-is', without any express or implied
+ warranty. In no event will the authors be held liable for any damages
+ arising from the use of this software.
+
+ Permission is granted to anyone to use this software for any purpose,
+ including commercial applications, and to alter it and redistribute it
+ freely, subject to the following restrictions:
+
+ 1. The origin of this software must not be misrepresented; you must not
+ claim that you wrote the original software. If you use this software
+ in a product, an acknowledgment in the product documentation would be
+ appreciated but is not required.
+ 2. Altered source versions must be plainly marked as such, and must not be
+ misrepresented as being the original software.
+ 3. This notice may not be removed or altered from any source distribution.
+
+ Jean-loup Gailly Mark Adler
+ jloup@gzip.org madler@alumni.caltech.edu
+
+ The above comment applies only to the reiser4_adler32 function.
+*/
+
+__u32 reiser4_adler32(char *data, __u32 len)
+{
+ unsigned char *t = data;
+ __u32 s1 = 1;
+ __u32 s2 = 0;
+ int k;
+
+ while (len > 0) {
+ k = len < ADLER_NMAX ? len : ADLER_NMAX;
+ len -= k;
+
+ while (k--) {
+ s1 += *t++;
+ s2 += s1;
+ }
+
+ s1 %= ADLER_BASE;
+ s2 %= ADLER_BASE;
+ }
+ return (s2 << 16) | s1;
+}
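+
+/* A worked example: for the three bytes "abc"
+
+	s1 = 1 + 'a' + 'b' + 'c' = 1 + 97 + 98 + 99 = 295
+	s2 = 98 + 196 + 295                         = 589
+
+   so reiser4_adler32("abc", 3) == (589 << 16) | 295 == 0x024d0127, which
+   matches the standard Adler-32 value for that string. */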
+
+#define subvol_by_bnode(bnode) (jnode_get_subvol(bnode->wjnode))
+
+static __u32 bnode_calc_crc(const struct bitmap_node *bnode, unsigned long size)
+{
+ return reiser4_adler32(bnode_commit_data(bnode), bmap_size(size));
+}
+
+static int
+bnode_check_adler32(const struct bitmap_node *bnode, unsigned long size)
+{
+ if (bnode_calc_crc(bnode, size) != bnode_commit_crc(bnode)) {
+ bmap_nr_t bmap;
+
+ bmap = bnode - get_bnode(subvol_by_bnode(bnode), 0);
+
+ warning("vpf-263",
+ "Checksum for the bitmap block %llu is incorrect",
+ bmap);
+
+ return RETERR(-EIO);
+ }
+
+ return 0;
+}
+
+#define REISER4_CHECK_BMAP_CRC (0)
+
+#if REISER4_CHECK_BMAP_CRC
+static int bnode_check_crc(const struct bitmap_node *bnode)
+{
+ return bnode_check_adler32(bnode,
+ bmap_size(subvol_by_bnode(bnode)->s_blocksize));
+}
+
+/* REISER4_CHECK_BMAP_CRC */
+#else
+
+#define bnode_check_crc(bnode) (0)
+
+/* REISER4_CHECK_BMAP_CRC */
+#endif
+
+/* Recalculates the adler32 checksum for only 1 byte change.
+ adler - previous adler checksum
+ old_data, data - old, new byte values.
+   tail == (chunk - offset), where chunk is the length the checksum was
+   calculated over and offset is the offset of the changed byte within that
+   chunk.
+   This function can be used for checksum calculation optimisation.
+*/
+
+static __u32
+adler32_recalc(__u32 adler, unsigned char old_data, unsigned char data,
+ __u32 tail)
+{
+ __u32 delta = data - old_data + 2 * ADLER_BASE;
+ __u32 s1 = adler & 0xffff;
+ __u32 s2 = (adler >> 16) & 0xffff;
+
+ s1 = (delta + s1) % ADLER_BASE;
+ s2 = (delta * tail + s2) % ADLER_BASE;
+
+ return (s2 << 16) | s1;
+}
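+
+/* A minimal consistency sketch, assuming a buffer buf[] of length n whose
+   byte at position off changes from old to new: the incremental update and a
+   full recompute should agree, i.e.
+
+	adler32_recalc(reiser4_adler32(buf_before, n), old, new, n - off)
+		== reiser4_adler32(buf_after, n)
+
+   because the change contributes (new - old) to s1 and (n - off) * (new - old)
+   to s2, both taken modulo ADLER_BASE. */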
+
+#define LIMIT(val, boundary) ((val) > (boundary) ? (boundary) : (val))
+
+/**
+ * get_nr_bmap - calculate number of bitmap blocks
+ * @super: super block with initialized blocksize and block count
+ *
+ * Calculates the number of bitmap blocks of a filesystem which uses bitmaps
+ * to maintain free disk space. It assumes that each bitmap addresses the same
+ * number of blocks, which is calculated by the bmap_bit_count macro defined
+ * above. The number of blocks in the subvolume has to be initialized in
+ * struct reiser4_subvol already so that it can be obtained via
+ * reiser4_subvol_block_count(). Unfortunately, the number of blocks addressed
+ * by a bitmap is not a power of 2, because 4 bytes are used for the checksum.
+ * Therefore, we have to use a special function to divide and take the modulo
+ * of 64-bit filesystem block counters.
+ *
+ * Example: suppose a filesystem has 32768 blocks and the blocksize is 4096.
+ * Each bitmap block addresses (4096 - 4) * 8 = 32736 blocks. The number of
+ * bitmaps needed to address all 32768 blocks is (32768 - 1) / 32736 + 1 = 2.
+ */
+static bmap_nr_t get_nr_bmap(const struct super_block *super,
+ reiser4_subvol *subv)
+{
+ u64 quotient;
+
+ assert("zam-393", reiser4_subvol_block_count(subv) != 0);
+
+ quotient = reiser4_subvol_block_count(subv) - 1;
+ do_div(quotient, bmap_bit_count(super->s_blocksize));
+ return quotient + 1;
+}
+
+/**
+ * parse_blocknr - calculate bitmap number and offset in it by block number
+ * @block: pointer to block number to calculate location in bitmap of
+ * @bmap: pointer where to store bitmap block number
+ * @offset: pointer where to store offset within bitmap block
+ * @subv: subvolume where the block is located in
+ *
+ * Calculates location of bit which is responsible for allocation/freeing of
+ * block @*block. That location is represented by bitmap block number and offset
+ * within that bitmap block.
+ */
+static void parse_blocknr(const reiser4_block_nr *block, bmap_nr_t *bmap,
+ bmap_off_t *offset, reiser4_subvol *subv)
+{
+ struct super_block *super = get_current_context()->super;
+ u64 quotient = *block;
+
+ *offset = do_div(quotient, bmap_bit_count(super->s_blocksize));
+ *bmap = quotient;
+
+ assert("zam-433", *bmap < get_nr_bmap(super, subv));
+ assert("", *offset < bmap_bit_count(super->s_blocksize));
+}
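+
+/* Illustrative sketch, assuming 4096-byte blocks (bmap_bit_count == 32736):
+   for block number 40000, do_div() yields quotient 1 and remainder 7264, so
+   the block is governed by bit 7264 of bitmap block 1. */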
+
+#if REISER4_DEBUG
+static void check_block_range(const reiser4_block_nr *start,
+ const reiser4_block_nr *len,
+ reiser4_subvol *subv)
+{
+ struct super_block *sb = reiser4_get_current_sb();
+
+ assert("zam-436", sb != NULL);
+ assert("zam-455", start != NULL);
+ assert("zam-437", *start != 0);
+ assert("zam-541", !reiser4_blocknr_is_fake(start));
+ assert("zam-441", *start < reiser4_subvol_block_count(subv));
+
+ if (len != NULL) {
+ assert("zam-438", *len != 0);
+ assert("zam-442",
+ *start + *len <= reiser4_subvol_block_count(subv));
+ }
+}
+
+static void check_bnode_loaded(const struct bitmap_node *bnode)
+{
+ assert("zam-485", bnode != NULL);
+ assert("zam-483", jnode_page(bnode->wjnode) != NULL);
+ assert("zam-484", jnode_page(bnode->cjnode) != NULL);
+ assert("nikita-2820", jnode_is_loaded(bnode->wjnode));
+ assert("nikita-2821", jnode_is_loaded(bnode->cjnode));
+}
+#else
+
+#define check_block_range(start, len, subv) do { /* nothing */} while(0)
+#define check_bnode_loaded(bnode) do { /* nothing */} while(0)
+
+#endif
+
+/* modify bnode->first_zero_bit (if we free bits before it); the bnode mutex
+   should be held */
+static inline void
+adjust_first_zero_bit(struct bitmap_node *bnode, bmap_off_t offset)
+{
+ if (offset < bnode->first_zero_bit)
+ bnode->first_zero_bit = offset;
+}
+
+/* return a physical disk address for logical bitmap number @bmap */
+/* FIXME-VS: this is somehow related to disk layout? */
+/* ZAM-FIXME-HANS: your answer is? Use not more than one function dereference
+ * per block allocation so that performance is not affected. Probably this
+ * whole file should be considered part of the disk layout plugin, and other
+ * disk layouts can use other defines and efficiency will not be significantly
+ * affected. */
+
+#define REISER4_FIRST_BITMAP_BLOCK \
+ ((REISER4_MASTER_OFFSET / PAGE_SIZE) + 2)
+
+/* Audited by: green(2002.06.12) */
+static void
+get_bitmap_blocknr(reiser4_subvol *subv, bmap_nr_t bmap,
+ reiser4_block_nr * bnr)
+{
+ struct super_block *super = subv->super;
+
+ assert("zam-390", bmap < get_nr_bmap(super, subv));
+
+#ifdef CONFIG_REISER4_BADBLOCKS
+#define BITMAP_PLUGIN_DISKMAP_ID ((0xc0e1<<16) | (0xe0ff))
+	/* Check first whether the diskmap has this entry already. */
+ if (reiser4_get_diskmap_value(BITMAP_PLUGIN_DISKMAP_ID, bmap, bnr) == 0)
+ return; /* Found it in diskmap */
+#endif
+	/* FIXME_ZAM: before discussing disk layouts and disk format
+	   plugins I implement a bitmap location scheme which is close to the
+	   scheme used in reiser 3.6 */
+ if (bmap == 0) {
+ *bnr = REISER4_FIRST_BITMAP_BLOCK;
+ } else {
+ *bnr = bmap * bmap_bit_count(super->s_blocksize);
+ }
+}
+
+/* construct a fake block number for shadow bitmap (WORKING BITMAP) block */
+/* Audited by: green(2002.06.12) */
+static void get_working_bitmap_blocknr(bmap_nr_t bmap, reiser4_block_nr * bnr)
+{
+ *bnr =
+ (reiser4_block_nr) ((bmap & ~REISER4_BLOCKNR_STATUS_BIT_MASK) |
+ REISER4_BITMAP_BLOCKS_STATUS_VALUE);
+}
+
+/* bnode structure initialization */
+static void init_bnode(struct bitmap_node *bnode, bmap_nr_t bmap UNUSED_ARG)
+{
+ memset(bnode, 0, sizeof(struct bitmap_node));
+
+ mutex_init(&bnode->mutex);
+ atomic_set(&bnode->loaded, 0);
+}
+
+static void release(jnode * node)
+{
+ jrelse(node);
+ JF_SET(node, JNODE_HEARD_BANSHEE);
+ jput(node);
+}
+
+/* This function is for internal bitmap.c use because it assumes that the
+   jnode is under full control of this thread */
+static void done_bnode(struct bitmap_node *bnode)
+{
+ if (bnode) {
+ atomic_set(&bnode->loaded, 0);
+ if (bnode->wjnode != NULL)
+ release(bnode->wjnode);
+ if (bnode->cjnode != NULL)
+ release(bnode->cjnode);
+ bnode->wjnode = bnode->cjnode = NULL;
+ }
+}
+
+/*
+ * ZAM-FIXME-HANS: comment this. Called only by load_and_lock_bnode()
+ */
+static int prepare_bnode(struct bitmap_node *bnode, jnode **cjnode_ret,
+ jnode **wjnode_ret, reiser4_subvol *subv)
+{
+ struct super_block *super;
+ jnode *cjnode;
+ jnode *wjnode;
+ bmap_nr_t bmap;
+ int ret;
+
+ super = reiser4_get_current_sb();
+
+ *wjnode_ret = wjnode = bnew(subv);
+ if (wjnode == NULL) {
+ *cjnode_ret = NULL;
+ return RETERR(-ENOMEM);
+ }
+
+ *cjnode_ret = cjnode = bnew(subv);
+ if (cjnode == NULL)
+ return RETERR(-ENOMEM);
+
+ bmap = bnode - get_bnode(subv, 0);
+
+ get_working_bitmap_blocknr(bmap, &wjnode->blocknr);
+ get_bitmap_blocknr(subv, bmap, &cjnode->blocknr);
+
+ jref(cjnode);
+ jref(wjnode);
+
+ /* load commit bitmap */
+ ret = jload_gfp(cjnode, GFP_NOFS, 1);
+
+ if (ret)
+ goto error;
+
+	/* allocate memory for the working bitmap block. Note that for
+	 * bitmaps jinit_new() doesn't actually modify node content,
+	 * so parallel calls to this are ok. */
+ ret = jinit_new(wjnode, GFP_NOFS);
+
+ if (ret != 0) {
+ jrelse(cjnode);
+ goto error;
+ }
+
+ return 0;
+
+ error:
+ jput(cjnode);
+ jput(wjnode);
+ *wjnode_ret = *cjnode_ret = NULL;
+ return ret;
+
+}
+
+/* Check the bnode data on read. */
+static int check_struct_bnode(struct bitmap_node *bnode, __u32 blksize)
+{
+ void *data;
+ int ret;
+
+ /* Check CRC */
+ ret = bnode_check_adler32(bnode, blksize);
+
+ if (ret) {
+ return ret;
+ }
+
+ data = jdata(bnode->cjnode) + CHECKSUM_SIZE;
+
+ /* Check the very first bit -- it must be busy. */
+ if (!reiser4_test_bit(0, data)) {
+ warning("vpf-1362", "The allocator block %llu is not marked "
+ "as used.", (unsigned long long)bnode->cjnode->blocknr);
+
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+/* load bitmap blocks "on-demand" */
+static int load_and_lock_bnode(struct bitmap_node *bnode,
+ reiser4_subvol *subvol)
+{
+ int ret;
+
+ jnode *cjnode;
+ jnode *wjnode;
+
+ assert("nikita-3040", reiser4_schedulable());
+
+/* ZAM-FIXME-HANS: since bitmaps are never unloaded, this does not
+ * need to be atomic, right? Just leave a comment that if bitmaps were
+ * unloadable, this would need to be atomic. */
+ if (atomic_read(&bnode->loaded)) {
+ /* bitmap is already loaded, nothing to do */
+ check_bnode_loaded(bnode);
+ mutex_lock(&bnode->mutex);
+ assert("nikita-2827", atomic_read(&bnode->loaded));
+ return 0;
+ }
+ ret = prepare_bnode(bnode, &cjnode, &wjnode, subvol);
+ if (ret)
+ return ret;
+
+ mutex_lock(&bnode->mutex);
+
+ if (!atomic_read(&bnode->loaded)) {
+ assert("nikita-2822", cjnode != NULL);
+ assert("nikita-2823", wjnode != NULL);
+ assert("nikita-2824", jnode_is_loaded(cjnode));
+ assert("nikita-2825", jnode_is_loaded(wjnode));
+
+ bnode->wjnode = wjnode;
+ bnode->cjnode = cjnode;
+
+ ret = check_struct_bnode(bnode, current_blocksize);
+ if (unlikely(ret != 0))
+ goto error;
+
+ atomic_set(&bnode->loaded, 1);
+ /* working bitmap is initialized by on-disk
+ * commit bitmap. This should be performed
+ * under mutex. */
+ memcpy(bnode_working_data(bnode),
+ bnode_commit_data(bnode),
+ bmap_size(current_blocksize));
+ } else
+ /* race: someone already loaded bitmap
+ * while we were busy initializing data. */
+ check_bnode_loaded(bnode);
+ return 0;
+
+ error:
+ release(wjnode);
+ release(cjnode);
+ bnode->wjnode = NULL;
+ bnode->cjnode = NULL;
+ mutex_unlock(&bnode->mutex);
+ return ret;
+}
+
+static void release_and_unlock_bnode(struct bitmap_node *bnode)
+{
+ check_bnode_loaded(bnode);
+ mutex_unlock(&bnode->mutex);
+}
+
+/* This function does all block allocation work but only for one bitmap
+ block.*/
+/* FIXME_ZAM: It does not allow us to allocate block ranges across bitmap
+   block responsibility zone boundaries. This made no sense in v3.6 but may
+   make sense in v4.x */
+/* ZAM-FIXME-HANS: do you mean search one bitmap block forward? */
+
+static int search_one_bitmap_forward(bmap_nr_t bmap, bmap_off_t *offset,
+ bmap_off_t max_offset, int min_len,
+ int max_len, reiser4_subvol *subv)
+{
+ struct bitmap_node *bnode = get_bnode(subv, bmap);
+
+ char *data;
+
+ bmap_off_t search_end;
+ bmap_off_t start;
+ bmap_off_t end;
+
+ int set_first_zero_bit = 0;
+
+ int ret;
+
+ assert("zam-364", min_len > 0);
+ assert("zam-365", max_len >= min_len);
+ assert("zam-366", *offset <= max_offset);
+
+ ret = load_and_lock_bnode(bnode, subv);
+
+ if (ret)
+ return ret;
+
+ data = bnode_working_data(bnode);
+
+ start = *offset;
+
+ if (bnode->first_zero_bit >= start) {
+ start = bnode->first_zero_bit;
+ set_first_zero_bit = 1;
+ }
+
+ while (start + min_len < max_offset) {
+
+ start =
+ reiser4_find_next_zero_bit((long *)data, max_offset, start);
+ if (set_first_zero_bit) {
+ bnode->first_zero_bit = start;
+ set_first_zero_bit = 0;
+ }
+ if (start >= max_offset)
+ break;
+
+ search_end = LIMIT(start + max_len, max_offset);
+ end =
+ reiser4_find_next_set_bit((long *)data, search_end, start);
+ if (end >= start + min_len) {
+			/* we can't trust the find_next_set_bit result if a
+			   set bit was not found; the result may be bigger
+			   than max_offset */
+ if (end > search_end)
+ end = search_end;
+
+ ret = end - start;
+ *offset = start;
+
+ reiser4_set_bits(data, start, end);
+
+ /* FIXME: we may advance first_zero_bit if [start,
+ end] region overlaps the first_zero_bit point */
+
+ break;
+ }
+
+ start = end + 1;
+ }
+
+ release_and_unlock_bnode(bnode);
+
+ return ret;
+}
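+
+/* Illustrative sketch of the semantics above: with min_len == 1 and
+   max_len == 8, a call that finds a run of 5 free bits starting at bit 100
+   sets bits [100, 105) in the working bitmap, stores 100 in *offset and
+   returns 5. If no run of at least min_len free bits exists in
+   [*offset, max_offset), 0 is returned. */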
+
+static int search_one_bitmap_backward(bmap_nr_t bmap, bmap_off_t *start_offset,
+ bmap_off_t end_offset, int min_len,
+ int max_len, reiser4_subvol *subv)
+{
+ struct bitmap_node *bnode = get_bnode(subv, bmap);
+ char *data;
+ bmap_off_t start;
+ int ret;
+
+ assert("zam-958", min_len > 0);
+ assert("zam-959", max_len >= min_len);
+ assert("zam-960", *start_offset >= end_offset);
+
+ ret = load_and_lock_bnode(bnode, subv);
+ if (ret)
+ return ret;
+
+ data = bnode_working_data(bnode);
+ start = *start_offset;
+
+ while (1) {
+ bmap_off_t end, search_end;
+
+ /* Find the beginning of the zero filled region */
+ if (reiser4_find_last_zero_bit(&start, data, end_offset, start))
+ break;
+		/* Are there at least `min_len' bits from `end_offset' to
+		 * `start'? */
+ if (start < end_offset + min_len - 1)
+ break;
+
+		/* Do not search all the way down to `end_offset' if at most
+		 * `max_len' zero bits are needed. */
+ if (end_offset + max_len - 1 < start)
+ search_end = start - max_len + 1;
+ else
+ search_end = end_offset;
+
+ if (reiser4_find_last_set_bit(&end, data, search_end, start))
+ end = search_end;
+ else
+ end++;
+
+ if (end + min_len <= start + 1) {
+ if (end < search_end)
+ end = search_end;
+ ret = start - end + 1;
+ *start_offset = end; /* `end' is lowest offset */
+ assert("zam-987",
+ reiser4_find_next_set_bit(data, start + 1,
+ end) >= start + 1);
+ reiser4_set_bits(data, end, start + 1);
+ break;
+ }
+
+ if (end <= end_offset)
+ /* left search boundary reached. */
+ break;
+ start = end - 1;
+ }
+
+ release_and_unlock_bnode(bnode);
+ return ret;
+}
+
+/* allocate contiguous range of blocks in bitmap */
+static int bitmap_alloc_forward(reiser4_block_nr *start,
+ const reiser4_block_nr *end,
+ int min_len, int max_len,
+ reiser4_subvol *subv)
+{
+ bmap_nr_t bmap, end_bmap;
+ bmap_off_t offset, end_offset;
+ int len;
+
+ reiser4_block_nr tmp;
+
+ struct super_block *super = get_current_context()->super;
+ const bmap_off_t max_offset = bmap_bit_count(super->s_blocksize);
+
+ parse_blocknr(start, &bmap, &offset, subv);
+
+ tmp = *end - 1;
+ parse_blocknr(&tmp, &end_bmap, &end_offset, subv);
+ ++end_offset;
+
+ assert("zam-358", end_bmap >= bmap);
+ assert("zam-359", ergo(end_bmap == bmap, end_offset >= offset));
+
+ for (; bmap < end_bmap; bmap++, offset = 0) {
+ len = search_one_bitmap_forward(bmap, &offset, max_offset,
+ min_len, max_len, subv);
+ if (len != 0)
+ goto out;
+ }
+
+ len = search_one_bitmap_forward(bmap, &offset, end_offset, min_len,
+ max_len, subv);
+ out:
+ *start = bmap * max_offset + offset;
+ return len;
+}
+
+/*
+ * allocate contiguous range of blocks in bitmap (from @start to @end in
+ * backward direction)
+ */
+static int bitmap_alloc_backward(reiser4_block_nr *start,
+ const reiser4_block_nr *end, int min_len,
+ int max_len, reiser4_subvol *subv)
+{
+ bmap_nr_t bmap, end_bmap;
+ bmap_off_t offset, end_offset;
+ int len;
+ struct super_block *super = get_current_context()->super;
+ const bmap_off_t max_offset = bmap_bit_count(super->s_blocksize);
+
+ parse_blocknr(start, &bmap, &offset, subv);
+ parse_blocknr(end, &end_bmap, &end_offset, subv);
+
+ assert("zam-961", end_bmap <= bmap);
+ assert("zam-962", ergo(end_bmap == bmap, end_offset <= offset));
+
+ for (; bmap > end_bmap; bmap--, offset = max_offset - 1) {
+ len = search_one_bitmap_backward(bmap, &offset, 0, min_len,
+ max_len, subv);
+ if (len != 0)
+ goto out;
+ }
+ len = search_one_bitmap_backward(bmap, &offset, end_offset, min_len,
+ max_len, subv);
+ out:
+ *start = bmap * max_offset + offset;
+ return len;
+}
+
+/*
+ * plugin->u.space_allocator.alloc_blocks()
+ */
+static int alloc_blocks_forward(reiser4_blocknr_hint *hint, int needed,
+ reiser4_block_nr *start, reiser4_block_nr *len,
+ reiser4_subvol *subv)
+{
+ int actual_len;
+
+ reiser4_block_nr search_start;
+ reiser4_block_nr search_end;
+
+ assert("zam-398", subv != NULL);
+ assert("zam-412", hint != NULL);
+ assert("zam-397", hint->blk <= reiser4_subvol_block_count(subv));
+
+ if (hint->max_dist == 0)
+ search_end = reiser4_subvol_block_count(subv);
+ else
+ search_end = LIMIT(hint->blk + hint->max_dist,
+ reiser4_subvol_block_count(subv));
+
+ /* We use @hint -> blk as a search start and search from it to the end
+ of the disk or in given region if @hint -> max_dist is not zero */
+ search_start = hint->blk;
+
+ actual_len = bitmap_alloc_forward(&search_start, &search_end, 1,
+ needed, subv);
+
+ /* There is only one bitmap search if max_dist was specified or first
+ pass was from the beginning of the bitmap. We also do one pass for
+ scanning bitmap in backward direction. */
+ if (!(actual_len != 0 || hint->max_dist != 0 || search_start == 0)) {
+ /* next step is a scanning from 0 to search_start */
+ search_end = search_start;
+ search_start = 0;
+ actual_len =
+ bitmap_alloc_forward(&search_start, &search_end, 1,
+ needed, subv);
+ }
+ if (actual_len == 0)
+ return RETERR(-ENOSPC);
+ if (actual_len < 0)
+ return RETERR(actual_len);
+ *len = actual_len;
+ *start = search_start;
+ return 0;
+}
+
+static int alloc_blocks_backward(reiser4_blocknr_hint *hint, int needed,
+ reiser4_block_nr *start, reiser4_block_nr *len,
+ reiser4_subvol *subv)
+{
+ reiser4_block_nr search_start;
+ reiser4_block_nr search_end;
+ int actual_len;
+
+ ON_DEBUG(struct super_block *super = reiser4_get_current_sb());
+
+ assert("zam-969", super != NULL);
+ assert("zam-970", hint != NULL);
+ assert("zam-971", hint->blk <= reiser4_subvol_block_count(subv));
+
+ search_start = hint->blk;
+ if (hint->max_dist == 0 || search_start <= hint->max_dist)
+ search_end = 0;
+ else
+ search_end = search_start - hint->max_dist;
+
+ actual_len = bitmap_alloc_backward(&search_start, &search_end, 1,
+ needed, subv);
+ if (actual_len == 0)
+ return RETERR(-ENOSPC);
+ if (actual_len < 0)
+ return RETERR(actual_len);
+ *len = actual_len;
+ *start = search_start;
+ return 0;
+}
+
+/*
+ * plugin->u.space_allocator.alloc_blocks()
+ */
+int reiser4_alloc_blocks_bitmap(reiser4_space_allocator *allocator,
+ reiser4_blocknr_hint *hint, int needed,
+ reiser4_block_nr *start, reiser4_block_nr *len,
+ reiser4_subvol *subv)
+{
+ if (hint->backward)
+ return alloc_blocks_backward(hint, needed, start, len, subv);
+ return alloc_blocks_forward(hint, needed, start, len, subv);
+}
+
+/* plugin->u.space_allocator.dealloc_blocks(). */
+/*
+ * It just frees blocks in the WORKING BITMAP. Usually the deletion of
+ * formatted and unformatted nodes is deferred until transaction commit.
+ * However, deallocation of temporary objects such as wandered blocks and
+ * transaction commit records requires immediate node deletion from the
+ * WORKING BITMAP
+ */
+void reiser4_dealloc_blocks_bitmap(reiser4_space_allocator * allocator,
+ reiser4_block_nr start, reiser4_block_nr len,
+ reiser4_subvol *subv)
+{
+ bmap_nr_t bmap;
+ bmap_off_t offset;
+
+ struct bitmap_node *bnode;
+ int ret;
+
+ assert("zam-468", len != 0);
+ check_block_range(&start, &len, subv);
+ parse_blocknr(&start, &bmap, &offset, subv);
+
+ assert("zam-469",
+ offset + len <= bmap_bit_count(subv->super->s_blocksize));
+
+ bnode = get_bnode(subv, bmap);
+
+ assert("zam-470", bnode != NULL);
+
+ ret = load_and_lock_bnode(bnode, subv);
+ assert("zam-481", ret == 0);
+
+ reiser4_clear_bits(bnode_working_data(bnode), offset,
+ (bmap_off_t) (offset + len));
+
+ adjust_first_zero_bit(bnode, offset);
+
+ release_and_unlock_bnode(bnode);
+}
+
+static int check_blocks_one_bitmap(bmap_nr_t bmap, bmap_off_t start_offset,
+ bmap_off_t end_offset, int desired,
+ reiser4_subvol *subv)
+{
+ struct bitmap_node *bnode = get_bnode(subv, bmap);
+ int ret;
+
+ assert("nikita-2215", bnode != NULL);
+
+ ret = load_and_lock_bnode(bnode, subv);
+ assert("zam-626", ret == 0);
+
+ assert("nikita-2216", jnode_is_loaded(bnode->wjnode));
+
+ if (desired) {
+ ret = reiser4_find_next_zero_bit(bnode_working_data(bnode),
+ end_offset, start_offset)
+ >= end_offset;
+ } else {
+ ret = reiser4_find_next_set_bit(bnode_working_data(bnode),
+ end_offset, start_offset)
+ >= end_offset;
+ }
+
+ release_and_unlock_bnode(bnode);
+
+ return ret;
+}
+
+/* plugin->u.space_allocator.check_blocks(). */
+int reiser4_check_blocks_bitmap(const reiser4_block_nr *start,
+ const reiser4_block_nr *len, int desired,
+ reiser4_subvol *subv)
+{
+ struct super_block *super = reiser4_get_current_sb();
+
+ reiser4_block_nr end;
+ bmap_nr_t bmap, end_bmap;
+ bmap_off_t offset, end_offset;
+ const bmap_off_t max_offset = bmap_bit_count(super->s_blocksize);
+
+ assert("intelfx-9", start != NULL);
+ assert("intelfx-10", ergo(len != NULL, *len > 0));
+
+ if (len != NULL) {
+ check_block_range(start, len, subv);
+ end = *start + *len - 1;
+ } else {
+ /*
+ * on next line, end is used as temporary len
+ * for check_block_range()
+ */
+ end = 1; check_block_range(start, &end, subv);
+ end = *start;
+ }
+
+ parse_blocknr(start, &bmap, &offset, subv);
+
+ if (end == *start) {
+ end_bmap = bmap;
+ end_offset = offset;
+ } else {
+ parse_blocknr(&end, &end_bmap, &end_offset, subv);
+ }
+ ++end_offset;
+
+ assert("intelfx-4", end_bmap >= bmap);
+ assert("intelfx-5", ergo(end_bmap == bmap, end_offset >= offset));
+
+ for (; bmap < end_bmap; bmap++, offset = 0) {
+ if (!check_blocks_one_bitmap(bmap, offset, max_offset, desired,
+ subv)) {
+ return 0;
+ }
+ }
+ return check_blocks_one_bitmap(bmap, offset, end_offset, desired,
+ subv);
+}
+
+/* conditional insertion of @node into atom's overwrite set if it was not there */
+static void cond_add_to_overwrite_set(txn_atom *atom, jnode *node)
+{
+ assert("zam-546", atom != NULL);
+ assert("zam-547", atom->stage == ASTAGE_PRE_COMMIT);
+ assert("zam-548", node != NULL);
+ assert("edward-1782", node->subvol != NULL);
+
+ spin_lock_atom(atom);
+ spin_lock_jnode(node);
+
+ if (node->atom == NULL) {
+ JF_SET(node, JNODE_OVRWR);
+ insert_into_atom_ovrwr_list(atom, node);
+ } else {
+ assert("zam-549", node->atom == atom);
+ }
+
+ spin_unlock_jnode(node);
+ spin_unlock_atom(atom);
+}
+
+/*
+ * an actor which applies a single delete set entry to COMMIT bitmap
+ */
+static int apply_dset_to_commit_bmap(txn_atom *atom,
+ const reiser4_block_nr *start,
+ const reiser4_block_nr * len,
+ __u32 subv_id, void *data)
+{
+ int ret;
+ bmap_nr_t bmap;
+ bmap_off_t offset;
+ struct bitmap_node *bnode;
+ struct super_block *sb = reiser4_get_current_sb();
+ reiser4_subvol *subv = current_origin(subv_id);
+
+ check_block_range(start, len, subv);
+
+ parse_blocknr(start, &bmap, &offset, subv);
+
+ /* FIXME-ZAM: we assume that all block ranges are allocated by this
+ bitmap-based allocator and each block range can't go over a zone of
+ responsibility of one bitmap block; same assumption is used in
+ other journal hooks in bitmap code. */
+ bnode = get_bnode(subv, bmap);
+ assert("zam-448", bnode != NULL);
+
+	/* it is safe to unlock an atom which is in ASTAGE_PRE_COMMIT */
+ assert("zam-767", atom->stage == ASTAGE_PRE_COMMIT);
+ ret = load_and_lock_bnode(bnode, subv);
+ if (ret)
+ return ret;
+
+ /* put bnode into atom's overwrite set */
+ cond_add_to_overwrite_set(atom, bnode->cjnode);
+
+ data = bnode_commit_data(bnode);
+
+ ret = bnode_check_crc(bnode);
+ if (ret != 0)
+ return ret;
+
+ if (len != NULL) {
+ /* FIXME-ZAM: a check that all bits are set should be there */
+ assert("zam-443",
+ offset + *len <= bmap_bit_count(sb->s_blocksize));
+ reiser4_clear_bits(data, offset, (bmap_off_t) (offset + *len));
+
+ subv->blocks_freed += *len;
+ } else {
+ reiser4_clear_bit(offset, data);
+ subv->blocks_freed ++;
+ }
+ bnode_set_commit_crc(bnode, bnode_calc_crc(bnode, sb->s_blocksize));
+
+ release_and_unlock_bnode(bnode);
+
+ return 0;
+}
+
+/* plugin->u.space_allocator.pre_commit_hook(). */
+/* It just applies transaction changes to fs-wide COMMIT BITMAP, hoping the
+ rest is done by transaction manager (allocate wandered locations for COMMIT
+ BITMAP blocks, copy COMMIT BITMAP blocks data). */
+/* Only one instance of this function can be running at any given time, because
+   only one transaction can be committed at a time; therefore it is safe to
+   access some global variables without any locking */
+
+int reiser4_pre_commit_hook_bitmap(void)
+{
+ txn_atom *atom;
+ struct rb_node *node;
+ struct super_block *super = reiser4_get_current_sb();
+ reiser4_super_info_data *sbinfo = get_super_private(super);
+
+ atom = get_current_atom_locked();
+ assert("zam-876", atom->stage == ASTAGE_PRE_COMMIT);
+ spin_unlock_atom(atom);
+
+ {
+ /*
+ * scan atom's captured list and find all freshly allocated
+		 * nodes, mark the corresponding bits in the COMMIT BITMAP as used
+ */
+ struct list_head *head = ATOM_CLEAN_LIST(atom);
+ jnode *node = list_entry(head->next, jnode, capture_link);
+
+ while (head != &node->capture_link) {
+ /* we detect freshly allocated jnodes */
+ if (JF_ISSET(node, JNODE_RELOC)) {
+ int ret;
+ bmap_nr_t bmap;
+
+ bmap_off_t offset;
+ bmap_off_t index;
+ struct bitmap_node *bn;
+ __u32 size = bmap_size(super->s_blocksize);
+ __u32 crc;
+ char byte;
+
+ assert("zam-559", !JF_ISSET(node, JNODE_OVRWR));
+ assert("zam-460",
+ !reiser4_blocknr_is_fake(&node->blocknr));
+
+ parse_blocknr(&node->blocknr,
+ &bmap, &offset, node->subvol);
+ bn = get_bnode(node->subvol, bmap);
+
+ index = offset >> 3;
+ assert("vpf-276", index < size);
+
+				ret = bnode_check_crc(bn);
+ if (ret != 0)
+ return ret;
+
+ check_bnode_loaded(bn);
+ load_and_lock_bnode(bn, node->subvol);
+
+ byte = *(bnode_commit_data(bn) + index);
+ reiser4_set_bit(offset, bnode_commit_data(bn));
+
+ crc = adler32_recalc(bnode_commit_crc(bn), byte,
+ *(bnode_commit_data(bn) +
+ index),
+						     size - index);
+ bnode_set_commit_crc(bn, crc);
+
+ release_and_unlock_bnode(bn);
+
+ ret = bnode_check_crc(bn);
+ if (ret != 0)
+ return ret;
+ /*
+				 * the correctness of this depends on how new
+				 * j-nodes are inserted into the clean list,
+				 * because we are scanning that same list now.
+				 * It is OK if insertion is done at the list
+				 * front
+ */
+ cond_add_to_overwrite_set(atom, bn->cjnode);
+ }
+ node = list_entry(node->capture_link.next,
+ jnode, capture_link);
+ }
+ }
+#if 1
+ /*
+	 * make sure that ->blocks_freed of all bricks is properly initialized
+ */
+ for (node = rb_first(&atom->bricks_info);
+ node;
+ node = rb_next(node)) {
+ reiser4_subvol *subv;
+ struct atom_brick_info *abi;
+
+ abi = rb_entry(node, struct atom_brick_info, node);
+ subv = current_origin(abi->brick_id);
+ subv->blocks_freed = 0;
+ }
+#endif
+ /*
+ * This will update ->blocks_freed of every abi
+ */
+ atom_dset_deferred_apply(atom, apply_dset_to_commit_bmap, NULL, 0);
+ /*
+ * Finally, update "committed" version of free blocks counters
+ * for all bricks, which participate in the transaction
+ */
+ spin_lock_reiser4_super(sbinfo); /* FIXME-EDWARD: lock respective
+ subvolume instead of super-block */
+ for (node = rb_first(&atom->bricks_info);
+ node;
+ node = rb_next(node)) {
+
+ reiser4_subvol *subv;
+ struct atom_brick_info *abi;
+
+ abi = rb_entry(node, struct atom_brick_info, node);
+ subv = current_origin(abi->brick_id);
+
+ subv->blocks_free_committed +=
+ (subv->blocks_freed - abi->nr_blocks_allocated);
+ }
+ spin_unlock_reiser4_super(sbinfo);
+ return 0;
+}
+
+/*
+ * plugin->u.space_allocator.init_allocator constructor
+ * of reiser4_space_allocator object. It is called on fs mount
+ */
+int reiser4_init_allocator_bitmap(reiser4_space_allocator *allocator,
+ const struct super_block *super,
+ reiser4_subvol *subv, void *arg)
+{
+ struct bitmap_allocator_data *data = NULL;
+ bmap_nr_t bitmap_blocks_nr;
+ bmap_nr_t i;
+
+ assert("nikita-3039", reiser4_schedulable());
+ /*
+ * getting memory for bitmap allocator private data holder
+ */
+ data = kmalloc(sizeof(struct bitmap_allocator_data),
+ reiser4_ctx_gfp_mask_get());
+ if (data == NULL)
+ return RETERR(-ENOMEM);
+ /*
+ * allocate and initialize array of bnodes
+ */
+ bitmap_blocks_nr = get_nr_bmap(super, subv);
+ /*
+ FIXME-ZAM: it is not clear what to do with huge number of bitmaps
+ which is bigger than 2^32 (= 8 * 4096 * 4096 * 2^32 bytes = 5.76e+17,
+ may I never meet someone who still uses the ia32 architecture when
+ storage devices of that size enter the market, and wants to use ia32
+ with that storage device, much less reiser4. ;-) -Hans).
+ Kmalloc is not possible and, probably, another dynamic data structure
+ should replace a static array of bnodes
+
+ data->bitmap = reiser4_kmalloc((size_t)
+ (sizeof (struct bitmap_node) * bitmap_blocks_nr), GFP_KERNEL);
+ */
+ data->bitmap =
+ reiser4_vmalloc(sizeof(struct bitmap_node) * bitmap_blocks_nr);
+ if (data->bitmap == NULL) {
+ kfree(data);
+ return RETERR(-ENOMEM);
+ }
+ for (i = 0; i < bitmap_blocks_nr; i++)
+ init_bnode(data->bitmap + i, i);
+
+ allocator->u.generic = data;
+
+ if (!test_bit(REISER4_DONT_LOAD_BITMAP,
+ &get_super_private(super)->fs_flags)) {
+ /*
+ * Load all bitmap blocks at mount time
+ */
+ __u64 start_time, elapsed_time;
+ struct bitmap_node *bnode;
+ int ret;
+
+ if (REISER4_DEBUG)
+ printk(KERN_INFO "loading reiser4 bitmap...");
+ start_time = jiffies;
+
+ for (i = 0; i < bitmap_blocks_nr; i++) {
+ bnode = data->bitmap + i;
+ ret = load_and_lock_bnode(bnode, subv);
+ if (ret) {
+ reiser4_destroy_allocator_bitmap(allocator,
+ super, subv);
+ return ret;
+ }
+ release_and_unlock_bnode(bnode);
+ }
+ elapsed_time = jiffies - start_time;
+ if (REISER4_DEBUG)
+ printk("...done (%llu jiffies)\n",
+ (unsigned long long)elapsed_time);
+ }
+ return 0;
+}
+
+/**
+ * plugin->u.space_allocator.destroy_allocator
+ * destructor. It is called on fs unmount
+ */
+int reiser4_destroy_allocator_bitmap(reiser4_space_allocator *allocator,
+ const struct super_block *super,
+ reiser4_subvol *subv)
+{
+ bmap_nr_t bitmap_blocks_nr;
+ bmap_nr_t i;
+
+ struct bitmap_allocator_data *data = allocator->u.generic;
+
+ assert("zam-414", data != NULL);
+ assert("zam-376", data->bitmap != NULL);
+
+ bitmap_blocks_nr = get_nr_bmap(super, subv);
+
+ for (i = 0; i < bitmap_blocks_nr; i++) {
+ struct bitmap_node *bnode = data->bitmap + i;
+
+ mutex_lock(&bnode->mutex);
+
+#if REISER4_DEBUG
+ if (atomic_read(&bnode->loaded)) {
+ jnode *wj = bnode->wjnode;
+ jnode *cj = bnode->cjnode;
+
+ assert("zam-480", jnode_page(cj) != NULL);
+ assert("zam-633", jnode_page(wj) != NULL);
+
+ assert("zam-634",
+			       memcmp(jdata(wj), jdata(cj),
+ bmap_size(super->s_blocksize)) == 0);
+
+ }
+#endif
+ done_bnode(bnode);
+ mutex_unlock(&bnode->mutex);
+ }
+
+ vfree(data->bitmap);
+ kfree(data);
+
+ allocator->u.generic = NULL;
+
+ return 0;
+}
+
+/*
+ * Local variables:
+ * c-indentation-style: "K&R"
+ * mode-name: "LC"
+ * c-basic-offset: 8
+ * tab-width: 8
+ * fill-column: 79
+ * scroll-step: 1
+ * End:
+ */
diff -urN --no-dereference linux-5.10.2.orig/fs/reiser4/plugin/space/bitmap.h linux-5.10.2/fs/reiser4/plugin/space/bitmap.h
--- linux-5.10.2.orig/fs/reiser4/plugin/space/bitmap.h 1970-01-01 01:00:00.000000000 +0100
+++ linux-5.10.2/fs/reiser4/plugin/space/bitmap.h 2020-12-23 16:07:46.132813334 +0100
@@ -0,0 +1,49 @@
+/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
+
+#if !defined (__REISER4_PLUGIN_SPACE_BITMAP_H__)
+#define __REISER4_PLUGIN_SPACE_BITMAP_H__
+
+#include "../../dformat.h"
+#include "../../block_alloc.h"
+
+#include <linux/types.h> /* for __u?? */
+#include <linux/fs.h> /* for struct super_block */
+/* EDWARD-FIXME-HANS: write something as informative as the below for every .h file lacking it. */
+/* declarations of functions implementing methods of space allocator plugin for
+ bitmap based allocator. The functions themselves are in bitmap.c */
+extern int reiser4_init_allocator_bitmap(reiser4_space_allocator *,
+ const struct super_block *,
+ reiser4_subvol *, void *);
+extern int reiser4_destroy_allocator_bitmap(reiser4_space_allocator *,
+ const struct super_block *,
+ reiser4_subvol *);
+extern int reiser4_alloc_blocks_bitmap(reiser4_space_allocator *,
+ reiser4_blocknr_hint *, int needed,
+ reiser4_block_nr *start,
+ reiser4_block_nr *len, reiser4_subvol *);
+extern int reiser4_check_blocks_bitmap(const reiser4_block_nr *,
+ const reiser4_block_nr *, int, reiser4_subvol *);
+extern void reiser4_dealloc_blocks_bitmap(reiser4_space_allocator *,
+ reiser4_block_nr,
+ reiser4_block_nr, reiser4_subvol *);
+extern int reiser4_pre_commit_hook_bitmap(void);
+
+#define reiser4_post_commit_hook_bitmap() do{}while(0)
+#define reiser4_post_write_back_hook_bitmap() do{}while(0)
+#define reiser4_print_info_bitmap(pref, al) do{}while(0)
+
+typedef __u64 bmap_nr_t;
+typedef __u32 bmap_off_t;
+
+#endif /* __REISER4_PLUGIN_SPACE_BITMAP_H__ */
+
+/* Make Linus happy.
+ Local variables:
+ c-indentation-style: "K&R"
+ mode-name: "LC"
+ c-basic-offset: 8
+ tab-width: 8
+ fill-column: 120
+ scroll-step: 1
+ End:
+*/
diff -urN --no-dereference linux-5.10.2.orig/fs/reiser4/plugin/space/Makefile linux-5.10.2/fs/reiser4/plugin/space/Makefile
--- linux-5.10.2.orig/fs/reiser4/plugin/space/Makefile 1970-01-01 01:00:00.000000000 +0100
+++ linux-5.10.2/fs/reiser4/plugin/space/Makefile 2020-12-23 16:07:46.132813334 +0100
@@ -0,0 +1,4 @@
+obj-$(CONFIG_REISER4_FS) += space_plugins.o
+
+space_plugins-objs := \
+ bitmap.o
diff -urN --no-dereference linux-5.10.2.orig/fs/reiser4/plugin/space/space_allocator.h linux-5.10.2/fs/reiser4/plugin/space/space_allocator.h
--- linux-5.10.2.orig/fs/reiser4/plugin/space/space_allocator.h 1970-01-01 01:00:00.000000000 +0100
+++ linux-5.10.2/fs/reiser4/plugin/space/space_allocator.h 2020-12-23 16:07:46.132813334 +0100
@@ -0,0 +1,85 @@
+/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
+
+#ifndef __SPACE_ALLOCATOR_H__
+#define __SPACE_ALLOCATOR_H__
+
+#include "../../forward.h"
+#include "bitmap.h"
+/* NIKITA-FIXME-HANS: surely this could use a comment. Something about how bitmap is the only space allocator for now,
+ * but... */
+#define DEF_SPACE_ALLOCATOR(allocator) \
+ \
+static inline int sa_init_allocator (reiser4_space_allocator * al, const struct super_block *s, \
+ reiser4_subvol *subv, void * opaque) \
+{ \
+ return reiser4_init_allocator_##allocator (al, s, subv, opaque); \
+} \
+ \
+static inline void sa_destroy_allocator (reiser4_space_allocator *al, const struct super_block *s, \
+ reiser4_subvol *subv) \
+{ \
+ reiser4_destroy_allocator_##allocator (al, s, subv); \
+} \
+ \
+static inline int sa_alloc_blocks (reiser4_space_allocator *al, reiser4_blocknr_hint * hint, \
+ int needed, reiser4_block_nr * start, reiser4_block_nr * len, \
+ reiser4_subvol *subv) \
+{ \
+ return reiser4_alloc_blocks_##allocator (al, hint, needed, start, len, subv); \
+} \
+static inline void sa_dealloc_blocks (reiser4_space_allocator * al, reiser4_block_nr start, reiser4_block_nr len, \
+ reiser4_subvol *subv) \
+{ \
+ reiser4_dealloc_blocks_##allocator (al, start, len, subv); \
+} \
+ \
+static inline int sa_check_blocks (const reiser4_block_nr * start, const reiser4_block_nr * end, int desired, \
+ reiser4_subvol *subv) \
+{ \
+ return reiser4_check_blocks_##allocator (start, end, desired, subv); \
+} \
+ \
+static inline void sa_pre_commit_hook (void) \
+{ \
+ reiser4_pre_commit_hook_##allocator (); \
+} \
+ \
+static inline void sa_post_commit_hook (void) \
+{ \
+ reiser4_post_commit_hook_##allocator (); \
+} \
+ \
+static inline void sa_post_write_back_hook (void) \
+{ \
+ reiser4_post_write_back_hook_##allocator(); \
+} \
+ \
+static inline void sa_print_info(const char * prefix, reiser4_space_allocator * al) \
+{ \
+ reiser4_print_info_##allocator (prefix, al); \
+}
+
+DEF_SPACE_ALLOCATOR(bitmap)
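+
+/* A sketch of what the macro invocation above generates: thin static inline
+   wrappers that forward to the bitmap allocator, e.g.
+
+	static inline int sa_alloc_blocks(reiser4_space_allocator *al,
+					  reiser4_blocknr_hint *hint, int needed,
+					  reiser4_block_nr *start,
+					  reiser4_block_nr *len,
+					  reiser4_subvol *subv)
+	{
+		return reiser4_alloc_blocks_bitmap(al, hint, needed,
+						   start, len, subv);
+	}
+
+   so callers use the sa_*() names without depending on the allocator type. */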
+
+/* this object is part of reiser4 private in-core super block */
+struct reiser4_space_allocator {
+ union {
+ /* space allocators might use this pointer to reference their
+ * data. */
+ void *generic;
+ } u;
+};
+
+/* __SPACE_ALLOCATOR_H__ */
+#endif
+
+/* Make Linus happy.
+ Local variables:
+ c-indentation-style: "K&R"
+ mode-name: "LC"
+ c-basic-offset: 8
+ tab-width: 8
+ fill-column: 120
+ scroll-step: 1
+ End:
+*/
diff -urN --no-dereference linux-5.10.2.orig/fs/reiser4/plugin/tail_policy.c linux-5.10.2/fs/reiser4/plugin/tail_policy.c
--- linux-5.10.2.orig/fs/reiser4/plugin/tail_policy.c 1970-01-01 01:00:00.000000000 +0100
+++ linux-5.10.2/fs/reiser4/plugin/tail_policy.c 2020-12-23 16:07:46.132813334 +0100
@@ -0,0 +1,113 @@
+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
+ * reiser4/README */
+
+/* Formatting policy plugins */
+
+/*
+ * The formatting policy plugin is used by the object plugin (of a regular
+ * file) to convert a file between two representations.
+ *
+ * Currently the following policies are implemented:
+ * never store file in formatted nodes
+ * always store file in formatted nodes
+ * store file in formatted nodes if file is smaller than 4 blocks (default)
+ */
+
+#include "../tree.h"
+#include "../inode.h"
+#include "../super.h"
+#include "object.h"
+#include "plugin.h"
+#include "node/node.h"
+#include "plugin_header.h"
+
+#include <linux/pagemap.h>
+#include <linux/fs.h> /* For struct inode */
+
+/**
+ * have_formatting_never - formatting policy that never stores tails
+ * @inode: inode to operate on
+ * @size: new object size
+ *
+ * Never store the file's tail as a direct item; always returns 0.
+ */
+/* Audited by: green(2002.06.12) */
+static int have_formatting_never(const struct inode *inode UNUSED_ARG
+ /* inode to operate on */ ,
+ loff_t size UNUSED_ARG/* new object size */)
+{
+ return 0;
+}
+
+/* Always store file's tail as direct item */
+/* Audited by: green(2002.06.12) */
+static int
+have_formatting_always(const struct inode *inode UNUSED_ARG
+ /* inode to operate on */ ,
+ loff_t size UNUSED_ARG/* new object size */)
+{
+ return 1;
+}
+
+/* This function tests whether the file denoted by @inode should be stored as
+   tail items only or as extents only. */
+static int
+have_formatting_default(const struct inode *inode UNUSED_ARG
+ /* inode to operate on */ ,
+ loff_t size/* new object size */)
+{
+ assert("umka-1253", inode != NULL);
+
+ if (size > inode->i_sb->s_blocksize * 4)
+ return 0;
+
+ return 1;
+}
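+
+/* Illustrative sketch: with a 4096-byte block size the default policy keeps
+   files of up to 4 * 4096 == 16384 bytes in tail items and switches to
+   extents above that size. */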
+
+/* tail plugins */
+formatting_plugin formatting_plugins[LAST_TAIL_FORMATTING_ID] = {
+ [NEVER_TAILS_FORMATTING_ID] = {
+ .h = {
+ .type_id = REISER4_FORMATTING_PLUGIN_TYPE,
+ .id = NEVER_TAILS_FORMATTING_ID,
+ .pops = NULL,
+ .label = "never",
+ .desc = "Never store file's tail",
+ .linkage = {NULL, NULL}
+ },
+ .have_tail = have_formatting_never
+ },
+ [ALWAYS_TAILS_FORMATTING_ID] = {
+ .h = {
+ .type_id = REISER4_FORMATTING_PLUGIN_TYPE,
+ .id = ALWAYS_TAILS_FORMATTING_ID,
+ .pops = NULL,
+ .label = "always",
+ .desc = "Always store file's tail",
+ .linkage = {NULL, NULL}
+ },
+ .have_tail = have_formatting_always
+ },
+ [SMALL_FILE_FORMATTING_ID] = {
+ .h = {
+ .type_id = REISER4_FORMATTING_PLUGIN_TYPE,
+ .id = SMALL_FILE_FORMATTING_ID,
+ .pops = NULL,
+ .label = "4blocks",
+ .desc = "store files shorter than 4 blocks in tail items",
+ .linkage = {NULL, NULL}
+ },
+ .have_tail = have_formatting_default
+ }
+};
+
+/*
+ * Local variables:
+ * c-indentation-style: "K&R"
+ * mode-name: "LC"
+ * c-basic-offset: 8
+ * tab-width: 8
+ * fill-column: 79
+ * End:
+ */
diff -urN --no-dereference linux-5.10.2.orig/fs/reiser4/plugin/txmod.c linux-5.10.2/fs/reiser4/plugin/txmod.c
--- linux-5.10.2.orig/fs/reiser4/plugin/txmod.c 1970-01-01 01:00:00.000000000 +0100
+++ linux-5.10.2/fs/reiser4/plugin/txmod.c 2020-12-23 16:07:46.132813334 +0100
@@ -0,0 +1,1264 @@
+#include "../forward.h"
+#include "../debug.h"
+#include "../coord.h"
+#include "../plugin/plugin.h"
+#include "../jnode.h"
+#include "../znode.h"
+#include "../block_alloc.h"
+#include "../reiser4.h"
+#include "../flush.h"
+
+/*
+ * This file contains the implementation of different transaction models.
+ *
+ * A transaction model is a high-level block allocator which assigns block
+ * numbers to dirty nodes and thereby decides how those nodes will be
+ * committed.
+ *
+ * Every dirty node of a reiser4 atom can be committed in either of the
+ * following two ways:
+ * 1) via the journal;
+ * 2) using the "write-anywhere" technique.
+ *
+ * If the allocator doesn't change the on-disk location of a node, then
+ * this node will be committed using the journalling technique (overwrite).
+ * Otherwise, it will be committed via the write-anywhere technique (relocate):
+ *
+ * relocate <---- allocate ----> overwrite
+ *
+ * So, in our interpretation the two traditional "classic" strategies for
+ * committing transactions (journalling and "write-anywhere") are just two
+ * boundary cases: 1) when all nodes are overwritten, and 2) when all nodes
+ * are relocated.
+ *
+ * Besides those two boundary cases we can implement in reiser4 the infinite
+ * set of their various combinations, so that the user can choose what is
+ * really suitable for their needs.
+ */
+
+/* jnode_make_wander_nolock <- find_flush_start_jnode (special case for znode-above-root)
+ <- jnode_make_wander */
+void jnode_make_wander_nolock(jnode * node);
+
+/* jnode_make_wander <- txmod.forward_alloc_formatted */
+void jnode_make_wander(jnode * node);
+
+/* jnode_make_reloc_nolock <- znode_make_reloc
+ <- unformatted_make_reloc */
+static void jnode_make_reloc_nolock(flush_queue_t * fq, jnode * node);
+
+
+
+ /* Handle formatted nodes in forward context */
+
+
+/**
+ * txmod.forward_alloc_formatted <- allocate_znode <- alloc_pos_and_ancestors <- jnode_flush
+ * <- alloc_one_ancestor <- alloc_pos_and_ancestors <- jnode_flush
+ * <- alloc_one_ancestor (recursive)
+ * <- lock_parent_and_allocate_znode <- squalloc_upper_levels <- check_parents_and_squalloc_upper_levels <- squalloc_upper_levels (recursive)
+ * <- handle_pos_on_formatted
+ * <- handle_pos_on_formatted
+ * <- handle_pos_end_of_twig
+ * <- handle_pos_to_leaf
+ */
+void znode_make_reloc(znode * z, flush_queue_t * fq);
+
+
+ /* Handle unformatted nodes */
+
+
+/* unformatted_make_reloc <- assign_real_blocknrs <- txmod.forward_alloc_unformatted
+ <- txmod.squeeze_alloc_unformatted
+*/
+void unformatted_make_reloc(jnode *node, flush_queue_t *fq);
+
+static void forward_overwrite_unformatted(flush_pos_t *flush_pos, oid_t oid,
+ unsigned long index, reiser4_block_nr width);
+
+/* mark_jnode_overwrite <- forward_overwrite_unformatted <- txmod.forward_alloc_unformatted
+ squeeze_overwrite_unformatted <- txmod.squeeze_alloc_unformatted
+*/
+static void mark_jnode_overwrite(struct list_head *jnodes, jnode *node);
+
+int split_extent_unit(coord_t *coord, reiser4_block_nr pos_in_unit,
+ int return_inserted_pos);
+int allocated_extent_slum_size(flush_pos_t *flush_pos, oid_t oid,
+ unsigned long index, unsigned long count);
+void assign_real_blocknrs(flush_pos_t *flush_pos, oid_t oid,
+ unsigned long index, reiser4_block_nr count,
+ reiser4_block_nr first, reiser4_subvol *subv);
+int convert_extent_unit(coord_t *coord, reiser4_extent *replace);
+int shift_extent_left_begin(znode *dst, const coord_t *coord,
+ const reiser4_key *key, reiser4_extent *ext);
+
+/*
+ * txmod.forward_alloc_unformatted <- handle_pos_on_twig
+ * txmod.squeeze_alloc_unformatted <- squeeze_right_twig
+ */
+
+/* Common functions */
+
+/**
+ * Mark node JNODE_OVRWR and put it on atom->overwrite_nodes list.
+ * Atom lock and jnode lock should be taken before calling this
+ * function.
+ */
+void jnode_make_wander_nolock(jnode * node)
+{
+ txn_atom *atom;
+
+ assert("nikita-2432", !JF_ISSET(node, JNODE_RELOC));
+ assert("nikita-3153", JF_ISSET(node, JNODE_DIRTY));
+ assert("zam-897", !JF_ISSET(node, JNODE_FLUSH_QUEUED));
+ assert("nikita-3367", !reiser4_blocknr_is_fake(jnode_get_block(node)));
+
+ atom = node->atom;
+
+ assert("zam-895", atom != NULL);
+ assert("zam-894", atom_is_protected(atom));
+
+ JF_SET(node, JNODE_OVRWR);
+ /* move node to atom's overwrite list */
+ list_move_tail(&node->capture_link, ATOM_OVRWR_LIST(atom));
+ ON_DEBUG(count_jnode(atom, node, DIRTY_LIST, OVRWR_LIST, 1));
+}
+
+/*
+ * Same as jnode_make_wander_nolock, but all necessary locks
+ * are taken inside this function.
+ */
+void jnode_make_wander(jnode * node)
+{
+ txn_atom *atom;
+
+ spin_lock_jnode(node);
+ atom = jnode_get_atom(node);
+ assert("zam-913", atom != NULL);
+ assert("zam-914", !JF_ISSET(node, JNODE_RELOC));
+
+ jnode_make_wander_nolock(node);
+ spin_unlock_atom(atom);
+ spin_unlock_jnode(node);
+}
+
+/* this just sets RELOC bit */
+static void jnode_make_reloc_nolock(flush_queue_t * fq, jnode * node)
+{
+ assert_spin_locked(&(node->guard));
+ assert("zam-916", JF_ISSET(node, JNODE_DIRTY));
+ assert("zam-917", !JF_ISSET(node, JNODE_RELOC));
+ assert("zam-918", !JF_ISSET(node, JNODE_OVRWR));
+ assert("zam-920", !JF_ISSET(node, JNODE_FLUSH_QUEUED));
+ assert("nikita-3367", !reiser4_blocknr_is_fake(jnode_get_block(node)));
+ jnode_set_reloc(node);
+}
+
+/*
+ * Mark znode RELOC and put it on flush queue
+ */
+void znode_make_reloc(znode * z, flush_queue_t * fq)
+{
+ jnode *node;
+ txn_atom *atom;
+
+ node = ZJNODE(z);
+ spin_lock_jnode(node);
+
+ atom = jnode_get_atom(node);
+ assert("zam-919", atom != NULL);
+
+ jnode_make_reloc_nolock(fq, node);
+ queue_jnode(fq, node);
+
+ spin_unlock_atom(atom);
+ spin_unlock_jnode(node);
+}
+
+/* Mark unformatted node RELOC and put it on flush queue */
+void unformatted_make_reloc(jnode *node, flush_queue_t *fq)
+{
+ assert("vs-1479", jnode_is_unformatted(node));
+
+ jnode_make_reloc_nolock(fq, node);
+ queue_jnode(fq, node);
+}
+
+/**
+ * mark_jnode_overwrite - assign node to overwrite set
+ * @jnodes: overwrite set list head
+ * @node: jnode to belong to overwrite set
+ *
+ * Sets OVRWR jnode state bit and puts @node to the end of list head @jnodes
+ * which is an accumulator for nodes before they get to overwrite set list of
+ * atom.
+ */
+static void mark_jnode_overwrite(struct list_head *jnodes, jnode *node)
+{
+ spin_lock_jnode(node);
+
+ assert("zam-917", !JF_ISSET(node, JNODE_RELOC));
+ assert("zam-918", !JF_ISSET(node, JNODE_OVRWR));
+
+ JF_SET(node, JNODE_OVRWR);
+ list_move_tail(&node->capture_link, jnodes);
+ ON_DEBUG(count_jnode(node->atom, node, DIRTY_LIST, OVRWR_LIST, 0));
+
+ spin_unlock_jnode(node);
+}
+
+static int forward_relocate_unformatted(flush_pos_t *flush_pos,
+ reiser4_extent *ext,
+ extent_state state,
+ oid_t oid, __u64 index,
+ __u64 width, int *exit)
+{
+ int result;
+ coord_t *coord;
+ reiser4_extent replace_ext;
+ reiser4_block_nr protected;
+ reiser4_block_nr start;
+ reiser4_block_nr first_allocated;
+ __u64 allocated;
+ block_stage_t block_stage;
+ reiser4_subvol *subv;
+ reiser4_blocknr_hint nohint;
+
+ *exit = 0;
+ coord = &flush_pos->coord;
+ start = extent_get_start(ext);
+ subv = find_data_subvol(coord);
+
+ assert("edward-1852", item_is_extent(coord));
+
+ if (flush_pos->pos_in_unit) {
+ assert("edward-2118", state == ALLOCATED_EXTENT);
+ /*
+		 * split the extent unit in two. The left one will
+ * be skipped - see the loop in handle_pos_on_twig()
+ */
+ result = split_extent_unit(coord, flush_pos->pos_in_unit,
+ 0 /* leave @coord set
+ to overwritten
+ extent */);
+ flush_pos->pos_in_unit = 0;
+ *exit = 1;
+ return result;
+ }
+ /*
+ * limit number of nodes to allocate
+ */
+ if (flush_pos->nr_to_write < width)
+ width = flush_pos->nr_to_write;
+
+ if (state == ALLOCATED_EXTENT) {
+ /*
+		 * none of the protected nodes is flushprepped, therefore
+ * they are counted as flush_reserved
+ */
+ block_stage = BLOCK_FLUSH_RESERVED;
+ protected = allocated_extent_slum_size(flush_pos, oid,
+ index, width);
+ if (protected == 0) {
+ flush_pos->state = POS_INVALID;
+ flush_pos->pos_in_unit = 0;
+ *exit = 1;
+ return 0;
+ }
+ } else {
+ block_stage = BLOCK_UNALLOCATED;
+ protected = width;
+ }
+ /*
+ * look at previous unit if possible. If it is allocated, make
+ * preceder more precise
+ */
+ if (coord->unit_pos && (state_of_extent(ext - 1) == ALLOCATED_EXTENT))
+ flush_pos_update_preceder(flush_pos, subv->id,
+ extent_get_start(ext - 1) +
+ extent_get_width(ext - 1));
+ /*
+ * allocate new block numbers for protected nodes
+ */
+ allocate_blocks_unformatted(flush_pos_get_hint(flush_pos,
+ subv->id, &nohint),
+ protected,
+ &first_allocated, &allocated,
+ block_stage, subv);
+ if (state == ALLOCATED_EXTENT)
+ /*
+ * on relocating - free nodes which are going to be
+ * relocated
+ */
+ reiser4_dealloc_blocks(&start, &allocated, 0, BA_DEFER, subv);
+ /*
+ * assign new block numbers to protected nodes
+ */
+ assign_real_blocknrs(flush_pos, oid, index,
+ allocated, first_allocated, subv);
+
+ /* prepare extent which will replace current one */
+ reiser4_set_extent(subv, &replace_ext, first_allocated, allocated);
+
+ /* adjust extent item */
+ result = convert_extent_unit(coord, &replace_ext);
+ if (result != 0 && result != -ENOMEM) {
+ warning("vs-1461",
+ "Failed to allocate extent. Should not happen\n");
+ *exit = 1;
+ return result;
+ }
+ /*
+ * break flush: we prepared for flushing as many blocks as we
+ * were asked for
+ */
+ if (flush_pos->nr_to_write == allocated)
+ flush_pos->state = POS_INVALID;
+ return 0;
+}
+
+static squeeze_result squeeze_relocate_unformatted(znode *left,
+ const coord_t *coord,
+ flush_pos_t *flush_pos,
+ reiser4_key *key,
+ reiser4_key *stop_key)
+{
+ int result;
+ reiser4_extent *ext;
+ __u64 index;
+ __u64 width;
+ reiser4_block_nr start;
+ extent_state state;
+ oid_t oid;
+ reiser4_block_nr first_allocated;
+ __u64 allocated;
+ __u64 protected;
+ reiser4_extent copy_extent;
+ block_stage_t block_stage;
+ reiser4_subvol *subv;
+ reiser4_blocknr_hint nohint;
+
+ assert("edward-1610", flush_pos->pos_in_unit == 0);
+ assert("edward-1611", coord_is_leftmost_unit(coord));
+ assert("edward-1612", item_is_extent(coord));
+
+ subv = find_data_subvol(coord);
+ ext = extent_by_coord(coord);
+ index = extent_unit_index(coord);
+ start = extent_get_start(ext);
+ width = extent_get_width(ext);
+ state = state_of_extent(ext);
+ unit_key_by_coord(coord, key);
+ oid = get_key_objectid(key);
+
+ assert("edward-1613", state != HOLE_EXTENT);
+
+ if (state == ALLOCATED_EXTENT) {
+ /*
+		 * none of the protected nodes is flushprepped,
+ * therefore they are counted as flush_reserved
+ */
+ block_stage = BLOCK_FLUSH_RESERVED;
+ protected = allocated_extent_slum_size(flush_pos, oid,
+ index, width);
+ if (protected == 0) {
+ flush_pos->state = POS_INVALID;
+ flush_pos->pos_in_unit = 0;
+ return 0;
+ }
+ } else {
+ block_stage = BLOCK_UNALLOCATED;
+ protected = width;
+ }
+ /*
+ * look at previous unit if possible. If it is allocated, make
+ * preceder more precise
+ */
+ if (coord->unit_pos && (state_of_extent(ext - 1) == ALLOCATED_EXTENT))
+ flush_pos_update_preceder(flush_pos, subv->id,
+ extent_get_start(ext - 1) +
+ extent_get_width(ext - 1));
+ /*
+ * allocate new block numbers for protected nodes
+ */
+ allocate_blocks_unformatted(flush_pos_get_hint(flush_pos,
+ subv->id, &nohint),
+ protected,
+ &first_allocated, &allocated,
+ block_stage, subv);
+ /*
+ * prepare extent which will be copied to left
+ */
+ reiser4_set_extent(subv, &copy_extent, first_allocated, allocated);
+ result = shift_extent_left_begin(left, coord, key, &copy_extent);
+
+ if (result == -E_NODE_FULL) {
+ /*
+ * free blocks which were just allocated
+ */
+ reiser4_dealloc_blocks(&first_allocated, &allocated,
+ (state == ALLOCATED_EXTENT) ?
+ BLOCK_FLUSH_RESERVED : BLOCK_UNALLOCATED,
+ BA_PERMANENT, subv);
+ /*
+ * rewind the preceder
+ */
+ flush_pos_update_preceder(flush_pos, subv->id, first_allocated);
+ return SQUEEZE_TARGET_FULL;
+ }
+ if (state == ALLOCATED_EXTENT) {
+ /*
+ * free nodes which were relocated
+ */
+ reiser4_dealloc_blocks(&start, &allocated, 0, BA_DEFER, subv);
+ }
+ /*
+ * assign new block numbers to protected nodes
+ */
+ assign_real_blocknrs(flush_pos, oid, index, allocated,
+ first_allocated, subv);
+ set_key_offset(key,
+ get_key_offset(key) +
+ (allocated << current_blocksize_bits));
+ return SQUEEZE_CONTINUE;
+}
+
+/**
+ * forward_overwrite_unformatted - put bunch of jnodes to overwrite set
+ * @flush_pos: flush position
+ * @oid: objectid of file jnodes belong to
+ * @index: starting index
+ * @width: extent width
+ *
+ * Puts nodes of one extent (file objectid @oid, extent width @width) to atom's
+ * overwrite set, starting from the one with index @index. If the end of slum is
+ * detected (node is not found or flushprepped) - stop iterating and set flush
+ * position's state to POS_INVALID.
+ */
+static void forward_overwrite_unformatted(flush_pos_t *flush_pos, oid_t oid,
+ unsigned long index,
+ reiser4_block_nr width)
+{
+ unsigned long i;
+ jnode *node;
+ txn_atom *atom;
+ LIST_HEAD(jnodes);
+
+ atom = atom_locked_by_fq(reiser4_pos_fq(flush_pos));
+ assert("vs-1478", atom);
+
+ for (i = flush_pos->pos_in_unit; i < width; i++, index++) {
+ node = jlookup(oid, index);
+ if (!node) {
+ flush_pos->state = POS_INVALID;
+ break;
+ }
+ if (jnode_check_flushprepped(node)) {
+ flush_pos->state = POS_INVALID;
+ atomic_dec(&node->x_count);
+ break;
+ }
+ if (node->atom != atom) {
+ flush_pos->state = POS_INVALID;
+ atomic_dec(&node->x_count);
+ break;
+ }
+ mark_jnode_overwrite(&jnodes, node);
+ atomic_dec(&node->x_count);
+ }
+
+ list_splice_init(&jnodes, ATOM_OVRWR_LIST(atom)->prev);
+ spin_unlock_atom(atom);
+}
+
+static squeeze_result squeeze_overwrite_unformatted(znode *left,
+ const coord_t *coord,
+ flush_pos_t *flush_pos,
+ reiser4_key *key,
+ reiser4_key *stop_key)
+{
+ int result;
+ reiser4_extent *ext;
+ __u64 index;
+ __u64 width;
+ reiser4_block_nr start;
+ extent_state state;
+ oid_t oid;
+ reiser4_extent copy_extent;
+ reiser4_subvol *subv;
+
+ assert("vs-1457", flush_pos->pos_in_unit == 0);
+ assert("vs-1467", coord_is_leftmost_unit(coord));
+ assert("vs-1467", item_is_extent(coord));
+
+ subv = find_data_subvol(coord);
+ ext = extent_by_coord(coord);
+ index = extent_unit_index(coord);
+ start = extent_get_start(ext);
+ width = extent_get_width(ext);
+ state = state_of_extent(ext);
+ unit_key_by_coord(coord, key);
+ oid = get_key_objectid(key);
+
+ /*
+	 * try to copy the unit as is to the left neighbor
+	 * and put all leading not-yet-flushprepped nodes
+	 * into the overwrite set
+ */
+ reiser4_set_extent(subv, &copy_extent, start, width);
+
+ result = shift_extent_left_begin(left, coord, key, &copy_extent);
+ if (result == -E_NODE_FULL)
+ return SQUEEZE_TARGET_FULL;
+
+ if (state != HOLE_EXTENT)
+ forward_overwrite_unformatted(flush_pos, oid, index, width);
+
+ set_key_offset(key,
+ get_key_offset(key) + (width << current_blocksize_bits));
+ return SQUEEZE_CONTINUE;
+}
+
+/************************ HYBRID TRANSACTION MODEL ****************************/
+
+/**
+ * This is the default transaction model suggested by Josh MacDonald and
+ * Hans Reiser. This was the only (hardcoded) transaction model until Feb 2014,
+ * when Edward introduced pure Journalling and pure Write-Anywhere.
+ *
+ * In this mode all relocate-overwrite decisions are result of attempts to
+ * defragment atom's locality.
+ */
+
+/* REVERSE PARENT-FIRST RELOCATION POLICIES */
+
+/* This implements the is-it-close-enough-to-its-preceder? test for relocation
+ in the reverse parent-first relocate context. Here all we know is the
+ preceder and the block number. Since we are going in reverse, the preceder
+ may still be relocated as well, so we can't ask the block allocator "is there
+ a closer block available to relocate?" here. In the _forward_ parent-first
+ relocate context (not here) we actually call the block allocator to try and
+ find a closer location.
+*/
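+/* A worked example (for illustration only, assuming the default
+   flush.relocate_distance of 64): with the preceder at block 1000 and the
+   node at block 1100, dist = 100 > 64, so the test below returns 1 and the
+   node becomes a relocation candidate; with the node at block 1050
+   (dist = 50 <= 64) it returns 0 and the node stays where it is.
+*/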
+static int reverse_try_defragment_if_close(flush_pos_t *pos,
+ const reiser4_block_nr * pblk,
+ const reiser4_block_nr * nblk,
+ reiser4_subvol *subv)
+{
+ reiser4_block_nr dist;
+
+ assert("jmacd-7710", *pblk != 0 && *nblk != 0);
+ assert("jmacd-7711", !reiser4_blocknr_is_fake(pblk));
+ assert("jmacd-7712", !reiser4_blocknr_is_fake(nblk));
+
+ /* Distance is the absolute value. */
+ dist = (*pblk > *nblk) ? (*pblk - *nblk) : (*nblk - *pblk);
+
+	/* If the block is within flush.relocate_distance blocks of its
+	   preceder block, do not relocate. */
+ if (dist <= subv->flush.relocate_distance)
+ return 0;
+ return 1;
+}
+
+/**
+ * This function is a predicate that tests for relocation. Always called in the
+ * reverse-parent-first context, when we are asking whether the current node
+ * should be relocated in order to expand the flush by dirtying the parent level
+ * (and thus proceeding to flush that level). When traversing in the forward
+ * parent-first direction (not here), relocation decisions are handled in two
+ * places: allocate_znode() and extent_needs_allocation().
+ */
+static int reverse_should_realloc_formatted_hybrid(jnode * node,
+ const coord_t *parent_coord,
+ flush_pos_t *pos)
+{
+ reiser4_block_nr pblk = 0;
+ reiser4_block_nr nblk = 0;
+
+ assert("jmacd-8989", !jnode_is_root(node));
+ assert("edward-2396", jnode_get_subvol(node) == get_meta_subvol());
+ /*
+ * This function is called only from the
+ * reverse_relocate_check_dirty_parent() and only if the parent
+ * node is clean. This implies that the parent has the real (i.e., not
+ * fake) block number, and, so does the child, because otherwise the
+ * parent would be dirty.
+ */
+
+ /* New nodes are treated as if they are being relocated. */
+ if (JF_ISSET(node, JNODE_CREATED) ||
+ (__leaf_should_relocate(&pos->mfbi) &&
+ jnode_get_level(node) == LEAF_LEVEL))
+ return 1;
+
+ /* Find the preceder. FIXME(B): When the child is an unformatted,
+ previously existing node, the coord may be leftmost even though the
+ child is not the parent-first preceder of the parent. If the first
+ dirty node appears somewhere in the middle of the first extent unit,
+ this preceder calculation is wrong.
+ Needs more logic in here. */
+
+ if (coord_is_leftmost_unit(parent_coord))
+ pblk = *znode_get_block(parent_coord->node);
+ else
+ pblk = pos->mfbi.preceder.blk;
+
+ check_preceder(pblk, get_meta_subvol());
+ if (pblk == 0)
+ /*
+ * preceder is not set, so relocate
+ */
+ return 1;
+
+ nblk = *jnode_get_block(node);
+
+ if (reiser4_blocknr_is_fake(&nblk))
+ /* child is unallocated, mark parent dirty */
+ return 1;
+
+ return reverse_try_defragment_if_close(pos, &pblk,
+ &nblk, get_meta_subvol());
+}
+
+/**
+ * A subroutine of forward_alloc_formatted_hybrid(), this is called first to see
+ * if there is a close position to relocate to. It may return -ENOSPC if no
+ * such position exists, in which case the node is not relocated. On success it
+ * also updates the parent node with the new block address.
+ *
+ * was allocate_znode_update()
+ */
+static int forward_try_defragment_locality(znode * node,
+ const coord_t *parent_coord,
+ flush_pos_t *pos)
+{
+ int ret;
+ reiser4_block_nr blk;
+ lock_handle uber_lock;
+ int flush_reserved_used = 0;
+ int grabbed;
+ reiser4_context *ctx;
+ reiser4_super_info_data *sbinfo;
+ reiser4_subvol *subv = get_meta_subvol();
+
+ init_lh(&uber_lock);
+
+ ctx = get_current_context();
+ sbinfo = get_super_private(ctx->super);
+
+ grabbed = ctx_subvol_grabbed(ctx, subv->id);
+
+ ret = zload(node);
+ if (ret)
+ return ret;
+
+ if (ZF_ISSET(node, JNODE_CREATED)) {
+ assert("zam-816",
+ reiser4_blocknr_is_fake(znode_get_block(node)));
+ pos->mfbi.preceder.block_stage = BLOCK_UNALLOCATED;
+ } else {
+ pos->mfbi.preceder.block_stage = BLOCK_GRABBED;
+
+		/* The disk space for relocating @node is already reserved
+		 * in the "flush reserved" counter if @node is a leaf; otherwise
+		 * we grab space with BA_RESERVED (i.e. from the whole disk,
+		 * not only from the 95% available to regular allocations). */
+ if (znode_get_level(node) == LEAF_LEVEL) {
+ /*
+ * earlier (during do_jnode_make_dirty()) we decided
+ * that @node can possibly go into overwrite set and
+ * reserved block for its wandering location.
+ */
+ txn_atom *atom = get_current_atom_locked();
+ assert("nikita-3449",
+ ZF_ISSET(node, JNODE_FLUSH_RESERVED));
+ flush_reserved2grabbed(atom_meta_brick_info(atom),
+ context_meta_brick_info(ctx),
+ (__u64) 1, subv);
+ spin_unlock_atom(atom);
+ /*
+ * we are trying to move node into relocate
+ * set. Allocation of relocated position "uses"
+ * reserved block.
+ */
+ ZF_CLR(node, JNODE_FLUSH_RESERVED);
+ flush_reserved_used = 1;
+ } else {
+ ret = reiser4_grab_space_force((__u64) 1,
+ BA_RESERVED, subv);
+ if (ret != 0)
+ goto exit;
+ }
+ }
+ /*
+	 * We do not use the reserved 5% of disk space here,
+	 * so flush may not pack tightly
+ */
+ ret = reiser4_alloc_block(&pos->mfbi.preceder, &blk,
+ BA_FORMATTED | BA_PERMANENT, subv);
+ if (ret)
+ goto exit;
+
+ if (!ZF_ISSET(node, JNODE_CREATED)) {
+ ret = reiser4_dealloc_block(znode_get_block(node), 0,
+ BA_DEFER | BA_FORMATTED,
+ subv);
+ if (ret)
+ goto exit;
+ }
+
+ if (likely(!znode_is_root(node))) {
+ item_plugin *iplug;
+
+ iplug = item_plugin_by_coord(parent_coord);
+ assert("nikita-2954", iplug->f.update != NULL);
+ iplug->f.update(parent_coord, &blk);
+
+ znode_make_dirty(parent_coord->node);
+ } else {
+ reiser4_tree *tree = znode_get_tree(node);
+ znode *uber;
+ /*
+ * We take a longterm lock on the fake node in order to change
+ * the root block number. This may cause atom fusion
+ */
+ ret = get_uber_znode(tree, ZNODE_WRITE_LOCK, ZNODE_LOCK_HIPRI,
+ &uber_lock);
+ /*
+		 * The fake node cannot be deleted, we lock it with high
+		 * priority, and this cannot fail with -ENOSPC
+ */
+ assert("jmacd-74412",
+ ret != -EINVAL && ret != -E_DEADLOCK && ret != -ENOSPC);
+
+ if (ret)
+ goto exit;
+
+ uber = uber_lock.node;
+
+ write_lock_tree();
+ tree->root_block = blk;
+ write_unlock_tree();
+
+ znode_make_dirty(uber);
+ }
+ ret = znode_rehash(node, &blk);
+exit:
+ if (ret) {
+ /* Get flush reserved block back if something fails, because
+ * callers assume that on error block wasn't relocated and its
+ * flush reserved block wasn't used. */
+ if (flush_reserved_used) {
+ /*
+ * ok, we failed to move node into relocate
+ * set. Restore status quo.
+ */
+ grabbed2flush_reserved((__u64)1, subv);
+ ZF_SET(node, JNODE_FLUSH_RESERVED);
+ }
+ }
+ zrelse(node);
+ done_lh(&uber_lock);
+ grabbed2free_mark(grabbed, subv);
+ return ret;
+}
+
+/*
+ * Make the final relocate/wander decision during
+ * forward parent-first squalloc for a formatted node
+ */
+static int forward_alloc_formatted_hybrid(znode * node,
+ const coord_t *parent_coord,
+ flush_pos_t *pos)
+{
+ int ret;
+ reiser4_subvol *subv = get_meta_subvol();
+ flush_brick_info *fbi = &pos->mfbi;
+ /**
+	 * FIXME(D): We have the node write-locked and should have checked for
+	 * !allocated() somewhere before reaching this point, but there can be a
+ * race, so this assertion is bogus.
+ */
+ assert("edward-1614", znode_is_loaded(node));
+ assert("jmacd-7987", !jnode_check_flushprepped(ZJNODE(node)));
+ assert("jmacd-7988", znode_is_write_locked(node));
+ assert("jmacd-7989", coord_is_invalid(parent_coord)
+ || znode_is_write_locked(parent_coord->node));
+
+ if (ZF_ISSET(node, JNODE_REPACK) || ZF_ISSET(node, JNODE_CREATED) ||
+ znode_is_root(node) || /* We have enough nodes to
+ relocate no matter what */
+ (__leaf_should_relocate(fbi) &&
+ znode_get_level(node) == LEAF_LEVEL)) {
+ /*
+		 * No decision is needed for new nodes: they are treated the same
+		 * as relocate. If the root node is dirty, relocate it too.
+ */
+ if (fbi->preceder.blk == 0) {
+ /*
+			 * preceder is unknown and we have decided to relocate
+			 * the node -- using the default value as the search start
+			 * is better than searching from block #0.
+ */
+ reiser4_block_nr blk;
+ get_blocknr_hint_default(&blk, subv);
+ fbi->preceder.blk = blk;
+ check_preceder(blk, subv);
+ }
+ goto best_reloc;
+
+ } else if (fbi->preceder.blk == 0) {
+ /*
+ * If we don't know the preceder, leave it where it is
+ */
+ jnode_make_wander(ZJNODE(node));
+ } else {
+ /* Make a decision based on block distance. */
+ reiser4_block_nr dist;
+ reiser4_block_nr nblk = *znode_get_block(node);
+
+ assert("jmacd-6172", !reiser4_blocknr_is_fake(&nblk));
+ assert("jmacd-6173",
+ !reiser4_blocknr_is_fake(&fbi->preceder.blk));
+ assert("jmacd-6174", fbi->preceder.blk != 0);
+
+ if (fbi->preceder.blk == nblk - 1) {
+ /* Ideal. */
+ jnode_make_wander(ZJNODE(node));
+ } else {
+
+ dist = (nblk < fbi->preceder.blk) ?
+ (fbi->preceder.blk - nblk) :
+ (nblk - fbi->preceder.blk);
+ /*
+ * See if we can find a closer block
+ * (forward direction only).
+ */
+ fbi->preceder.max_dist =
+ min((reiser4_block_nr)subv->flush.relocate_distance,
+ dist);
+ fbi->preceder.level = znode_get_level(node);
+
+ ret = forward_try_defragment_locality(node,
+ parent_coord,
+ pos);
+ fbi->preceder.max_dist = 0;
+
+ if (ret && (ret != -ENOSPC))
+ return ret;
+
+ if (ret == 0) {
+ /* Got a better allocation. */
+ znode_make_reloc(node, pos->fq);
+ } else if (dist < subv->flush.relocate_distance) {
+ /* The present allocation is good enough. */
+ jnode_make_wander(ZJNODE(node));
+ } else {
+ /*
+ * Otherwise, try to relocate to the best
+ * position.
+ */
+ best_reloc:
+ ret = forward_try_defragment_locality(node,
+ parent_coord,
+ pos);
+ if (ret != 0)
+ return ret;
+ /*
+ * set JNODE_RELOC bit _after_ node gets
+ * allocated
+ */
+ znode_make_reloc(node, pos->fq);
+ }
+ }
+ }
+ fbi_update_preceder(fbi, *znode_get_block(node));
+ pos->alloc_cnt += 1;
+
+ assert("jmacd-4277", !reiser4_blocknr_is_fake(&fbi->preceder.blk));
+
+ return 0;
+}
+
+static int forward_alloc_unformatted_hybrid(flush_pos_t *flush_pos)
+{
+ coord_t *coord;
+ reiser4_extent *ext;
+ oid_t oid;
+ __u64 index;
+ __u64 width;
+ extent_state state;
+ reiser4_key key;
+ reiser4_subvol *subv;
+
+ assert("vs-1468", flush_pos->state == POS_ON_EPOINT);
+ assert("vs-1469", coord_is_existing_unit(&flush_pos->coord) &&
+ item_is_extent(&flush_pos->coord));
+
+ subv = find_data_subvol(&flush_pos->coord);
+ coord = &flush_pos->coord;
+ ext = extent_by_coord(coord);
+ state = state_of_extent(ext);
+
+ if (state == HOLE_EXTENT) {
+ flush_pos->state = POS_INVALID;
+ return 0;
+ }
+ item_key_by_coord(coord, &key);
+ oid = get_key_objectid(&key);
+ index = extent_unit_index(coord) + flush_pos->pos_in_unit;
+ width = extent_get_width(ext);
+
+ assert("vs-1457", width > flush_pos->pos_in_unit);
+
+ if (leaf_should_relocate(flush_pos, subv->id) ||
+ state == UNALLOCATED_EXTENT) {
+ int exit;
+ int result;
+ result = forward_relocate_unformatted(flush_pos, ext, state,
+ oid,
+ index, width, &exit);
+ if (exit)
+ return result;
+ } else
+ forward_overwrite_unformatted(flush_pos, oid, index, width);
+
+ flush_pos->pos_in_unit = 0;
+ return 0;
+}
+
+static squeeze_result squeeze_alloc_unformatted_hybrid(znode *left,
+ const coord_t *coord,
+ flush_pos_t *flush_pos,
+ reiser4_key *stop_key)
+{
+ squeeze_result ret;
+ reiser4_key key;
+ reiser4_extent *ext;
+ extent_state state;
+ reiser4_subvol *subv;
+
+ subv = find_data_subvol(coord);
+ ext = extent_by_coord(coord);
+ state = state_of_extent(ext);
+
+ if ((leaf_should_relocate(flush_pos, subv->id) &&
+ state == ALLOCATED_EXTENT) ||
+ (state == UNALLOCATED_EXTENT))
+ /*
+ * relocate
+ */
+ ret = squeeze_relocate_unformatted(left, coord,
+ flush_pos, &key, stop_key);
+ else
+ /*
+ * (state == ALLOCATED_EXTENT && !flush_pos->leaf_relocate) ||
+ * state == HOLE_EXTENT - overwrite
+ */
+ ret = squeeze_overwrite_unformatted(left, coord,
+ flush_pos, &key, stop_key);
+ if (ret == SQUEEZE_CONTINUE)
+ *stop_key = key;
+ return ret;
+}
+
+/*********************** JOURNAL TRANSACTION MODEL ****************************/
+
+static int forward_alloc_formatted_journal(znode * node,
+ const coord_t *parent_coord,
+ flush_pos_t *pos)
+{
+ int ret;
+
+ if (ZF_ISSET(node, JNODE_CREATED)) {
+ if (pos->mfbi.preceder.blk == 0) {
+ /*
+			 * preceder is unknown and we have decided to relocate
+			 * the node -- using the default value as the search start
+			 * is better than searching from block #0.
+ */
+ reiser4_block_nr blk;
+ get_blocknr_hint_default(&blk, get_meta_subvol());
+ pos->mfbi.preceder.blk = blk;
+ }
+ ret = forward_try_defragment_locality(node,
+ parent_coord,
+ pos);
+ if (ret != 0) {
+ warning("edward-1615",
+ "forward defrag failed (%d)", ret);
+ return ret;
+ }
+ /*
+ * set JNODE_RELOC bit _after_ node gets
+ * allocated
+ */
+ znode_make_reloc(node, pos->fq);
+ }
+ else
+ jnode_make_wander(ZJNODE(node));
+
+ pos->mfbi.preceder.blk = *znode_get_block(node);
+ pos->alloc_cnt += 1;
+
+ assert("edward-1616",
+ !reiser4_blocknr_is_fake(&pos->mfbi.preceder.blk));
+ return 0;
+}
+
+static int forward_alloc_unformatted_journal(flush_pos_t *flush_pos)
+{
+
+ coord_t *coord;
+ reiser4_extent *ext;
+ oid_t oid;
+ __u64 index;
+ __u64 width;
+ extent_state state;
+ reiser4_key key;
+
+ coord = &flush_pos->coord;
+
+ assert("edward-1617", flush_pos->state == POS_ON_EPOINT);
+ assert("edward-1618", coord_is_existing_unit(&flush_pos->coord)
+ && item_is_extent(&flush_pos->coord));
+
+ ext = extent_by_coord(coord);
+ state = state_of_extent(ext);
+ if (state == HOLE_EXTENT) {
+ flush_pos->state = POS_INVALID;
+ return 0;
+ }
+ item_key_by_coord(coord, &key);
+ oid = get_key_objectid(&key);
+ index = extent_unit_index(coord) + flush_pos->pos_in_unit;
+ width = extent_get_width(ext);
+
+ assert("edward-1619", width > flush_pos->pos_in_unit);
+
+ if (state == UNALLOCATED_EXTENT) {
+ int exit;
+ int result;
+ result = forward_relocate_unformatted(flush_pos, ext, state,
+ oid,
+ index, width, &exit);
+ if (exit)
+ return result;
+ }
+ else
+ /*
+ * state == ALLOCATED_EXTENT
+ * keep old allocation
+ */
+ forward_overwrite_unformatted(flush_pos, oid, index, width);
+
+ flush_pos->pos_in_unit = 0;
+ return 0;
+}
+
+static squeeze_result squeeze_alloc_unformatted_journal(znode *left,
+ const coord_t *coord,
+ flush_pos_t *flush_pos,
+ reiser4_key *stop_key)
+{
+ squeeze_result ret;
+ reiser4_key key;
+ reiser4_extent *ext;
+ extent_state state;
+
+ ext = extent_by_coord(coord);
+ state = state_of_extent(ext);
+
+ if (state == UNALLOCATED_EXTENT)
+ ret = squeeze_relocate_unformatted(left, coord,
+ flush_pos, &key, stop_key);
+ else
+ /*
+ * state == ALLOCATED_EXTENT || state == HOLE_EXTENT
+ */
+ ret = squeeze_overwrite_unformatted(left, coord,
+ flush_pos, &key, stop_key);
+ if (ret == SQUEEZE_CONTINUE)
+ *stop_key = key;
+ return ret;
+}
+
+/********************** WA (Write-Anywhere) TRANSACTION MODEL ***************/
+
+static int forward_alloc_formatted_wa(znode * node,
+ const coord_t *parent_coord,
+ flush_pos_t *pos)
+{
+ int ret;
+
+ assert("edward-1620", znode_is_loaded(node));
+ assert("edward-1621", !jnode_check_flushprepped(ZJNODE(node)));
+ assert("edward-1622", znode_is_write_locked(node));
+ assert("edward-1623", coord_is_invalid(parent_coord)
+ || znode_is_write_locked(parent_coord->node));
+
+ if (pos->mfbi.preceder.blk == 0) {
+ /*
+		 * preceder is unknown and we have decided to relocate
+		 * the node -- using the default value as the search start
+		 * is better than searching from block #0.
+ */
+ reiser4_block_nr blk;
+ get_blocknr_hint_default(&blk, get_meta_subvol());
+ pos->mfbi.preceder.blk = blk;
+ }
+ ret = forward_try_defragment_locality(node, parent_coord, pos);
+ if (ret && (ret != -ENOSPC)) {
+ warning("edward-1624",
+ "forward defrag failed (%d)", ret);
+ return ret;
+ }
+ if (ret == 0)
+ znode_make_reloc(node, pos->fq);
+ else {
+ ret = forward_try_defragment_locality(node, parent_coord, pos);
+ if (ret) {
+ warning("edward-1625",
+ "forward defrag failed (%d)", ret);
+ return ret;
+ }
+ /* set JNODE_RELOC bit _after_ node gets allocated */
+ znode_make_reloc(node, pos->fq);
+ }
+ pos->mfbi.preceder.blk = *znode_get_block(node);
+ pos->alloc_cnt += 1;
+
+ assert("edward-1626",
+ !reiser4_blocknr_is_fake(&pos->mfbi.preceder.blk));
+ return 0;
+}
+
+static int forward_alloc_unformatted_wa(flush_pos_t *flush_pos)
+{
+ int exit;
+ int result;
+
+ coord_t *coord;
+ reiser4_extent *ext;
+ oid_t oid;
+ __u64 index;
+ __u64 width;
+ extent_state state;
+ reiser4_key key;
+
+ assert("edward-1627", flush_pos->state == POS_ON_EPOINT);
+ assert("edward-1628", coord_is_existing_unit(&flush_pos->coord)
+ && item_is_extent(&flush_pos->coord));
+
+ coord = &flush_pos->coord;
+
+ ext = extent_by_coord(coord);
+ state = state_of_extent(ext);
+ if (state == HOLE_EXTENT) {
+ flush_pos->state = POS_INVALID;
+ return 0;
+ }
+
+ item_key_by_coord(coord, &key);
+ oid = get_key_objectid(&key);
+ index = extent_unit_index(coord) + flush_pos->pos_in_unit;
+ width = extent_get_width(ext);
+
+ assert("edward-1629", width > flush_pos->pos_in_unit);
+ assert("edward-1630",
+ state == ALLOCATED_EXTENT || state == UNALLOCATED_EXTENT);
+ /*
+ * always relocate
+ */
+ result = forward_relocate_unformatted(flush_pos, ext, state, oid,
+ index, width, &exit);
+ if (exit)
+ return result;
+ flush_pos->pos_in_unit = 0;
+ return 0;
+}
+
+static squeeze_result squeeze_alloc_unformatted_wa(znode *left,
+ const coord_t *coord,
+ flush_pos_t *flush_pos,
+ reiser4_key *stop_key)
+{
+ squeeze_result ret;
+ reiser4_key key;
+ reiser4_extent *ext;
+ extent_state state;
+
+ ext = extent_by_coord(coord);
+ state = state_of_extent(ext);
+
+ if (state == HOLE_EXTENT)
+ /*
+ * hole extents are handled in squeeze_overwrite
+ */
+ ret = squeeze_overwrite_unformatted(left, coord,
+ flush_pos, &key, stop_key);
+ else
+ ret = squeeze_relocate_unformatted(left, coord,
+ flush_pos, &key, stop_key);
+ if (ret == SQUEEZE_CONTINUE)
+ *stop_key = key;
+ return ret;
+}
+
+/******************************************************************************/
+
+txmod_plugin txmod_plugins[LAST_TXMOD_ID] = {
+ [HYBRID_TXMOD_ID] = {
+ .h = {
+ .type_id = REISER4_TXMOD_PLUGIN_TYPE,
+ .id = HYBRID_TXMOD_ID,
+ .pops = NULL,
+ .label = "hybrid",
+ .desc = "Hybrid Transaction Model",
+ .linkage = {NULL, NULL}
+ },
+ .forward_alloc_formatted = forward_alloc_formatted_hybrid,
+ .reverse_should_realloc_formatted = reverse_should_realloc_formatted_hybrid,
+ .forward_alloc_unformatted = forward_alloc_unformatted_hybrid,
+ .squeeze_alloc_unformatted = squeeze_alloc_unformatted_hybrid
+ },
+ [JOURNAL_TXMOD_ID] = {
+ .h = {
+ .type_id = REISER4_TXMOD_PLUGIN_TYPE,
+ .id = JOURNAL_TXMOD_ID,
+ .pops = NULL,
+ .label = "journal",
+ .desc = "Journalling Transaction Model",
+ .linkage = {NULL, NULL}
+ },
+ .forward_alloc_formatted = forward_alloc_formatted_journal,
+ .reverse_should_realloc_formatted = NULL,
+ .forward_alloc_unformatted = forward_alloc_unformatted_journal,
+ .squeeze_alloc_unformatted = squeeze_alloc_unformatted_journal
+ },
+ [WA_TXMOD_ID] = {
+ .h = {
+ .type_id = REISER4_TXMOD_PLUGIN_TYPE,
+ .id = WA_TXMOD_ID,
+ .pops = NULL,
+ .label = "wa",
+ .desc = "Write-Anywhere Transaction Model",
+ .linkage = {NULL, NULL}
+ },
+ .forward_alloc_formatted = forward_alloc_formatted_wa,
+ .reverse_should_realloc_formatted = NULL,
+ .forward_alloc_unformatted = forward_alloc_unformatted_wa,
+ .squeeze_alloc_unformatted = squeeze_alloc_unformatted_wa
+ }
+};
+
+/*
+ * Local variables:
+ * c-indentation-style: "K&R"
+ * mode-name: "LC"
+ * c-basic-offset: 8
+ * tab-width: 8
+ * fill-column: 79
+ * End:
+ */
diff -urN --no-dereference linux-5.10.2.orig/fs/reiser4/plugin/volume/Makefile linux-5.10.2/fs/reiser4/plugin/volume/Makefile
--- linux-5.10.2.orig/fs/reiser4/plugin/volume/Makefile 1970-01-01 01:00:00.000000000 +0100
+++ linux-5.10.2/fs/reiser4/plugin/volume/Makefile 2020-12-23 16:07:46.132813334 +0100
@@ -0,0 +1,4 @@
+obj-$(CONFIG_REISER4_FS) += volume_plugins.o
+
+volume_plugins-objs := \
+ volume.o \
diff -urN --no-dereference linux-5.10.2.orig/fs/reiser4/plugin/volume/volume.c linux-5.10.2/fs/reiser4/plugin/volume/volume.c
--- linux-5.10.2.orig/fs/reiser4/plugin/volume/volume.c 1970-01-01 01:00:00.000000000 +0100
+++ linux-5.10.2/fs/reiser4/plugin/volume/volume.c 2020-12-23 16:07:46.133813348 +0100
@@ -0,0 +1,2624 @@
+/*
+ Copyright (c) 2016-2020 Eduard O. Shishkin
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+#include "../../debug.h"
+#include "../../super.h"
+#include "../../inode.h"
+#include "../../plugin/item/brick_symbol.h"
+#include "volume.h"
+
+/**
+ * Implementation of simple and asymmetric logical volumes.
+ *
+ * A Simple Volume can consist of only one device. An attempt to add
+ * a brick to such a volume will fail. All reiser4 partitions with the
+ * old "format40" layout are simple volumes.
+ *
+ * An Asymmetric Logical Volume can consist of any number of devices
+ * formatted with the "format41" layout, called bricks (or storage
+ * subvolumes).
+ * A mounted asymmetric volume is represented by a table of pointers to
+ * bricks. Its first column represents the meta-data brick with its
+ * optional replicas. The other columns represent data bricks with
+ * their replicas. A data brick contains only unformatted blocks. The
+ * meta-data brick contains blocks of all types. An Asymmetric Logical
+ * Volume contains at least one meta-data brick and any number of data bricks.
+ */
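+
+/*
+ * For illustration only (brick names are hypothetical): a volume of three
+ * bricks, each with one replica, would be represented roughly as
+ *
+ *   mslots[0] = { meta,  meta-replica  }   <- meta-data brick column
+ *   mslots[1] = { data1, data1-replica }   <- data brick columns
+ *   mslots[2] = { data2, data2-replica }
+ *
+ * where each column (slot) holds the original brick at index 0 followed
+ * by its replicas.
+ */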
+
+#define VOLMAP_MAGIC "R4VoLMaP"
+#define VOLMAP_MAGIC_SIZE (8)
+
+struct voltab_entry {
+ reiser4_block_nr block; /* address of the unformatted voltab block */
+ u32 csum; /* checksum of the voltab block */
+}PACKED;
+
+struct volmap {
+ u32 csum; /* checksum of this volmap block */
+ char magic[8];
+ reiser4_block_nr next; /* disk address of the next volmap block */
+ struct voltab_entry entries [0];
+}PACKED;
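+
+/*
+ * For illustration only: with 4 KiB blocks, a packed struct volmap takes
+ * 4 + 8 + 8 = 20 bytes and a packed struct voltab_entry 8 + 4 = 12 bytes,
+ * so voltab_nodes_per_block() below yields (4096 - 20) / 12 = 339 voltab
+ * entries per volmap block.
+ */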
+
+static u32 volmap_get_csum(struct volmap *vmap)
+{
+ return le32_to_cpu(get_unaligned(&vmap->csum));
+}
+
+static void volmap_set_csum(struct volmap *vmap, u32 val)
+{
+ put_unaligned(cpu_to_le32(val), &vmap->csum);
+}
+
+static reiser4_block_nr volmap_get_entry_blk(struct volmap *vmap, int nr)
+{
+ return le64_to_cpu(get_unaligned(&vmap->entries[nr].block));
+}
+
+static void volmap_set_entry_blk(struct volmap *vmap, int nr, u64 val)
+{
+ put_unaligned(cpu_to_le64(val), &vmap->entries[nr].block);
+}
+
+static u32 volmap_get_entry_csum(struct volmap *vmap, int nr)
+{
+ return le32_to_cpu(get_unaligned(&vmap->entries[nr].csum));
+}
+
+static void volmap_set_entry_csum(struct volmap *vmap, int nr, u32 val)
+{
+ put_unaligned(cpu_to_le32(val), &vmap->entries[nr].csum);
+}
+
+static reiser4_block_nr get_next_volmap_addr(struct volmap *vmap)
+{
+ return le64_to_cpu(get_unaligned(&vmap->next));
+}
+
+static void set_next_volmap_addr(struct volmap *vmap, reiser4_block_nr val)
+{
+ put_unaligned(cpu_to_le64(val), &vmap->next);
+}
+
+static int balance_volume_asym(struct super_block *sb, u32 flags);
+
+static int voltab_nodes_per_block(void)
+{
+ return (current_blocksize - sizeof (struct volmap)) /
+ sizeof(struct voltab_entry);
+}
+
+static int segments_per_block(reiser4_volume *vol)
+{
+ distribution_plugin *dist_plug = vol->dist_plug;
+
+ return 1 << (current_blocksize_bits - dist_plug->seg_bits);
+}
+
+/**
+ * Find the meta-data brick of a not yet activated volume
+ */
+reiser4_subvol *find_meta_brick_by_id(reiser4_volume *vol)
+{
+ struct reiser4_subvol *subv;
+
+ list_for_each_entry(subv, &vol->subvols_list, list)
+ if (is_meta_brick_id(subv->id))
+ return subv;
+ return NULL;
+}
+
+/**
+ * Allocate and initialize an array of abstract buckets for an
+ * asymmetric volume.
+ * The notion of abstract bucket encapsulates an original brick
+ * (without replicas). That array should include only DSA members.
+ */
+static bucket_t *create_buckets(void)
+{
+ u32 i, j;
+ bucket_t *ret;
+ reiser4_volume *vol = current_volume();
+ lv_conf *conf = vol->conf;
+ u32 nr_buckets = num_dsa_subvols(vol);
+
+ ret = kmalloc(nr_buckets * sizeof(*ret), GFP_KERNEL);
+ if (!ret)
+ return NULL;
+
+ for (i = 0, j = 0; i < conf->nr_mslots; i++) {
+ if (conf->mslots[i] == NULL)
+ continue;
+ if (!is_dsa_brick(conf_origin(conf, i)))
+ continue;
+ ret[j] = conf_origin(conf, i);
+ /*
+ * set index in DSA
+ */
+ conf_origin(conf, i)->dsa_idx = j;
+ j++;
+ }
+#if REISER4_DEBUG
+ assert("edward-2194", j == nr_buckets);
+ for (i = 0; i < nr_buckets; i++) {
+ assert("edward-2181", ret[i] != NULL);
+ assert("edward-2195",
+ ((reiser4_subvol *)ret[i])->dsa_idx == i);
+ }
+#endif
+ return (bucket_t *)ret;
+}
+
+static void free_buckets(bucket_t *vec)
+{
+ assert("edward-2233", vec != NULL);
+ kfree(vec);
+}
+
+/**
+ * Allocate and initialize a new array of abstract buckets, which
+ * is a copy of the old array @vec with the bucket at position @pos
+ * removed. Return the new array.
+ */
+static bucket_t *remove_bucket(bucket_t *vec, u32 numb, u32 pos)
+{
+ bucket_t *new;
+
+ assert("edward-2338", pos < numb);
+
+ new = kmalloc((numb - 1) * sizeof(*new), GFP_KERNEL);
+ if (new) {
+ int i;
+ /*
+		 * indexes of all buckets to the right of @pos
+ * get decremented
+ */
+ for (i = pos + 1; i < numb; i++) {
+ assert("edward-2196",
+ ((reiser4_subvol *)(vec[i]))->dsa_idx == i);
+ ((reiser4_subvol *)(vec[i]))->dsa_idx --;
+ }
+ memcpy(new, vec, pos * (sizeof(*new)));
+ memcpy(new + pos, vec + pos + 1,
+ (numb - pos - 1) * sizeof(*new));
+ }
+ return new;
+}
+
+static bucket_t *insert_bucket(bucket_t *vec, bucket_t this, u32 numb, u32 pos)
+{
+ bucket_t *new;
+
+ assert("edward-2339", pos <= numb);
+
+ new = kmalloc((numb + 1) * sizeof(*new), GFP_KERNEL);
+ if (new) {
+ u32 i;
+ /*
+		 * indexes of all buckets at @pos and to the right of @pos
+ * get incremented
+ */
+ for (i = pos; i < numb; i++) {
+ assert("edward-2340",
+ ((reiser4_subvol *)(vec[i]))->dsa_idx == i);
+ ((reiser4_subvol *)(vec[i]))->dsa_idx ++;
+ }
+ /*
+ * new bucket gets index @pos
+ */
+ ((reiser4_subvol *)this)->dsa_idx = pos;
+
+ memcpy(new, vec, pos * (sizeof(*new)));
+ new[pos] = this;
+ memcpy(new + pos + 1, vec + pos, (numb - pos) * sizeof(*new));
+ }
+ return new;
+}
+
+static u32 id2idx(u64 id)
+{
+ return current_origin(id)->dsa_idx;
+}
+
+static u64 idx2id(u32 idx)
+{
+ bucket_t *vec = current_buckets();
+
+ return ((reiser4_subvol *)(vec[idx]))->id;
+}
+
+static int num_voltab_nodes(reiser4_volume *vol, int nums_bits)
+{
+ distribution_plugin *dist_plug = vol->dist_plug;
+
+ assert("edward-1818",
+ nums_bits + dist_plug->seg_bits >= current_blocksize_bits);
+
+ return 1 << (nums_bits + dist_plug->seg_bits - current_blocksize_bits);
+}
+
+static int num_volmap_nodes(reiser4_volume *vol, int nums_bits)
+{
+ int result;
+
+ result = num_voltab_nodes(vol, nums_bits) / voltab_nodes_per_block();
+ if (num_voltab_nodes(vol, nums_bits) % voltab_nodes_per_block())
+ result ++;
+ return result;
+}
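+
+/*
+ * For illustration only (the actual seg_bits value is defined by the
+ * distribution plugin): with 4 KiB blocks, num_sgs_bits = 16 and an
+ * assumed seg_bits = 2, each voltab node packs 2^(12 - 2) = 1024 segments,
+ * num_voltab_nodes() yields 2^(16 + 2 - 12) = 64 nodes, and with 339
+ * voltab entries per volmap block num_volmap_nodes() yields 1.
+ */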
+
+void release_volinfo_nodes(reiser4_volinfo *vinfo, int dealloc)
+{
+ u64 i;
+
+ if (vinfo->volmap_nodes == NULL)
+ return;
+
+ for (i = 0; i < vinfo->num_volmaps + vinfo->num_voltabs; i++) {
+ struct jnode *node = vinfo->volmap_nodes[i];
+ if (node) {
+ if (dealloc)
+ reiser4_dealloc_block(jnode_get_block(node),
+ 0, BA_FORMATTED | BA_PERMANENT,
+ get_meta_subvol());
+ reiser4_drop_volinfo_head(node);
+ vinfo->volmap_nodes[i] = NULL;
+ }
+ }
+ kfree(vinfo->volmap_nodes);
+ vinfo->volmap_nodes = NULL;
+ vinfo->voltab_nodes = NULL;
+}
+
+static void done_volume_asym(reiser4_volume *vol)
+{
+ /*
+ * release set of abstract buckets
+ */
+ if (vol->buckets) {
+ free_buckets(vol->buckets);
+ vol->buckets = NULL;
+ }
+ release_volinfo_nodes(&vol->volinfo[CUR_VOL_CONF], 0);
+ release_volinfo_nodes(&vol->volinfo[NEW_VOL_CONF], 0);
+}
+
+/**
+ * Load the system volume configuration from disk to memory.
+ */
+static int load_volume_dconf(reiser4_subvol *subv)
+{
+ int id = CUR_VOL_CONF;
+ int ret;
+ int i, j;
+ u64 packed_segments = 0;
+ reiser4_volume *vol = super_volume(subv->super);
+ reiser4_volinfo *vinfo = &vol->volinfo[id];
+ distribution_plugin *dist_plug = vol->dist_plug;
+ reiser4_block_nr volmap_loc = subv->volmap_loc[id];
+ u64 voltabs_needed;
+
+ assert("edward-1984", subv->id == METADATA_SUBVOL_ID);
+ assert("edward-2175", subv->volmap_loc[id] != 0);
+
+ if (dist_plug->r.init) {
+ ret = dist_plug->r.init(&vol->dcx, &vol->conf->tab,
+ vol->num_sgs_bits);
+ if (ret)
+ return ret;
+ }
+ vinfo->num_volmaps = num_volmap_nodes(vol, vol->num_sgs_bits);
+ vinfo->num_voltabs = num_voltab_nodes(vol, vol->num_sgs_bits);
+ voltabs_needed = vinfo->num_voltabs;
+
+ vinfo->volmap_nodes =
+ kzalloc((vinfo->num_volmaps + vinfo->num_voltabs) *
+ sizeof(*vinfo->volmap_nodes), GFP_KERNEL);
+
+ if (!vinfo->volmap_nodes)
+ return -ENOMEM;
+
+ vinfo->voltab_nodes = vinfo->volmap_nodes + vinfo->num_volmaps;
+
+ for (i = 0; i < vinfo->num_volmaps; i++) {
+ struct volmap *volmap;
+
+ assert("edward-1819", volmap_loc != 0);
+
+ vinfo->volmap_nodes[i] =
+ reiser4_alloc_volinfo_head(&volmap_loc, subv);
+ if (!vinfo->volmap_nodes[i]) {
+ ret = -ENOMEM;
+ goto unpin;
+ }
+ ret = jload(vinfo->volmap_nodes[i]);
+ if (ret)
+ goto unpin;
+
+ volmap = (struct volmap *)jdata(vinfo->volmap_nodes[i]);
+ /*
+		 * load all voltabs pointed to by the current volmap
+ */
+ for (j = 0;
+ j < voltab_nodes_per_block() && voltabs_needed;
+ j++, voltabs_needed --) {
+
+ reiser4_block_nr voltab_loc;
+
+ voltab_loc = volmap_get_entry_blk(volmap, j);
+ assert("edward-1986", voltab_loc != 0);
+
+ vinfo->voltab_nodes[j] =
+ reiser4_alloc_volinfo_head(&voltab_loc,
+ subv);
+ if (!vinfo->voltab_nodes[j]) {
+ ret = -ENOMEM;
+ goto unpin;
+ }
+ ret = jload(vinfo->voltab_nodes[j]);
+ if (ret)
+ goto unpin;
+
+ dist_plug->v.unpack(&vol->dcx, vol->conf->tab,
+ jdata(vinfo->voltab_nodes[j]),
+ packed_segments,
+ segments_per_block(vol));
+ jrelse(vinfo->voltab_nodes[j]);
+
+ packed_segments += segments_per_block(vol);
+ }
+ volmap_loc = get_next_volmap_addr(volmap);
+ jrelse(vinfo->volmap_nodes[i]);
+ }
+ unpin:
+ release_volinfo_nodes(vinfo, 0);
+ return ret;
+}
+
+static int alloc_volinfo_block(reiser4_block_nr *block, reiser4_subvol *subv)
+{
+ reiser4_blocknr_hint hint;
+
+ reiser4_blocknr_hint_init(&hint);
+ hint.block_stage = BLOCK_NOT_COUNTED;
+
+ return reiser4_alloc_block(&hint, block,
+ BA_FORMATTED | BA_PERMANENT |
+ BA_USE_DEFAULT_SEARCH_START, subv);
+}
+
+static int dealloc_volinfo_block(reiser4_block_nr *block, reiser4_subvol *subv)
+{
+ return reiser4_dealloc_block(block, BLOCK_NOT_COUNTED, BA_DEFER, subv);
+}
+
+/**
+ * Release disk addresses occupied by volume configuration
+ */
+static int release_volume_dconf(reiser4_volume *vol, int id)
+{
+ int ret;
+ int i, j;
+ reiser4_subvol *mtd_subv = get_meta_subvol();
+ reiser4_block_nr volmap_loc = mtd_subv->volmap_loc[id];
+ reiser4_volinfo *vinfo = &vol->volinfo[id];
+ u64 voltabs_needed;
+
+ if (volmap_loc == 0)
+ /* nothing to release */
+ return 0;
+ /*
+ * FIXME: this is a hack to make sure that atom exists
+ */
+ ret = capture_brick_super(get_meta_subvol());
+ if (ret)
+ return ret;
+
+ voltabs_needed = vinfo->num_voltabs;
+
+ for (i = 0; i < vinfo->num_volmaps; i++) {
+ jnode *node;
+ struct volmap *volmap;
+
+ assert("edward-1819", volmap_loc != 0);
+
+ node = reiser4_alloc_volinfo_head(&volmap_loc, mtd_subv);
+ if (!node)
+ return -ENOMEM;
+
+ ret = jload(node);
+ if (ret) {
+ reiser4_drop_volinfo_head(node);
+ return ret;
+ }
+ volmap = (struct volmap *)jdata(node);
+ /*
+		 * deallocate all voltab blocks pointed to by the current volmap
+ */
+ for (j = 0;
+ j < voltab_nodes_per_block() && voltabs_needed;
+ j++, voltabs_needed --) {
+
+ reiser4_block_nr voltab_loc;
+
+ voltab_loc = volmap_get_entry_blk(volmap, j);
+ assert("edward-1987", voltab_loc != 0);
+ dealloc_volinfo_block(&voltab_loc, get_meta_subvol());
+ }
+ dealloc_volinfo_block(&volmap_loc, get_meta_subvol());
+ volmap_loc = get_next_volmap_addr(volmap);
+ jrelse(node);
+ reiser4_drop_volinfo_head(node);
+ }
+ return 0;
+}
+
+/**
+ * Release the old on-disk volume configuration and make the new
+ * configuration the current one.
+ */
+static int update_volume_dconf(reiser4_volume *vol)
+{
+ int ret;
+ reiser4_subvol *mtd_subv = get_meta_subvol();
+
+ ret = release_volume_dconf(vol, CUR_VOL_CONF);
+ if (ret)
+ return ret;
+
+ vol->volinfo[CUR_VOL_CONF] = vol->volinfo[NEW_VOL_CONF];
+ memset(&vol->volinfo[NEW_VOL_CONF], 0, sizeof(reiser4_volinfo));
+
+ mtd_subv->volmap_loc[CUR_VOL_CONF] = mtd_subv->volmap_loc[NEW_VOL_CONF];
+ mtd_subv->volmap_loc[NEW_VOL_CONF] = 0;
+
+ return 0;
+}
+
+/**
+ * Create and pin volinfo nodes, allocate disk addresses for them,
+ * and pack in-memory volume system information to those nodes
+ */
+static int create_volume_dconf(reiser4_volume *vol, int id)
+{
+ int ret;
+ int i, j;
+ u64 packed_segments = 0;
+ reiser4_subvol *meta_subv = get_meta_subvol();
+ reiser4_volinfo *vinfo = &vol->volinfo[id];
+
+ distribution_plugin *dist_plug = vol->dist_plug;
+ reiser4_block_nr volmap_loc;
+ u64 voltabs_needed;
+
+ assert("edward-2177", meta_subv->volmap_loc[id] == 0);
+
+ ret = reiser4_create_atom();
+ if (ret)
+ return ret;
+ /*
+ * allocate disk address of the first volmap block
+ */
+ ret = alloc_volinfo_block(&volmap_loc, meta_subv);
+ if (ret)
+ return ret;
+ /*
+ * set location of the first block of volume config
+ */
+ meta_subv->volmap_loc[id] = volmap_loc;
+
+ vinfo->num_volmaps = num_volmap_nodes(vol, vol->num_sgs_bits);
+ vinfo->num_voltabs = num_voltab_nodes(vol, vol->num_sgs_bits);
+ voltabs_needed = vinfo->num_voltabs;
+
+ vinfo->volmap_nodes =
+ kzalloc((vinfo->num_volmaps + vinfo->num_voltabs) *
+ sizeof(void *), GFP_KERNEL);
+
+ if (!vinfo->volmap_nodes) {
+ /*
+ * release disk address which was just allocated
+ */
+ reiser4_dealloc_block(&volmap_loc, 0,
+ BA_FORMATTED | BA_PERMANENT, meta_subv);
+ meta_subv->volmap_loc[id] = 0;
+ return -ENOMEM;
+ }
+ vinfo->voltab_nodes = vinfo->volmap_nodes + vinfo->num_volmaps;
+
+ for (i = 0; i < vinfo->num_volmaps; i++) {
+ struct volmap *volmap;
+
+ vinfo->volmap_nodes[i] =
+ reiser4_alloc_volinfo_head(&volmap_loc, meta_subv);
+ if (!vinfo->volmap_nodes[i]) {
+ reiser4_dealloc_block(&volmap_loc, 0,
+ BA_FORMATTED | BA_PERMANENT,
+ meta_subv);
+ ret = -ENOMEM;
+ goto unpin;
+ }
+ ret = jinit_new(vinfo->volmap_nodes[i], GFP_KERNEL);
+ if (ret)
+ goto unpin;
+ volmap = (struct volmap *)jdata(vinfo->volmap_nodes[i]);
+ /*
+		 * create and pack all voltabs pointed to by the current volmap
+ */
+ for (j = 0;
+ j < voltab_nodes_per_block() && voltabs_needed;
+ j++, voltabs_needed --) {
+
+ reiser4_block_nr voltab_loc;
+ /*
+ * allocate disk address for voltab node
+ */
+ ret = alloc_volinfo_block(&voltab_loc, meta_subv);
+ if (ret)
+ goto unpin;
+ assert("edward-1838", voltab_loc != 0);
+
+ volmap_set_entry_blk(volmap, j, voltab_loc);
+
+ vinfo->voltab_nodes[j] =
+ reiser4_alloc_volinfo_head(&voltab_loc,
+ meta_subv);
+ if (!vinfo->voltab_nodes[j]) {
+ reiser4_dealloc_block(&voltab_loc, 0,
+ BA_FORMATTED | BA_PERMANENT,
+ meta_subv);
+ ret = -ENOMEM;
+ goto unpin;
+ }
+ ret = jinit_new(vinfo->voltab_nodes[j],
+ GFP_KERNEL);
+ if (ret)
+ goto unpin;
+ dist_plug->v.pack(&vol->dcx,
+ jdata(vinfo->voltab_nodes[j]),
+ packed_segments,
+ segments_per_block(vol));
+ jrelse(vinfo->voltab_nodes[j]);
+
+ packed_segments += segments_per_block(vol);
+ }
+ if (i == vinfo->num_volmaps - 1)
+ /*
+ * current volmap node is the last one
+ */
+ set_next_volmap_addr(volmap, 0);
+ else {
+ /*
+ * allocate disk address of the next volmap block
+ * and store it in the current volmap block
+ */
+ ret = alloc_volinfo_block(&volmap_loc, meta_subv);
+ if (ret)
+ goto unpin;
+ set_next_volmap_addr(volmap, volmap_loc);
+ }
+ /*
+ * update volmap csum
+ */
+ jrelse(vinfo->volmap_nodes[i]);
+ }
+ return 0;
+ unpin:
+ release_volinfo_nodes(vinfo, 1 /* release disk addresses */);
+ meta_subv->volmap_loc[id] = 0;
+ return ret;
+}
+
+/*
+ * Capture an array of jnodes, make them dirty and mark them as relocate
+ */
+static int capture_array_nodes(jnode **start, u64 count)
+{
+ u64 i;
+ int ret;
+
+ for (i = 0; i < count; i++) {
+ jnode *node;
+ node = start[i];
+ set_page_dirty_notag(jnode_page(node));
+
+ spin_lock_jnode(node);
+ /*
+ * volinfo nodes are always written to new location
+ */
+ jnode_set_reloc(node);
+ ret = reiser4_try_capture(node, ZNODE_WRITE_LOCK, 0);
+ BUG_ON(ret != 0);
+ jnode_make_dirty_locked(node);
+ spin_unlock_jnode(node);
+ }
+ return 0;
+}
+
+static int capture_volume_dconf(reiser4_volume *vol, int id)
+{
+ int ret;
+ reiser4_volinfo *vinfo = &vol->volinfo[id];
+ /*
+ * Capture format superblock of meta-data brick with
+ * updated location of the first volmap block.
+ */
+ ret = capture_brick_super(get_meta_subvol());
+ if (ret)
+ return ret;
+ return capture_array_nodes(vinfo->volmap_nodes,
+ vinfo->num_volmaps + vinfo->num_voltabs);
+}
+
+/**
+ * Create the volume configuration, put it into a transaction
+ * and commit the latter.
+ */
+static int make_volume_dconf(reiser4_volume *vol)
+{
+ int ret;
+
+ ret = create_volume_dconf(vol, NEW_VOL_CONF);
+ if (ret)
+ return ret;
+ ret = capture_volume_dconf(vol, NEW_VOL_CONF);
+ if (ret)
+ goto error;
+ return 0;
+ error:
+ release_volinfo_nodes(&vol->volinfo[NEW_VOL_CONF],
+ 1 /* release disk addresses */);
+ return ret;
+}
+
+/*
+ * This is called at mount time
+ */
+static int load_volume_asym(reiser4_subvol *subv)
+{
+ if (subv->id != METADATA_SUBVOL_ID)
+ /*
+ * configuration of asymmetric volumes
+ * is stored only on meta-data brick
+ */
+ return 0;
+ if (subv->volmap_loc[CUR_VOL_CONF] == 0)
+ /*
+ * volume configuration is absent on disk
+ */
+ return 0;
+
+ return load_volume_dconf(subv);
+}
+
+static u64 get_pos_in_vol(reiser4_volume *vol, reiser4_subvol *subv);
+static int __remove_data_brick(reiser4_volume *vol, reiser4_subvol *victim);
+/*
+ * Init volume system info, which has already been loaded
+ * during disk format initialization of the subvolumes (components).
+ */
+static int init_volume_asym(struct super_block *sb, reiser4_volume *vol)
+{
+ int ret;
+ u32 subv_id;
+ u32 nr_victims = 0;
+ lv_conf *cur_conf = vol->conf;
+
+ if (!REISER4_PLANB_KEY_ALLOCATION) {
+ warning("edward-2161",
+ "Asymmetric LV requires Plan-B key allocation scheme");
+ return RETERR(-EINVAL);
+ }
+ assert("edward-2341", vol->buckets == NULL);
+ /*
+ * Create an abstract set of buckets for this volume
+ */
+ vol->buckets = create_buckets();
+ if (!vol->buckets)
+ return -ENOMEM;
+
+ if (reiser4_is_set(sb, REISER4_PROXY_ENABLED)) {
+ /*
+ * set proxy subvolume
+ */
+ for_each_mslot(cur_conf, subv_id) {
+ reiser4_subvol *subv;
+
+ if (!conf_mslot_at(cur_conf, subv_id))
+ continue;
+ subv = conf_origin(cur_conf, subv_id);
+ if (subvol_is_set(subv, SUBVOL_IS_PROXY)) {
+ vol->proxy = subv;
+ break;
+ }
+ }
+ assert("edward-2445", vol->proxy != NULL);
+ /*
+ * start a proxy flushing kernel thread here
+ */
+ }
+ if (!reiser4_volume_has_incomplete_removal(sb)) {
+ if (reiser4_volume_is_unbalanced(sb))
+ warning("", "Volume (%s) is unbalanced", sb->s_id);
+ return 0;
+ }
+ assert("edward-2250", current_volume() == vol);
+ /*
+ * prepare the volume for removal completion
+ */
+ assert("edward-2244", vol->new_conf == NULL);
+
+ for_each_mslot(cur_conf, subv_id) {
+ reiser4_subvol *subv;
+
+ if (!conf_mslot_at(cur_conf, subv_id))
+ continue;
+ subv = conf_origin(cur_conf, subv_id);
+ if (subvol_is_set(subv, SUBVOL_TO_BE_REMOVED)) {
+ vol->victim = subv;
+ nr_victims ++;
+ }
+ }
+ if (nr_victims > 1) {
+ warning("edward-2246",
+ "Too many bricks (%u) scheduled for removal",
+ nr_victims);
+ return -EIO;
+ } else if (nr_victims == 0)
+ goto out;
+
+ assert("edward-2251", vol->victim != NULL);
+ /*
+ * vol->victim is not a meta-data brick, as when removing a
+	 * meta-data brick we spawn only one checkpoint (similar to
+	 * the case of adding a brick). This is a peculiarity of
+	 * asymmetric volumes.
+ */
+ assert("edward-2252", !is_meta_brick(vol->victim));
+ /*
+ * new config will be created here
+ */
+ ret = __remove_data_brick(vol, vol->victim);
+ if (ret)
+ return ret;
+ assert("edward-2432", vol->new_conf != NULL);
+ assert("edward-2458", vol->new_conf->tab == NULL);
+
+ if (!subvol_is_set(vol->victim, SUBVOL_IS_PROXY)) {
+ bucket_t *new_vec;
+ new_vec = remove_bucket(vol->buckets, num_dsa_subvols(vol),
+ get_pos_in_dsa(vol->victim));
+ if (!new_vec) {
+ free_lv_conf(vol->new_conf);
+ vol->new_conf = NULL;
+ return -ENOMEM;
+ }
+ free_buckets(vol->buckets);
+ vol->buckets = new_vec;
+ } else
+ /*
+		 * Removal of the proxy brick was incomplete.
+		 * Disable IO requests against it.
+ */
+ reiser4_volume_clear_proxy_io(reiser4_get_current_sb());
+ /*
+ * borrow distribution table from the existing config (which
+ * includes the old set of slots and the new distribution table)
+ */
+ vol->new_conf->tab = vol->conf->tab;
+ /*
+ * Now announce incomplete removal.
+ * Volume configuration will be updated by remove_brick_tail_asym()
+ * called by reiser4_finish_removal().
+	 * FIXME: don't ask the user to finish removal.
+ * Call reiser4_finish_removal() right here instead.
+ */
+ out:
+ warning("", "Please, complete brick %s removal on volume %s",
+ vol->victim ? vol->victim->name : "Null",
+ sb->s_id);
+ return 0;
+}
+
+/**
+ * Bucket operations.
+ * The following methods translate bucket_t to mirror_t
+ */
+static u64 cap_at_asym(bucket_t *buckets, u64 idx)
+{
+ return ((mirror_t *)buckets)[idx]->data_capacity;
+}
+
+static void *apx_of_asym(bucket_t bucket)
+{
+ return ((mirror_t)bucket)->apx;
+}
+
+static void *apx_at_asym(bucket_t *buckets, u64 index)
+{
+ return apx_of_asym(buckets[index]);
+}
+
+static void apx_set_at_asym(bucket_t *buckets, u64 idx, void *apx)
+{
+ ((mirror_t *)buckets)[idx]->apx = apx;
+}
+
+static u64 *apx_lenp_at_asym(bucket_t *buckets, u64 idx)
+{
+ return &((mirror_t *)buckets)[idx]->apx_len;
+}
+
+static reiser4_subvol *origin_at(slot_t slot)
+{
+ return ((mirror_t *)slot)[0];
+}
+
+static u64 capacity_at(slot_t slot)
+{
+ return origin_at(slot)->data_capacity;
+}
+
+/**
+ * Return the number of busy data blocks, which are subject
+ * to distribution.
+ * @slot must represent a data brick! This function cannot be
+ * applied to the meta-data brick.
+ */
+static u64 data_blocks_occupied(slot_t slot)
+{
+ /*
+	 * From the total block count of the device we need
+	 * to subtract the number of system blocks (defined by
+	 * the disk format specification), which are always busy
+	 * and are not subject to distribution
+ */
+ return origin_at(slot)->block_count -
+ origin_at(slot)->min_blocks_used -
+ origin_at(slot)->blocks_free;
+}
+
+/**
+ * Return the first non-empty data slot.
+ * If no such slot is found, return NULL
+ */
+static slot_t find_first_nonempty_data_slot(void)
+{
+ u32 subv_id;
+ lv_conf *conf = current_lv_conf();
+
+ for_each_data_mslot(conf, subv_id)
+ if (conf->mslots[subv_id])
+ return conf->mslots[subv_id];
+ return NULL;
+}
+
+static u64 space_occupied_at(slot_t slot)
+{
+ if (is_meta_brick(origin_at(slot))) {
+ slot_t neighbor;
+ /*
+		 * In asymmetric LV we don't track the number of busy
+		 * data blocks on the meta-data brick. However, we can
+		 * estimate it from the portion of busy data blocks on
+		 * a neighboring data brick. The latter has to exist,
+		 * because there is no need to know the number of busy
+		 * data blocks in an asymmetric logical volume that
+		 * consists of a single meta-data brick (and,
+ * hence, to call this function).
+ */
+ assert("edward-2069", current_nr_origins() > 1);
+
+ neighbor = find_first_nonempty_data_slot();
+ BUG_ON(neighbor == NULL);
+
+ return div64_u64(capacity_at(slot) *
+ space_occupied_at(neighbor),
+ capacity_at(neighbor));
+ } else
+ /* data brick */
+ return data_blocks_occupied(slot);
+}
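+
+/*
+ * For illustration only: if the meta-data brick's capacity is 1000 blocks
+ * and a neighboring data brick with a capacity of 2000 blocks has 500 busy
+ * data blocks, space_occupied_at() estimates 1000 * 500 / 2000 = 250 busy
+ * data blocks for the meta-data brick.
+ */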
+
+/**
+ * Return the ordinal number of brick @subv in the array of all original
+ * bricks of the logical volume
+ */
+static u64 get_pos_in_vol(reiser4_volume *vol, reiser4_subvol *subv)
+{
+ u64 i, j;
+ lv_conf *conf = vol->conf;
+
+ for (i = 0, j = 0; i < conf->nr_mslots; i++) {
+ if (!conf->mslots[i])
+ continue;
+ if (conf->mslots[i][0] == subv)
+ return j;
+ j++;
+ }
+ assert("edward-2197", i == conf->nr_mslots);
+ assert("edward-2198", j == vol_nr_origins(vol));
+ return j;
+}
+
+/**
+ * Returns true if volume @vol includes @subv in its configuration.
+ * Pre-condition: the volume is read- or write-locked
+ */
+int brick_belongs_volume(reiser4_volume *vol, reiser4_subvol *subv)
+{
+ return get_pos_in_vol(vol, subv) < vol_nr_origins(vol);
+}
+
+/**
+ * Create a config which is identical to @old except for the pointer
+ * to the distribution table (which is NULL in the clone).
+ */
+static lv_conf *clone_lv_conf(lv_conf *old)
+{
+ lv_conf *new;
+
+ new = alloc_lv_conf(old->nr_mslots);
+ if (new) {
+ memcpy(new->mslots, old->mslots,
+ sizeof(slot_t) * old->nr_mslots);
+ }
+ return new;
+}
+
+static int resize_brick(reiser4_volume *vol, reiser4_subvol *this,
+ long long delta, int *need_balance)
+{
+ int ret;
+ int(*dst_resize_fn)(reiser4_dcx *, const void *, u64, bucket_t);
+
+ assert("edward-2393", delta != 0);
+
+ this->data_capacity += delta;
+
+ *need_balance = 1;
+ if (num_dsa_subvols(vol) == 1 ||
+ (is_meta_brick(this) && !is_dsa_brick(this))) {
+ *need_balance = 0;
+ return 0;
+ }
+ if (delta > 0)
+ dst_resize_fn = vol->dist_plug->v.inc;
+ else
+ dst_resize_fn = vol->dist_plug->v.dec;
+
+ ret = dst_resize_fn(&vol->dcx, vol->conf->tab,
+ get_pos_in_dsa(this), NULL);
+ if (ret)
+ goto error;
+
+ vol->new_conf = clone_lv_conf(vol->conf);
+ if (vol->new_conf == NULL) {
+ ret = -ENOMEM;
+ goto error;
+ }
+ return 0;
+ error:
+ this->data_capacity -= delta;
+ return ret;
+}
+
+static int __add_meta_brick(reiser4_volume *vol, reiser4_subvol *new)
+{
+ assert("edward-2433", is_meta_brick(new));
+ /*
+ * Clone in-memory volume config
+ */
+ vol->new_conf = clone_lv_conf(vol->conf);
+ if (vol->new_conf == NULL)
+ return RETERR(-ENOMEM);
+ return 0;
+}
+
+static int add_meta_brick(reiser4_volume *vol, reiser4_subvol *new,
+ bucket_t **old_vec)
+{
+ int ret;
+ bucket_t *new_vec;
+
+ assert("edward-1820", is_meta_brick(new));
+ /*
+	 * We don't need to activate the meta-data brick:
+	 * it is always active during the mount session of the logical volume.
+	 *
+	 * The number of bricks and slots in the logical volume remains the same.
+	 *
+	 * Insert @new at the first position in the set of abstract buckets
+ */
+ new_vec = insert_bucket(vol->buckets, new, num_dsa_subvols(vol),
+ METADATA_SUBVOL_ID /* position to insert */);
+ if (!new_vec)
+ return RETERR(-ENOMEM);
+ *old_vec = vol->buckets;
+ vol->buckets = new_vec;
+ /*
+ * Create new distribution table.
+ */
+ ret = vol->dist_plug->v.inc(&vol->dcx, vol->conf->tab,
+ METADATA_SUBVOL_ID /* pos */, new);
+ if (ret)
+ goto error;
+ /*
+ * finally, clone in-memory volume config
+ */
+	ret = __add_meta_brick(vol, new);
+ if (ret)
+ goto error;
+ new->flags |= (1 << SUBVOL_HAS_DATA_ROOM);
+ return 0;
+ error:
+ vol->buckets = *old_vec;
+ free_buckets(new_vec);
+ return ret;
+}
+
+/**
+ * Find the first empty slot in the volume's array of slots and
+ * return its offset in that array. If all slots are busy,
+ * return the total number of slots.
+ */
+static u32 find_first_empty_slot_off(void)
+{
+ u32 subv_id;
+ lv_conf *conf = current_lv_conf();
+
+ for_each_data_mslot(conf, subv_id)
+ if (conf->mslots[subv_id] == NULL)
+ return subv_id;
+
+ assert("edward-2183", conf->nr_mslots == current_nr_origins());
+ return conf->nr_mslots;
+}
+
+/**
+ * Create new in-memory volume config
+ */
+int __add_data_brick(reiser4_volume *vol, reiser4_subvol *this, u64 pos_in_vol)
+{
+ u64 old_nr_mslots = vol->conf->nr_mslots;
+ slot_t new_slot;
+ /*
+ * Assign internal ID for the new brick
+ */
+ this->id = pos_in_vol;
+ /*
+ * Create new in-memory volume config
+ */
+ new_slot = alloc_mslot(1 + this->num_replicas);
+ if (!new_slot)
+ return RETERR(-ENOMEM);
+
+ ((mirror_t *)new_slot)[this->mirror_id] = this;
+
+ if (pos_in_vol == old_nr_mslots)
+ /*
+		 * There are no free slots in the old config -
+ * create a new one with a larger number of slots
+ */
+ vol->new_conf = alloc_lv_conf(1 + old_nr_mslots);
+ else
+ vol->new_conf = alloc_lv_conf(old_nr_mslots);
+ if (!vol->new_conf) {
+ free_mslot(new_slot);
+ return RETERR(-ENOMEM);
+ }
+ memcpy(vol->new_conf->mslots, vol->conf->mslots,
+ sizeof(slot_t) * old_nr_mslots);
+ vol->new_conf->mslots[pos_in_vol] = new_slot;
+
+ atomic_inc(&vol->nr_origins);
+ return 0;
+}
+
+/**
+ * Find a respective position in DSA by mslot index
+ */
+u64 pos_in_dsa_by_mslot(u64 mslot_idx)
+{
+	u32 i, j;
+ reiser4_volume *vol = current_volume();
+
+ for (i = 0, j = 0; i < mslot_idx; i++) {
+ if (!vol->conf->mslots[i])
+ continue;
+ if (!is_dsa_brick(conf_origin(vol->conf, i)))
+ continue;
+ j++;
+ }
+ return j;
+}
+
+int add_data_brick(reiser4_volume *vol, reiser4_subvol *this,
+ bucket_t **old_vec)
+{
+ int ret;
+ u64 free_mslot_idx;
+ u64 pos_in_dsa;
+ bucket_t *new_vec;
+
+ assert("edward-1929", !is_meta_brick(this));
+
+ free_mslot_idx = find_first_empty_slot_off();
+ pos_in_dsa = pos_in_dsa_by_mslot(free_mslot_idx);
+ /*
+ * insert @this to the set of abstract buckets
+ */
+ new_vec = insert_bucket(vol->buckets, this,
+ num_dsa_subvols(vol), pos_in_dsa);
+ if (!new_vec)
+ return -ENOMEM;
+ *old_vec = vol->buckets;
+ vol->buckets = new_vec;
+
+ /*
+ * create new in-memory volume config
+ */
+ ret = __add_data_brick(vol, this, free_mslot_idx);
+ if (ret)
+ goto error;
+ /*
+ * finally, create new distribution table
+ */
+ return vol->dist_plug->v.inc(&vol->dcx, vol->conf->tab,
+ pos_in_dsa, this);
+ error:
+ vol->buckets = *old_vec;
+ free_buckets(new_vec);
+ return ret;
+}
+
+static int resize_brick_asym(reiser4_volume *vol, reiser4_subvol *this,
+ long long delta, int *need_balance)
+{
+ int ret;
+ struct super_block *sb = reiser4_get_current_sb();
+ reiser4_dcx *rdcx = &vol->dcx;
+ distribution_plugin *dist_plug = vol->dist_plug;
+ lv_conf *old_conf = vol->conf;
+
+ assert("edward-1824", vol != NULL);
+ assert("edward-1825", dist_plug != NULL);
+
+ if (is_proxy_brick(this)) {
+ warning("edward-2447",
+ "Can't resize proxy brick %s", this->name);
+ return RETERR(-EINVAL);
+ }
+ ret = dist_plug->v.init(&vol->conf->tab,
+ num_dsa_subvols(vol),
+ vol->num_sgs_bits, rdcx);
+ if (ret)
+ return ret;
+
+ ret = resize_brick(vol, this, delta, need_balance);
+ dist_plug->v.done(rdcx);
+ if (ret)
+ return ret;
+ if (!(*need_balance)) {
+ ret = capture_brick_super(this);
+ if (ret)
+ goto error;
+ printk("reiser4 (%s): Changed data capacity of brick %s.\n",
+ sb->s_id, this->name);
+ return 0;
+ }
+ assert("edward-2394", vol->new_conf != NULL);
+
+ ret = make_volume_dconf(vol);
+ if (ret)
+ goto error;
+ ret = update_volume_dconf(vol);
+ if (ret)
+ goto error;
+ dist_plug->r.replace(&vol->dcx, &vol->new_conf->tab);
+ /*
+ * write unbalanced status and new data capacity to disk
+ */
+ reiser4_volume_set_unbalanced(sb);
+ ret = capture_brick_super(get_meta_subvol());
+ if (ret)
+ goto error;
+ ret = capture_brick_super(this);
+ if (ret)
+ goto error;
+ ret = force_commit_current_atom();
+ if (ret)
+ goto error;
+ /*
+ * publish the new config
+ */
+ rcu_assign_pointer(vol->conf, vol->new_conf);
+ synchronize_rcu();
+ free_lv_conf(old_conf);
+ vol->new_conf = NULL;
+
+ printk("reiser4 (%s): Changed data capacity of brick %s.\n",
+ sb->s_id, this->name);
+ return 0;
+ error:
+ /*
+ * resize failed - it should be repeated in regular context
+ */
+ free_lv_conf(vol->new_conf);
+ vol->new_conf = NULL;
+ return ret;
+}
+
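+/**
+ * Add @new as a proxy brick to asymmetric logical volume @vol.
+ * A proxy brick doesn't participate in regular data distribution,
+ * so the distribution table is borrowed from the old config
+ */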
+static int add_proxy_asym(reiser4_volume *vol, reiser4_subvol *new)
+{
+ int ret;
+ lv_conf *old_conf = vol->conf;
+ struct super_block *sb = reiser4_get_current_sb();
+
+ if (is_meta_brick(new) && (vol_nr_origins(vol) == 1)) {
+ warning("edward-2434",
+ "Single meta-data brick can not be proxy");
+ return -EINVAL;
+ }
+ if (new == get_meta_subvol())
+ ret = __add_meta_brick(vol, new);
+ else
+ ret = __add_data_brick(vol, new, find_first_empty_slot_off());
+ if (ret)
+ return ret;
+ assert("edward-2436", vol->new_conf != NULL);
+ assert("edward-2459", vol->new_conf->tab == NULL);
+
+ if (new != get_meta_subvol()) {
+ /*
+ * add a record about @new to the meta-data brick
+ */
+ ret = reiser4_grab_space(estimate_one_insert_into_item(
+ meta_subvol_tree()),
+ BA_CAN_COMMIT, get_meta_subvol());
+ if (ret)
+ goto error;
+ ret = brick_symbol_add(new);
+ if (ret)
+ goto error;
+ }
+ reiser4_volume_set_proxy_enabled(sb);
+ reiser4_volume_set_proxy_io(sb);
+ clear_bit(SUBVOL_HAS_DATA_ROOM, &new->flags);
+
+ ret = capture_brick_super(get_meta_subvol());
+ if (ret)
+ goto error;
+ ret = capture_brick_super(new);
+ if (ret)
+ goto error;
+ /*
+ * borrow distribution table from the old config
+ */
+ vol->new_conf->tab = old_conf->tab;
+ old_conf->tab = NULL;
+ /*
+ * publish the new config
+ */
+ rcu_assign_pointer(vol->conf, vol->new_conf);
+ synchronize_rcu();
+ free_lv_conf(old_conf);
+ vol->new_conf = NULL;
+ vol->proxy = new;
+ /*
+	 * after publishing the new config (not before!) write superblocks
+	 * of the meta-data brick and of the proxy brick
+ */
+ force_commit_current_atom();
+
+ /* FIXME: start a proxy flushing kernel thread here */
+
+	printk("reiser4 (%s): Proxy brick %s has been added.\n",
+ sb->s_id, new->name);
+ return 0;
+ error:
+ /* adding a proxy should be repeated in regular context */
+
+ clear_bit(SUBVOL_IS_PROXY, &new->flags);
+ if (!is_meta_brick(new))
+ new->flags |= (1 << SUBVOL_HAS_DATA_ROOM);
+ reiser4_volume_clear_proxy_enabled(sb);
+ reiser4_volume_clear_proxy_io(sb);
+
+ free_lv_conf(vol->new_conf);
+ vol->new_conf = NULL;
+ return ret;
+}
+
+/**
+ * Add a @new brick to asymmetric logical volume @vol
+ */
+static int add_brick_asym(reiser4_volume *vol, reiser4_subvol *new)
+{
+ int ret;
+ distribution_plugin *dist_plug = vol->dist_plug;
+ lv_conf *old_conf = vol->conf;
+ struct super_block *sb = reiser4_get_current_sb();
+
+ bucket_t *old_vec;
+ bucket_t *new_vec;
+
+ assert("edward-1931", dist_plug != NULL);
+ assert("edward-2262", vol->conf != NULL);
+ assert("edward-2239", vol->new_conf == NULL);
+
+ if (new->data_capacity == 0) {
+ warning("edward-1962", "Can't add brick of zero capacity");
+ return -EINVAL;
+ }
+ /*
+	 * We allow adding the meta-data brick without any further conditions.
+	 * In contrast, any data brick being added has to be empty.
+ */
+ if (new != get_meta_subvol() &&
+ reiser4_subvol_used_blocks(new) >
+ reiser4_subvol_min_blocks_used(new)) {
+		warning("edward-2334", "Can't add non-empty data brick %s",
+ new->name);
+ return -EINVAL;
+ }
+ if (brick_belongs_volume(vol, new) && is_dsa_brick(new)) {
+ /*
+		 * brick already participates in regular data distribution
+ */
+ warning("edward-1963", "Can't add brick to DSA twice");
+ return -EINVAL;
+ }
+ if (subvol_is_set(new, SUBVOL_IS_PROXY))
+ return add_proxy_asym(vol, new);
+
+ /* reserve space on meta-data subvolume for brick symbol insertion */
+ grab_space_enable();
+ ret = reiser4_grab_space(estimate_one_insert_into_item(
+ meta_subvol_tree()),
+ BA_CAN_COMMIT, get_meta_subvol());
+ if (ret)
+ return ret;
+ ret = dist_plug->v.init(&vol->conf->tab,
+ num_dsa_subvols(vol),
+ vol->num_sgs_bits,
+ &vol->dcx);
+ if (ret)
+ return ret;
+ /*
+ * Create new in-memory volume config
+ */
+ if (new == get_meta_subvol())
+ ret = add_meta_brick(vol, new, &old_vec);
+ else
+ ret = add_data_brick(vol, new, &old_vec);
+
+ dist_plug->v.done(&vol->dcx);
+ if (ret)
+ return ret;
+ assert("edward-2240", vol->new_conf != NULL);
+ assert("edward-2462", vol->new_conf->tab == NULL);
+
+ ret = make_volume_dconf(vol);
+ if (ret)
+ goto error;
+ ret = update_volume_dconf(vol);
+ if (ret)
+ goto error;
+ if (new != get_meta_subvol()) {
+ /* add a record about @new to the volume */
+ ret = brick_symbol_add(new);
+ if (ret)
+ goto error;
+ }
+ dist_plug->r.replace(&vol->dcx, &vol->new_conf->tab);
+
+ reiser4_volume_set_unbalanced(reiser4_get_current_sb());
+ ret = capture_brick_super(get_meta_subvol());
+ if (ret)
+ goto error;
+ ret = capture_brick_super(new);
+ if (ret)
+ goto error;
+ /*
+ * Now publish the new config
+ */
+ rcu_assign_pointer(vol->conf, vol->new_conf);
+ synchronize_rcu();
+ free_lv_conf(old_conf);
+ vol->new_conf = NULL;
+ free_buckets(old_vec);
+ /*
+	 * after publishing the new config (not before!) write superblocks
+	 * of the meta-data brick and of the newly added brick
+ */
+ force_commit_current_atom();
+
+	printk("reiser4 (%s): Brick %s has been added.\n", sb->s_id, new->name);
+ return 0;
+ error:
+ /*
+ * adding a brick should be repeated in regular context
+ */
+ if (is_meta_brick(new))
+ clear_bit(SUBVOL_HAS_DATA_ROOM, &new->flags);
+ reiser4_volume_clear_unbalanced(reiser4_get_current_sb());
+
+ new_vec = vol->buckets;
+ vol->buckets = old_vec;
+ free_buckets(new_vec);
+
+ free_lv_conf(vol->new_conf);
+ vol->new_conf = NULL;
+ return ret;
+}
+
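+/**
+ * Return the total number of data blocks occupied on all bricks
+ * participating in regular data distribution (all transactions are
+ * forcibly committed first)
+ */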
+static u64 space_occupied(void)
+{
+ u64 ret = 0;
+ u64 subv_id;
+ lv_conf *conf = current_lv_conf();
+
+ txnmgr_force_commit_all(reiser4_get_current_sb(), 0);
+
+ for_each_mslot(conf, subv_id) {
+ if (!conf->mslots[subv_id] ||
+ !is_dsa_brick(conf_origin(conf, subv_id)))
+ continue;
+ ret += space_occupied_at(conf->mslots[subv_id]);
+ }
+ return ret;
+}
+
+static int __remove_meta_brick(reiser4_volume *vol)
+{
+ /*
+ * Clone in-memory volume config
+ */
+ vol->new_conf = clone_lv_conf(vol->conf);
+ if (vol->new_conf == NULL)
+ return RETERR(-ENOMEM);
+ return 0;
+}
+
+static int remove_meta_brick(reiser4_volume *vol, bucket_t **old_vec)
+{
+ int ret;
+ reiser4_subvol *mtd_subv = get_meta_subvol();
+ distribution_plugin *dist_plug = vol->dist_plug;
+ bucket_t *new_vec;
+
+ assert("edward-1844", num_dsa_subvols(vol) > 1);
+
+ if (!is_dsa_brick(mtd_subv)) {
+ warning("edward-2331",
+ "Metadata brick doesn't belong to DSA. Can't remove.");
+ return RETERR(-EINVAL);
+ }
+ /*
+ * remove meta-data brick from the set of abstract buckets
+ */
+ new_vec = remove_bucket(vol->buckets, num_dsa_subvols(vol),
+ METADATA_SUBVOL_ID /* position in DSA */);
+ if (!new_vec)
+ return RETERR(-ENOMEM);
+
+ *old_vec = vol->buckets;
+ vol->buckets = new_vec;
+
+ ret = dist_plug->v.dec(&vol->dcx, vol->conf->tab,
+ METADATA_SUBVOL_ID, mtd_subv);
+ if (ret)
+ goto error;
+
+ ret = __remove_meta_brick(vol);
+ if (ret)
+ goto error;
+
+ clear_bit(SUBVOL_HAS_DATA_ROOM, &mtd_subv->flags);
+ assert("edward-1827", !is_dsa_brick(mtd_subv));
+ return 0;
+ error:
+ vol->buckets = *old_vec;
+ free_buckets(new_vec);
+ return ret;
+}
+
+/**
+ * Scan the slots right to left, starting from the next-to-last one,
+ * and return the offset + 1 of the first non-empty slot found.
+ * If all of them are empty, return 0.
+ */
+static u32 get_new_nr_mslots(void)
+{
+ u32 i;
+ lv_conf *conf = current_lv_conf();
+
+ assert("edward-2208", conf->nr_mslots > 1);
+
+ for (i = conf->nr_mslots - 2;; i--) {
+ if (conf->mslots[i])
+ return i + 1;
+ if (i == 0)
+ break;
+ }
+ return 0;
+}
+
+static int __remove_data_brick(reiser4_volume *vol, reiser4_subvol *victim)
+{
+ lv_conf *old = vol->conf;
+ u64 old_num_subvols = vol_nr_origins(vol);
+ u64 pos_in_vol;
+ u32 new_nr_mslots;
+
+ assert("edward-2253", vol->new_conf == NULL);
+
+ pos_in_vol = get_pos_in_vol(vol, victim);
+ assert("edward-2199", pos_in_vol < old_num_subvols);
+
+ if (pos_in_vol == old_num_subvols - 1) {
+ /*
+ * removing the rightmost brick -
+ * config will be replaced with a new one
+ * with a smaller number of slots.
+ */
+ new_nr_mslots = get_new_nr_mslots();
+ BUG_ON(new_nr_mslots == 0);
+ } else
+ new_nr_mslots = old->nr_mslots;
+
+ vol->new_conf = alloc_lv_conf(new_nr_mslots);
+ if (!vol->new_conf)
+ return RETERR(-ENOMEM);
+ memcpy(vol->new_conf->mslots, old->mslots,
+ new_nr_mslots * sizeof(slot_t));
+
+ if (pos_in_vol != old_num_subvols - 1) {
+ /*
+ * In the new config mark respective slot as empty
+ */
+ assert("edward-2241",
+ vol->new_conf->mslots[victim->id] != NULL);
+ vol->new_conf->mslots[victim->id] = NULL;
+ }
+ return 0;
+}
+
+static int remove_data_brick(reiser4_volume *vol, reiser4_subvol *victim,
+ bucket_t **old_vec)
+{
+ int ret;
+ u32 pos_in_dsa;
+ bucket_t *new_vec;
+
+ ret = __remove_data_brick(vol, victim);
+ if (ret)
+ return ret;
+
+ pos_in_dsa = get_pos_in_dsa(victim);
+
+ new_vec = remove_bucket(vol->buckets, num_dsa_subvols(vol), pos_in_dsa);
+ if (!new_vec)
+ return -ENOMEM;
+
+ *old_vec = vol->buckets;
+ vol->buckets = new_vec;
+
+ ret = vol->dist_plug->v.dec(&vol->dcx, vol->conf->tab,
+ pos_in_dsa, victim);
+ if (ret) {
+ /*
+ * release resources allocated by and
+ * roll back changes made by __remove_data_brick()
+ */
+ bucket_t *new_vec = vol->buckets;
+ vol->buckets = *old_vec;
+ free_buckets(new_vec);
+
+ free_lv_conf(vol->new_conf);
+ vol->new_conf = NULL;
+ return ret;
+ }
+ victim->flags |= (1 << SUBVOL_TO_BE_REMOVED);
+ return 0;
+}
+
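+/**
+ * Schedule removal of proxy brick @victim: disable IO against the
+ * proxy and set unbalanced and incomplete-removal status on disk;
+ * the balancing procedure will then move all data off the proxy brick
+ */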
+static int remove_proxy_asym(reiser4_volume *vol, reiser4_subvol *victim)
+{
+ int ret;
+ struct super_block *sb = reiser4_get_current_sb();
+
+ /*
+	 * Prepare a new volume config with a different set of bricks,
+ * not including the proxy brick, and the same distribution
+ * table
+ */
+ if (is_meta_brick(victim))
+ ret = __remove_meta_brick(vol);
+ else
+ ret = __remove_data_brick(vol, victim);
+ if (ret)
+ return ret;
+ assert("edward-2437", vol->new_conf != NULL);
+ assert("edward-2460", vol->new_conf->tab == NULL);
+ /*
+ * borrow distribution table from the old config
+ */
+ vol->new_conf->tab = vol->conf->tab;
+ /*
+ * Disable IO requests against the proxy brick to be removed
+ */
+ reiser4_volume_clear_proxy_io(sb);
+
+ if (!is_meta_brick(victim))
+ capture_brick_super(victim);
+ /*
+	 * set unbalanced status and add the format super-block
+	 * of the meta-data brick to the transaction
+ */
+ reiser4_volume_set_unbalanced(sb);
+ reiser4_volume_set_incomplete_removal(sb);
+ ret = capture_brick_super(get_meta_subvol());
+ if (ret)
+ goto error;
+ /*
+ * write unbalanced and incomplete removal status to disk
+ */
+ ret = force_commit_current_atom();
+ if (ret)
+ goto error;
+ /*
+ * the volume will be balanced with the old distribution table -
+ * it will move all data from the proxy brick to other bricks
+ * of the volume
+ */
+ return 0;
+ error:
+ /*
+ * proxy removal should be repeated in regular context
+ */
+ reiser4_volume_clear_unbalanced(sb);
+ reiser4_volume_clear_incomplete_removal(sb);
+
+ reiser4_volume_set_proxy_enabled(sb);
+ free_lv_conf(vol->new_conf);
+ vol->new_conf = NULL;
+ return ret;
+}
+
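+/**
+ * Remove brick @victim from asymmetric logical volume @vol: build a new
+ * distribution table not including @victim, write the new configuration
+ * to disk and schedule the brick for removal (actual data migration is
+ * performed by the balancing procedure)
+ */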
+static int remove_brick_asym(reiser4_volume *vol, reiser4_subvol *victim)
+{
+ int ret;
+ lv_conf *tmp_conf;
+ lv_conf *old_conf = vol->conf;
+ distribution_plugin *dist_plug = vol->dist_plug;
+ struct super_block *sb = reiser4_get_current_sb();
+ u32 old_nr_dsa_bricks = num_dsa_subvols(vol);
+ bucket_t *old_vec;
+ bucket_t *new_vec;
+
+ assert("edward-1830", vol != NULL);
+ assert("edward-1846", dist_plug != NULL);
+
+ vol->victim = victim;
+
+ if (subvol_is_set(victim, SUBVOL_IS_PROXY))
+ return remove_proxy_asym(vol, victim);
+
+ if (old_nr_dsa_bricks == 1) {
+ warning("edward-1941",
+ "Can't remove the single brick from DSA");
+ return RETERR(-EINVAL);
+ }
+ ret = dist_plug->v.init(&vol->conf->tab,
+ old_nr_dsa_bricks, vol->num_sgs_bits,
+ &vol->dcx);
+ if (ret)
+ return ret;
+
+ if (is_meta_brick(victim))
+ ret = remove_meta_brick(vol, &old_vec);
+ else
+ ret = remove_data_brick(vol, victim, &old_vec);
+ dist_plug->v.done(&vol->dcx);
+ if (ret)
+ return ret;
+ assert("edward-2242", vol->new_conf != NULL);
+ assert("edward-2461", vol->new_conf->tab == NULL);
+
+ ret = make_volume_dconf(vol);
+ if (ret)
+ goto error;
+ ret = update_volume_dconf(vol);
+ if (ret)
+ goto error;
+ dist_plug->r.replace(&vol->dcx, &vol->new_conf->tab);
+ /*
+	 * Prepare a temporary config for balancing. This config has
+	 * the same set of bricks, but an updated distribution table
+ */
+ tmp_conf = clone_lv_conf(old_conf);
+ if (!tmp_conf) {
+ ret = RETERR(-ENOMEM);
+ goto error;
+ }
+ /* borrow distribution table from the new config */
+ tmp_conf->tab = vol->new_conf->tab;
+
+ if (!is_meta_brick(victim))
+ capture_brick_super(victim);
+ /*
+	 * set unbalanced status and add the format super-block
+	 * of the meta-data brick to the transaction
+ */
+ reiser4_volume_set_unbalanced(sb);
+ reiser4_volume_set_incomplete_removal(sb);
+ ret = capture_brick_super(get_meta_subvol());
+ if (ret) {
+ tmp_conf->tab = NULL;
+ free_lv_conf(tmp_conf);
+ goto error;
+ }
+ /*
+ * write unbalanced and incomplete removal status to disk
+ */
+ ret = force_commit_current_atom();
+ if (ret)
+ goto error;
+ /*
+	 * The new configuration is written to disk.
+	 * From now on the brick removal operation can not be rolled
+	 * back on error paths. Instead, it should be completed in the
+	 * context of a special completion operation
+ */
+ /*
+	 * Publish the temporary config
+ */
+ rcu_assign_pointer(vol->conf, tmp_conf);
+ synchronize_rcu();
+ free_lv_conf(old_conf);
+ free_buckets(old_vec);
+ /*
+ * From now on the file system doesn't allocate disk
+ * addresses on the brick to be removed
+ */
+ printk("reiser4 (%s): Brick %s scheduled for removal.\n",
+ sb->s_id, victim->name);
+ return 0;
+ error:
+ /*
+ * brick removal should be repeated in regular context
+ */
+ reiser4_volume_clear_unbalanced(sb);
+ reiser4_volume_clear_incomplete_removal(sb);
+ if (is_meta_brick(victim))
+ victim->flags |= (1 << SUBVOL_HAS_DATA_ROOM);
+
+ new_vec = vol->buckets;
+ vol->buckets = old_vec;
+ free_buckets(new_vec);
+
+ free_lv_conf(vol->new_conf);
+ vol->new_conf = NULL;
+ return ret;
+}
+
+static int reserve_brick_symbol_del(void)
+{
+ reiser4_subvol *subv = get_meta_subvol();
+ /*
+	 * grab one block on the meta-data brick to remove
+ * one item from a formatted node
+ */
+ assert("edward-2303",
+ lock_stack_isclean(get_current_lock_stack()));
+ grab_space_enable();
+ return reiser4_grab_reserved(reiser4_get_current_sb(),
+ estimate_one_item_removal(&subv->tree),
+ BA_CAN_COMMIT, subv);
+}
+
+/**
+ * Pre-condition: all data have been moved out of the brick to be removed
+ * by the balancing procedure, and the unbalanced status has been
+ * successfully cleared on disk
+ */
+int remove_brick_tail_asym(reiser4_volume *vol, reiser4_subvol *victim)
+{
+ int ret;
+ int is_proxy = 0;
+ lv_conf *cur_conf = vol->conf;
+
+ if (!is_meta_brick(victim))
+ clear_bit(SUBVOL_TO_BE_REMOVED, &victim->flags);
+
+ if (subvol_is_set(victim, SUBVOL_IS_PROXY)) {
+ is_proxy = 1;
+ assert("edward-2448",
+ !subvol_is_set(victim, SUBVOL_HAS_DATA_ROOM));
+
+ reiser4_volume_clear_proxy_enabled(reiser4_get_current_sb());
+ clear_bit(SUBVOL_IS_PROXY, &victim->flags);
+ if (!is_meta_brick(victim))
+ victim->flags |= (1 << SUBVOL_HAS_DATA_ROOM);
+ }
+ ret = capture_brick_super(victim);
+ if (ret)
+ goto error;
+ /*
+	 * We are about to release @victim with its replicas.
+	 * Before this, it is absolutely necessary to
+	 * commit everything to make sure that there are
+	 * no pending IOs addressed to @victim and its
+	 * replicas.
+ *
+ * During this commit @victim gets the last IO
+ * request as a member of the logical volume.
+ */
+ txnmgr_force_commit_all(victim->super, 0);
+ all_grabbed2free();
+ reiser4_txn_restart_current();
+ /*
+ * Publish final config with updated set of slots
+ */
+ if (!is_meta_brick(victim)) {
+ if (reiser4_subvol_used_blocks(victim) >
+ reiser4_subvol_min_blocks_used(victim)) {
+ warning("edward-2335",
+				"Can't remove non-empty data brick %s",
+ victim->name);
+ ret = RETERR(-EAGAIN);
+ goto error;
+ }
+ /*
+ * remove a record about @victim from the volume
+ * and decrement number of bricks in the same
+ * transaction
+ */
+ ret = reserve_brick_symbol_del();
+ if (ret)
+ goto error;
+ ret = brick_symbol_del(victim);
+ reiser4_release_reserved(reiser4_get_current_sb());
+ if (ret)
+ goto error;
+ atomic_dec(&vol->nr_origins);
+ }
+ /*
+ * From now on we can not fail. Moreover, remove_brick_tail()
+ * must not be called for this brick once again.
+ */
+ vol->victim = NULL;
+ /*
+ * Publish final config with updated set of slots,
+ * which doesn't contain @victim
+ */
+ rcu_assign_pointer(vol->conf, vol->new_conf);
+ /*
+	 * Release @victim with its replicas. This is safe,
+	 * as at this point nobody is aware of them
+ */
+ if (!is_meta_brick(victim))
+ free_mslot_at(cur_conf, victim->id);
+
+ synchronize_rcu();
+ cur_conf->tab = NULL;
+ free_lv_conf(cur_conf);
+ vol->new_conf = NULL;
+ if (is_proxy)
+ vol->proxy = NULL;
+
+ printk("reiser4 (%s): %s %s has been removed.\n",
+ victim->super->s_id, is_proxy ? "Proxy" : "Brick",
+ victim->name);
+ return 0;
+ error:
+ /*
+ * brick removal should be completed in the context of
+ * a special removal completion operation
+ */
+ victim->flags |= (1 << SUBVOL_TO_BE_REMOVED);
+ if (is_proxy) {
+ set_bit(SUBVOL_IS_PROXY, &victim->flags);
+ reiser4_volume_set_proxy_enabled(reiser4_get_current_sb());
+ if (!is_meta_brick(victim))
+ clear_bit(SUBVOL_HAS_DATA_ROOM, &victim->flags);
+ }
+ return ret;
+}
+
+static int init_volume_simple(struct super_block *sb, reiser4_volume *vol)
+{
+ if (!REISER4_PLANA_KEY_ALLOCATION) {
+ warning("edward-2376",
+ "Simple volume requires Plan-A key allocation scheme");
+ return RETERR(-EINVAL);
+ }
+ return 0;
+}
+
+static u64 meta_subvol_id_simple(void)
+{
+ return METADATA_SUBVOL_ID;
+}
+
+static u64 calc_brick_simple(lv_conf *conf, const struct inode *inode,
+ loff_t offset)
+{
+ return METADATA_SUBVOL_ID;
+}
+
+static int remove_brick_simple(reiser4_volume *vol, reiser4_subvol *this)
+{
+ warning("", "remove_brick operation is undefined for simple volumes");
+ return -EINVAL;
+}
+
+static int resize_brick_simple(reiser4_volume *vol, reiser4_subvol *this,
+ long long delta, int *need_balance)
+{
+ warning("", "resize operation is undefined for simple volumes");
+ return -EINVAL;
+}
+
+static int add_brick_simple(reiser4_volume *vol, reiser4_subvol *new)
+{
+ warning("", "add_brick operation is undefined for simple volumes");
+ return -EINVAL;
+}
+
+static int balance_volume_simple(struct super_block *sb, u32 flags)
+{
+ warning("", "balance operation is undefined for simple volumes");
+ return -EINVAL;
+}
+
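+/**
+ * Calculate a per-file seed for the distribution lookup by hashing
+ * the (little-endian) object id and the volume uuid with murmur3
+ */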
+static inline u32 get_seed(oid_t oid, reiser4_volume *vol)
+{
+ u32 seed;
+
+ put_unaligned(cpu_to_le64(oid), &oid);
+
+ seed = murmur3_x86_32((const char *)&oid, sizeof(oid), ~0);
+ seed = murmur3_x86_32(vol->uuid, 16, seed);
+ return seed;
+}
+
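+/**
+ * Return internal ID of the brick where the data stripe of @inode at
+ * @offset should be stored. If the volume has no distribution table
+ * (the DSA consists of a single brick), return the ID of that brick;
+ * otherwise consult the distribution plugin
+ */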
+static u64 calc_brick_asym(lv_conf *conf, const struct inode *inode,
+ loff_t offset)
+{
+ assert("edward-2267", conf != NULL);
+
+ if (!conf->tab) {
+ /*
+		 * The DSA includes only one brick: either the meta-data
+		 * brick, or one of the next two bricks to the right of it
+ */
+ assert("edward-2474", num_dsa_subvols(current_volume()) == 1);
+ return meta_brick_belongs_dsa() ? METADATA_SUBVOL_ID :
+ is_dsa_brick(conf_origin(conf,
+ METADATA_SUBVOL_ID + 1)) ?
+ METADATA_SUBVOL_ID + 1 : METADATA_SUBVOL_ID + 2;
+ } else {
+ u64 stripe_idx;
+ reiser4_volume *vol = current_volume();
+ distribution_plugin *dist_plug = current_dist_plug();
+
+ if (vol->stripe_bits) {
+ stripe_idx = offset >> vol->stripe_bits;
+ put_unaligned(cpu_to_le64(stripe_idx), &stripe_idx);
+ } else
+ stripe_idx = 0;
+
+ return dist_plug->r.lookup(&vol->dcx, inode,
+ (const char *)&stripe_idx,
+ sizeof(stripe_idx),
+ get_seed(get_inode_oid(inode), vol),
+ conf->tab);
+ }
+}
+
+u64 get_meta_subvol_id(void)
+{
+ return current_vol_plug()->meta_subvol_id();
+}
+
+reiser4_subvol *get_meta_subvol(void)
+{
+ return current_origin(get_meta_subvol_id());
+}
+
+reiser4_subvol *super_meta_subvol(struct super_block *super)
+{
+ return super_origin(super, super_vol_plug(super)->meta_subvol_id());
+}
+
+u64 find_brick_simple(const coord_t *coord)
+{
+ return METADATA_SUBVOL_ID;
+}
+
+int print_volume_simple(struct super_block *sb, struct reiser4_vol_op_args *args)
+{
+ reiser4_volume *vol = super_volume(sb);
+
+ args->u.vol.nr_bricks = 1;
+ memcpy(args->u.vol.id, vol->uuid, 16);
+ args->u.vol.vpid = vol->vol_plug->h.id;
+ args->u.vol.dpid = vol->dist_plug->h.id;
+ args->u.vol.stripe_bits = vol->stripe_bits;
+ args->u.vol.fs_flags = get_super_private(sb)->fs_flags;
+ args->u.vol.nr_mslots = vol->conf->nr_mslots;
+ args->u.vol.nr_volinfo_blocks = 0;
+ return 0;
+}
+
+int print_brick_simple(struct super_block *sb, struct reiser4_vol_op_args *args)
+{
+ reiser4_subvol *subv;
+ reiser4_volume *vol = super_volume(sb);
+
+ spin_lock_reiser4_super(get_super_private(sb));
+
+ subv = vol->conf->mslots[0][0];
+ strncpy(args->d.name, subv->name, REISER4_PATH_NAME_MAX + 1);
+ memcpy(args->u.brick.ext_id, subv->uuid, 16);
+ args->u.brick.int_id = subv->id;
+ args->u.brick.nr_replicas = subv->num_replicas;
+ args->u.brick.subv_flags = subv->flags;
+ args->u.brick.block_count = subv->block_count;
+ args->u.brick.data_capacity = subv->data_capacity;
+ args->u.brick.blocks_used = subv->blocks_used;
+ args->u.brick.system_blocks = subv->min_blocks_used;
+ args->u.brick.volinfo_addr = 0;
+
+ spin_unlock_reiser4_super(get_super_private(sb));
+ return 0;
+}
+
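+/**
+ * Return internal ID of the brick which the block pointed out by
+ * @coord belongs to. Node pointers and extent40 items always refer
+ * to the meta-data brick; for extent41 items the brick ID is stored
+ * in the ordering component of the item key
+ */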
+u64 find_brick_asym(const coord_t *coord)
+{
+ reiser4_key key;
+ assert("edward-1957", coord != NULL);
+
+ switch(item_id_by_coord(coord)) {
+ case NODE_POINTER_ID:
+ case EXTENT40_POINTER_ID:
+ return METADATA_SUBVOL_ID;
+ case EXTENT41_POINTER_ID:
+ return get_key_ordering(item_key_by_coord(coord, &key));
+ default:
+ impossible("edward-2018", "Bad item ID");
+ return METADATA_SUBVOL_ID;
+ }
+}
+
+/**
+ * Convert ordered number @idx of brick in the logical volume
+ * to its internal id
+ */
+static u32 brick_idx_to_id(reiser4_volume *vol, u32 idx)
+{
+ u32 i, j;
+ /*
+	 * return the index of the idx-th non-empty slot
+ */
+ for (i = 0, j = 0; i < vol->conf->nr_mslots; i++) {
+ if (vol->conf->mslots[i]) {
+ if (j == idx)
+ return i;
+ else
+				j++;
+ }
+ }
+ BUG_ON(1);
+}
+
+int print_volume_asym(struct super_block *sb, struct reiser4_vol_op_args *args)
+{
+ reiser4_volume *vol = super_volume(sb);
+ lv_conf *conf = vol->conf;
+ reiser4_volinfo *vinfo = &vol->volinfo[CUR_VOL_CONF];
+
+ args->u.vol.nr_bricks = vol_nr_origins(vol);
+ args->u.vol.bricks_in_dsa = num_dsa_subvols(vol);
+ memcpy(args->u.vol.id, vol->uuid, 16);
+ args->u.vol.vpid = vol->vol_plug->h.id;
+ args->u.vol.dpid = vol->dist_plug->h.id;
+ args->u.vol.stripe_bits = vol->stripe_bits;
+ args->u.vol.nr_sgs_bits = vol->num_sgs_bits;
+ args->u.vol.fs_flags = get_super_private(sb)->fs_flags;
+ args->u.vol.nr_mslots = conf->nr_mslots;
+ args->u.vol.nr_volinfo_blocks = vinfo->num_volmaps + vinfo->num_voltabs;
+ return 0;
+}
+
+int print_brick_asym(struct super_block *sb, struct reiser4_vol_op_args *args)
+{
+ int ret = 0;
+ u32 id; /* internal ID */
+ u64 brick_idx; /* ordered number of the brick in the logical volume */
+
+ reiser4_volume *vol = super_volume(sb);
+ lv_conf *conf = vol->conf;
+ reiser4_subvol *subv;
+
+ spin_lock_reiser4_super(get_super_private(sb));
+
+ brick_idx = args->s.brick_idx;
+ if (brick_idx >= vol_nr_origins(vol)) {
+ ret = -EINVAL;
+ goto out;
+ }
+ id = brick_idx_to_id(vol, brick_idx);
+ assert("edward-2446", conf->mslots[id] != NULL);
+
+ subv = conf->mslots[id][0];
+ strncpy(args->d.name, subv->name, REISER4_PATH_NAME_MAX + 1);
+ memcpy(args->u.brick.ext_id, subv->uuid, 16);
+ args->u.brick.int_id = subv->id;
+ args->u.brick.nr_replicas = subv->num_replicas;
+ args->u.brick.subv_flags = subv->flags;
+ args->u.brick.block_count = subv->block_count;
+ args->u.brick.data_capacity = subv->data_capacity;
+ args->u.brick.blocks_used = subv->blocks_used;
+ args->u.brick.system_blocks = subv->min_blocks_used;
+ args->u.brick.volinfo_addr = subv->volmap_loc[CUR_VOL_CONF];
+ out:
+ spin_unlock_reiser4_super(get_super_private(sb));
+ return ret;
+}
+
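+/**
+ * Scale asymmetric logical volume: split the distribution table by
+ * the factor of 2^factor_bits, store the new system configuration
+ * and set unbalanced status on disk
+ */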
+static int scale_volume_asym(struct super_block *sb, unsigned factor_bits)
+{
+ int ret;
+ reiser4_volume *vol = super_volume(sb);
+ lv_conf *old_conf = vol->conf;
+ distribution_plugin *dist_plug = vol->dist_plug;
+
+ ret = dist_plug->v.init(&vol->conf->tab, num_dsa_subvols(vol),
+ vol->num_sgs_bits, &vol->dcx);
+ if (ret)
+ return ret;
+ ret = dist_plug->v.spl(&vol->dcx, vol->conf->tab, factor_bits);
+ dist_plug->v.done(&vol->dcx);
+ if (ret)
+ return ret;
+ vol->num_sgs_bits += factor_bits;
+
+ vol->new_conf = clone_lv_conf(vol->conf);
+ if (vol->new_conf == NULL) {
+ ret = RETERR(-ENOMEM);
+ goto error;
+ }
+ ret = make_volume_dconf(vol);
+ if (ret)
+ goto error;
+ ret = update_volume_dconf(vol);
+ if (ret)
+ goto error;
+
+ dist_plug->r.replace(&vol->dcx, &vol->new_conf->tab);
+
+ reiser4_volume_set_unbalanced(sb);
+ ret = capture_brick_super(get_meta_subvol());
+ if (ret)
+ goto error;
+ /*
+ * write unbalanced status to disk
+ */
+ ret = force_commit_current_atom();
+ if (ret)
+ goto error;
+ /*
+ * Now publish the new config
+ */
+ rcu_assign_pointer(vol->conf, vol->new_conf);
+ synchronize_rcu();
+ free_lv_conf(old_conf);
+ vol->new_conf = NULL;
+ return 0;
+ error:
+ /*
+ * the scale operation should be repeated in regular context
+ */
+ vol->num_sgs_bits -= factor_bits;
+ free_lv_conf(vol->new_conf);
+ vol->new_conf = NULL;
+ return ret;
+}
+
+struct reiser4_iterate_context {
+ reiser4_key curr;
+ reiser4_key next;
+};
+
+/**
+ * Check if @coord is an extent item.
+ * If yes, then store its key as "current" in the context
+ * and return 0 to terminate iteration
+ */
+static int iter_find_start(reiser4_tree *tree, coord_t *coord,
+ lock_handle *lh, void *arg)
+{
+ int ret;
+ struct reiser4_iterate_context *ictx = arg;
+
+ assert("edward-2121", ictx != NULL);
+
+ ret = zload(coord->node);
+ if (ret)
+ return ret;
+
+ if (!item_is_extent(coord)) {
+ assert("edward-1878", item_is_internal(coord));
+ /* continue iteration */
+ zrelse(coord->node);
+ return 1;
+ }
+ item_key_by_coord(coord, &ictx->curr);
+ zrelse(coord->node);
+ return 0;
+}
+
+/**
+ * Check if @coord is an extent item of a file which doesn't
+ * own the "current" key in the iteration context. If so, then
+ * store its key as "next" in the context and return 0 to
+ * terminate the iteration.
+ */
+static int iter_find_next(reiser4_tree *tree, coord_t *coord,
+ lock_handle *lh, void *arg)
+{
+ int ret;
+ struct reiser4_iterate_context *ictx = arg;
+
+ assert("edward-1879", ictx != NULL);
+
+ ret = zload(coord->node);
+ if (ret)
+ return ret;
+ if (!item_is_extent(coord)) {
+ assert("edward-1880", item_is_internal(coord));
+ /* continue iteration */
+ zrelse(coord->node);
+ return 1;
+ }
+ item_key_by_coord(coord, &ictx->next);
+ zrelse(coord->node);
+
+ if (get_key_objectid(&ictx->next) ==
+ get_key_objectid(&ictx->curr))
+ /*
+ * found chunk of body of same file,
+ * continue iteration
+ */
+ return 1;
+ return 0;
+}
+
+/**
+ * Migrate all data blocks of a regular file in an asymmetric logical volume
+ */
+static int migrate_file_asym(struct inode *inode, u64 dst_idx)
+{
+ reiser4_volume *vol = super_volume(inode->i_sb);
+ u64 dst_id;
+
+ if (inode_file_plugin(inode)->migrate == NULL)
+ return 0;
+ if (dst_idx >= vol_nr_origins(vol))
+ return RETERR(-EINVAL);
+
+ dst_id = brick_idx_to_id(vol, dst_idx);
+ return inode_file_plugin(inode)->migrate(inode, &dst_id);
+}
+
+static inline int file_is_migratable(struct inode *inode,
+ struct super_block *sb, u32 flags)
+{
+ if (inode_file_plugin(inode)->migrate == NULL)
+ return 0;
+ if (flags & VBF_MIGRATE_ALL)
+ return 1;
+ return !reiser4_inode_get_flag(inode, REISER4_FILE_IMMOBILE);
+}
+
+int inode_clr_immobile(struct inode *inode);
+
+/**
+ * Balance an asymmetric logical volume. See description of the method
+ * in plugin.h
+ *
+ * @super: super-block of the volume to be balanced;
+ *
+ * Implementation details:
+ *
+ * Walk from left to right along the twig level of the storage tree
+ * and migrate the data blocks of every regular file found.
+ *
+ * Stat-data (on-disk inodes) are located at the leaf level; nevertheless
+ * we scan the twig level, recovering stat-data keys from extent items,
+ * simply because scanning the twig level is ~1000 times faster (thanks
+ * to Hans, who insisted on EOTTL at the time).
+ *
+ * When scanning the twig level we obviously miss empty files (i.e. files
+ * without bodies). This doesn't lead to any problems, as there is nothing
+ * to migrate for those files.
+ *
+ * FIXME: use hint/seal to not traverse tree every time when searching
+ * for a position by "current" key of the iteration context.
+ */
+int balance_volume_asym(struct super_block *super, u32 flags)
+{
+ int ret;
+ coord_t coord;
+ lock_handle lh;
+ reiser4_key start_key;
+ struct reiser4_iterate_context ictx;
+ time64_t start;
+ /*
+ * Set a start key (key of the leftmost object on the
+ * TWIG level) to scan from.
+ * FIXME: This is a hack. Implement find_start_key() to
+ * find the leftmost object on the TWIG level instead.
+ */
+ reiser4_key_init(&start_key);
+ set_key_locality(&start_key, 41 /* FORMAT40_ROOT_LOCALITY */);
+ set_key_type(&start_key, KEY_SD_MINOR);
+ set_key_objectid(&start_key, 42 /* FORMAT40_ROOT_OBJECTID */);
+
+ memset(&ictx, 0, sizeof(ictx));
+
+ assert("edward-1881", super != NULL);
+
+ printk("reiser4 (%s): Started balancing...\n", super->s_id);
+ start = ktime_get_seconds();
+
+ init_lh(&lh);
+ /*
+ * Prepare start position: find leftmost item on the twig level.
+ * For meta-data brick of format40 such item always exists, even
+ * in the case of empty volume
+ */
+ ret = coord_by_key(meta_subvol_tree(), &start_key,
+ &coord, &lh, ZNODE_READ_LOCK,
+ FIND_EXACT, TWIG_LEVEL, TWIG_LEVEL,
+ CBK_UNIQUE, NULL /* read-ahead info */);
+ if (IS_CBKERR(ret)) {
+ warning("edward-2154", "cbk error when balancing (%d)", ret);
+ done_lh(&lh);
+ goto error;
+ }
+ assert("edward-2160", coord.node->level == TWIG_LEVEL);
+
+ coord.item_pos = 0;
+ coord.unit_pos = 0;
+ coord.between = AT_UNIT;
+ /*
+ * find leftmost extent on the twig level
+ */
+ ret = reiser4_iterate_tree(meta_subvol_tree(), &coord, &lh,
+ iter_find_start, &ictx, ZNODE_READ_LOCK, 0);
+ done_lh(&lh);
+ if (ret < 0) {
+ if (ret == -ENOENT || ret == -E_NO_NEIGHBOR)
+ /* volume doesn't contain data blocks */
+ goto done;
+ goto error;
+ }
+ while (1) {
+ int terminate = 0;
+ reiser4_key found;
+ reiser4_key sdkey;
+ struct inode *inode;
+ /*
+ * look for an object found in previous iteration
+ */
+ ret = coord_by_key(meta_subvol_tree(), &ictx.curr,
+ &coord, &lh, ZNODE_READ_LOCK,
+ FIND_EXACT,
+ TWIG_LEVEL, TWIG_LEVEL,
+ CBK_UNIQUE, NULL /* read-ahead info */);
+ if (IS_CBKERR(ret)) {
+ done_lh(&lh);
+ warning("edward-1886",
+ "cbk error when balancing (%d)", ret);
+ goto error;
+ }
+ ret = zload(coord.node);
+ if (ret) {
+ done_lh(&lh);
+ goto error;
+ }
+ if (!coord_is_existing_item(&coord) ||
+ !keyeq(item_key_by_coord(&coord, &found), &ictx.curr)) {
+
+ zrelse(coord.node);
+ /*
+			 * the object found at the previous iteration is
+			 * absent (truncated by a concurrent process), thus
+			 * the current position is an item with key <=
+			 * @ictx.curr, i.e. an object which was already
+			 * processed, so we just need to find the next
+			 * extent, reset @ictx.curr and proceed
+ */
+ ret = reiser4_iterate_tree(meta_subvol_tree(),
+ &coord, &lh,
+ iter_find_start, &ictx,
+ ZNODE_READ_LOCK, 0);
+ if (ret < 0) {
+ done_lh(&lh);
+ if (ret == -ENOENT || ret == -E_NO_NEIGHBOR)
+ break;
+ goto error;
+ }
+ } else
+ zrelse(coord.node);
+ /*
+ * find leftmost extent of the next file and store
+ * its key as "next" in the iteration context as a
+ * hint for next iteration
+ */
+ assert("edward-1887",
+ WITH_DATA(coord.node, coord_is_existing_item(&coord)));
+
+ ret = reiser4_iterate_tree(meta_subvol_tree(), &coord,
+ &lh, iter_find_next,
+ &ictx, ZNODE_READ_LOCK, 0);
+ done_lh(&lh);
+
+ if (ret == -ENOENT || ret == -E_NO_NEIGHBOR)
+ /*
+ * next extent not found
+ */
+ terminate = 1;
+ else if (ret < 0)
+ goto error;
+ /*
+		 * construct a stat-data key from the "current" key of the
+		 * iteration context and read the inode.
+		 * We don't know the actual ordering component of the
+		 * stat-data key, so we set the maximal one to make sure the
+		 * search procedure finds it correctly. Also, the stat-data
+		 * we are looking for can be killed by a concurrent unlink();
+		 * in this case a "bad inode" is created.
+ */
+ sdkey = ictx.curr;
+ set_key_ordering(&sdkey, KEY_ORDERING_MASK /* max ordering */);
+ set_key_type(&sdkey, KEY_SD_MINOR);
+ set_key_offset(&sdkey, 0);
+
+ inode = reiser4_iget(super, &sdkey, FIND_MAX_NOT_MORE_THAN, 0);
+ if (IS_ERR(inode))
+ /*
+ * file was removed
+ */
+ goto next;
+ if (file_is_migratable(inode, super, flags)) {
+ reiser4_iget_complete(inode);
+ /*
+ * migrate data blocks of this file
+ */
+ ret = inode_file_plugin(inode)->migrate(inode, NULL);
+ if (ret) {
+ iput(inode);
+ warning("edward-1889",
+ "Inode %lli: data migration failed (%d)",
+ (unsigned long long)get_inode_oid(inode),
+ ret);
+ goto error;
+ }
+ if (flags & VBF_CLR_IMMOBILE) {
+ ret = inode_clr_immobile(inode);
+ if (ret)
+ warning("edward-2472",
+ "Inode %lli: failed to clear immobile status(%d)",
+ (unsigned long long)get_inode_oid(inode),
+ ret);
+ }
+ }
+ iput(inode);
+ next:
+ if (terminate)
+ break;
+ ictx.curr = ictx.next;
+ }
+ done:
+ printk("reiser4 (%s): Balancing completed in %lld seconds.\n",
+ super->s_id, ktime_get_seconds() - start);
+ return 0;
+ error:
+ warning("edward-2155", "%s: Balancing aborted (%d).", super->s_id, ret);
+ return ret == -E_DEADLOCK ? -EAGAIN : ret;
+}
+
+volume_plugin volume_plugins[LAST_VOLUME_ID] = {
+ [SIMPLE_VOLUME_ID] = {
+ .h = {
+ .type_id = REISER4_VOLUME_PLUGIN_TYPE,
+ .id = SIMPLE_VOLUME_ID,
+ .pops = NULL,
+ .label = "simple",
+ .desc = "Simple Logical Volume",
+ .linkage = {NULL, NULL}
+ },
+ .meta_subvol_id = meta_subvol_id_simple,
+ .calc_brick = calc_brick_simple,
+ .find_brick = find_brick_simple,
+ .load_volume = NULL,
+ .done_volume = NULL,
+ .init_volume = init_volume_simple,
+ .resize_brick = resize_brick_simple,
+ .add_brick = add_brick_simple,
+ .remove_brick = remove_brick_simple,
+ .remove_brick_tail = NULL,
+ .print_brick = print_brick_simple,
+ .print_volume = print_volume_simple,
+ .balance_volume = balance_volume_simple,
+ .bucket_ops = {
+ .cap_at = NULL,
+ .apx_of = NULL,
+ .apx_at = NULL,
+ .apx_set_at = NULL,
+ .apx_lenp_at = NULL
+ }
+ },
+ [ASYM_VOLUME_ID] = {
+ .h = {
+ .type_id = REISER4_VOLUME_PLUGIN_TYPE,
+ .id = ASYM_VOLUME_ID,
+ .pops = NULL,
+ .label = "asym",
+ .desc = "Asymmetric Heterogeneous Logical Volume",
+ .linkage = {NULL, NULL}
+ },
+ .meta_subvol_id = meta_subvol_id_simple,
+ .calc_brick = calc_brick_asym,
+ .find_brick = find_brick_asym,
+ .load_volume = load_volume_asym,
+ .done_volume = done_volume_asym,
+ .init_volume = init_volume_asym,
+ .resize_brick = resize_brick_asym,
+ .add_brick = add_brick_asym,
+ .remove_brick = remove_brick_asym,
+ .remove_brick_tail = remove_brick_tail_asym,
+ .print_brick = print_brick_asym,
+ .print_volume = print_volume_asym,
+ .scale_volume = scale_volume_asym,
+ .migrate_file = migrate_file_asym,
+ .balance_volume = balance_volume_asym,
+ .bucket_ops = {
+ .cap_at = cap_at_asym,
+ .apx_of = apx_of_asym,
+ .apx_at = apx_at_asym,
+ .apx_set_at = apx_set_at_asym,
+ .apx_lenp_at = apx_lenp_at_asym,
+ .idx2id = idx2id,
+ .id2idx = id2idx,
+ .create_buckets = create_buckets,
+ .free_buckets = free_buckets,
+ .insert_bucket = insert_bucket,
+ .remove_bucket = remove_bucket,
+ .space_occupied = space_occupied
+ }
+ }
+};
+
+/*
+ Local variables:
+ c-indentation-style: "K&R"
+ mode-name: "LC"
+ c-basic-offset: 8
+ tab-width: 8
+ fill-column: 80
+ scroll-step: 1
+ End:
+*/
diff -urN --no-dereference linux-5.10.2.orig/fs/reiser4/plugin/volume/volume.h linux-5.10.2/fs/reiser4/plugin/volume/volume.h
--- linux-5.10.2.orig/fs/reiser4/plugin/volume/volume.h 1970-01-01 01:00:00.000000000 +0100
+++ linux-5.10.2/fs/reiser4/plugin/volume/volume.h 2020-12-23 16:07:46.133813348 +0100
@@ -0,0 +1,147 @@
+/*
+ Copyright (c) 2014-2020 Eduard O. Shishkin
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+#ifndef VOLUME_H
+#define VOLUME_H
+
+#define INVALID_SUBVOL_ID (0xffffffff)
+#define METADATA_SUBVOL_ID (0)
+
+extern void deactivate_subvol(struct super_block *super, reiser4_subvol *subv);
+extern reiser4_subvol *find_meta_brick_by_id(reiser4_volume *vol);
+extern lv_conf *alloc_lv_conf(u32 nr_slots);
+extern void free_lv_conf(lv_conf *conf);
+extern void release_volinfo_nodes(reiser4_volinfo *vinfo, int dealloc);
+extern slot_t alloc_mslot(u32 nr_mirrors);
+extern void free_mslot(slot_t slot);
+extern void free_mslot_at(lv_conf *conf, u64 idx);
+extern int brick_belongs_volume(reiser4_volume *vol, reiser4_subvol *subv);
+extern int remove_brick_tail_asym(reiser4_volume *vol, reiser4_subvol *subv);
+extern reiser4_block_nr estimate_migration_iter(void);
+
+static inline int is_meta_brick_id(u64 id)
+{
+ return id == METADATA_SUBVOL_ID;
+}
+
+static inline int is_meta_brick(reiser4_subvol *this)
+{
+ assert("edward-2189", subvol_is_set(this, SUBVOL_ACTIVATED));
+ assert("edward-2071", ergo(is_meta_brick_id(this->id),
+ this == get_meta_subvol()));
+ return is_meta_brick_id(this->id);
+}
+
+static inline u64 get_pos_in_dsa(reiser4_subvol *subv)
+{
+ return subv->dsa_idx;
+}
+
+static inline int is_proxy_brick(reiser4_subvol *subv)
+{
+ return subvol_is_set(subv, SUBVOL_IS_PROXY);
+}
+
+/*
+ * Return true if @subv participates in regular data distribution
+ */
+static inline int is_dsa_brick(reiser4_subvol *subv)
+{
+ assert("edward-2475",
+ ergo(is_proxy_brick(subv),
+ !subvol_is_set(subv, SUBVOL_HAS_DATA_ROOM)));
+ assert("edward-2476",
+ ergo(subvol_is_set(subv, SUBVOL_HAS_DATA_ROOM),
+ !is_proxy_brick(subv)));
+
+ return subvol_is_set(subv, SUBVOL_HAS_DATA_ROOM);
+}
+
+/*
+ * Return true if the meta-data subvolume participates in regular
+ * data distribution; otherwise return false
+ */
+static inline int meta_brick_belongs_dsa(void)
+{
+ return is_dsa_brick(get_meta_subvol());
+}
+
+/**
+ * Without scanning all bricks in the volume, calculate and return the
+ * number of bricks participating in regular data distribution
+ *
+ * Possible cases:
+ *
+ * 1 xxxxxxxxxx nr_origins
+ *
+ * 2 oxxxxxxxxx nr_origins - 1
+ *
+ * 3 xxxxxxxxxx nr_origins - 1
+ * ^
+ * 4 oxxxxxxxxx nr_origins - 1
+ * ^
+ * 5 xxxxxxxxxx nr_origins - 1
+ * ^
+ * 6 oxxxxxxxxx nr_origins - 2
+ * ^
+ *
+ * Legend:
+ *
+ * o: meta-brick w/o data room
+ * x: data brick, or meta-brick w/ data room
+ * ^: proxy brick
+ */
+static inline u64 num_dsa_subvols(reiser4_volume *vol)
+{
+ if (!reiser4_is_set(reiser4_get_current_sb(), REISER4_PROXY_ENABLED))
+ /* 1, 2 */
+ return meta_brick_belongs_dsa() ?
+ vol_nr_origins(vol) : vol_nr_origins(vol) - 1;
+
+ if (subvol_is_set(get_meta_subvol(), SUBVOL_IS_PROXY))
+ /* 3, 4 */
+ return vol_nr_origins(vol) - 1;
+ /* 5, 6 */
+ return meta_brick_belongs_dsa() ?
+ vol_nr_origins(vol) - 1 : vol_nr_origins(vol) - 2;
+}
+
+static inline reiser4_subvol *subvol_by_key(const reiser4_key *key)
+{
+ return current_origin(get_key_ordering(key));
+}
+
+static inline int reserve_migration_iter(void)
+{
+ grab_space_enable();
+ return reiser4_grab_reserved(reiser4_get_current_sb(),
+ estimate_migration_iter(),
+ BA_CAN_COMMIT,
+ get_meta_subvol());
+}
+
+static inline reiser4_subvol *get_proxy_subvol(void)
+{
+ assert("edward-2441", current_volume()->proxy != NULL);
+
+ return current_volume()->proxy;
+}
+
+/*
+ Local variables:
+ c-indentation-style: "K&R"
+ mode-name: "LC"
+ c-basic-offset: 8
+ tab-width: 8
+ fill-column: 80
+ scroll-step: 1
+ End:
+*/
+
+#endif /* VOLUME_H */
diff -urN --no-dereference linux-5.10.2.orig/fs/reiser4/pool.c linux-5.10.2/fs/reiser4/pool.c
--- linux-5.10.2.orig/fs/reiser4/pool.c 1970-01-01 01:00:00.000000000 +0100
+++ linux-5.10.2/fs/reiser4/pool.c 2020-12-23 16:07:46.133813348 +0100
@@ -0,0 +1,231 @@
+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
+ * reiser4/README */
+
+/* Fast pool allocation.
+
+   There are situations when some sub-system normally asks the memory
+   allocator for only a few objects, but under some circumstances could
+   require many more. A typical and actually motivating example is tree
+   balancing. It needs to keep track of the nodes that were involved in
+   it, and it is well known that in a reasonably packed balanced tree
+   most (92.938121%) balancings end up after working with only a few
+   nodes (3.141592 on average). But in rare cases balancing can involve
+   many more nodes (3*tree_height+1 in the extremal situation).
+
+   On the one hand, we don't want to resort to dynamic allocation (slab,
+   malloc(), etc.) to allocate the data structures required to keep track
+   of nodes during balancing. On the other hand, we cannot statically
+   allocate the required amount of space on the stack, because first: it
+   is a useless waste of a precious resource, and second: this amount is
+   unknown in advance (tree height can change).
+
+   Pools, implemented in this file, are a solution to this problem:
+
+   - some configurable amount of objects is statically preallocated on
+   the stack
+
+   - if this preallocated pool is exhausted and more objects are
+   requested, they are allocated dynamically.
+
+   Pools encapsulate the distinction between statically and dynamically
+   allocated objects. Both allocation and recycling look exactly the same.
+
+   To keep track of dynamically allocated objects, the pool adds its own
+   linkage to each object.
+
+ NOTE-NIKITA This linkage also contains some balancing-specific data. This
+ is not perfect. On the other hand, balancing is currently the only client
+ of pool code.
+
+ NOTE-NIKITA Another desirable feature is to rewrite all pool manipulation
+ functions in the style of tslist/tshash, i.e., make them unreadable, but
+ type-safe.
+
+*/
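+
+/* A minimal usage sketch (illustrative only: the names my_obj, level and
+   area below are hypothetical; carry.c is the real client of this API).
+   The reiser4_pool_header must be the first member of the pooled object:
+
+	struct my_obj {
+		struct reiser4_pool_header header;
+		int payload;
+	};
+
+	struct reiser4_pool pool;
+	struct list_head level;
+	char area[4 * sizeof(struct my_obj)];
+	struct reiser4_pool_header *h;
+
+	INIT_LIST_HEAD(&level);
+	reiser4_init_pool(&pool, sizeof(struct my_obj), 4, area);
+	h = reiser4_add_obj(&pool, &level, POOLO_LAST, NULL);
+	if (!IS_ERR(h))
+		reiser4_pool_free(&pool, h);
+	reiser4_done_pool(&pool);
+*/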
+
+#include "debug.h"
+#include "pool.h"
+#include "super.h"
+
+#include <linux/types.h>
+#include <linux/err.h>
+
+/* initialize new pool object @h */
+static void reiser4_init_pool_obj(struct reiser4_pool_header *h)
+{
+ INIT_LIST_HEAD(&h->usage_linkage);
+ INIT_LIST_HEAD(&h->level_linkage);
+ INIT_LIST_HEAD(&h->extra_linkage);
+}
+
+/* initialize new pool */
+void reiser4_init_pool(struct reiser4_pool *pool /* pool to initialize */ ,
+ size_t obj_size /* size of objects in @pool */ ,
+ int num_of_objs /* number of preallocated objects */ ,
+ char *data/* area for preallocated objects */)
+{
+ struct reiser4_pool_header *h;
+ int i;
+
+ assert("nikita-955", pool != NULL);
+ assert("nikita-1044", obj_size > 0);
+ assert("nikita-956", num_of_objs >= 0);
+ assert("nikita-957", data != NULL);
+
+ memset(pool, 0, sizeof *pool);
+ pool->obj_size = obj_size;
+ pool->data = data;
+ INIT_LIST_HEAD(&pool->free);
+ INIT_LIST_HEAD(&pool->used);
+ INIT_LIST_HEAD(&pool->extra);
+ memset(data, 0, obj_size * num_of_objs);
+ for (i = 0; i < num_of_objs; ++i) {
+ h = (struct reiser4_pool_header *) (data + i * obj_size);
+ reiser4_init_pool_obj(h);
+ /* add pool header to the end of pool's free list */
+ list_add_tail(&h->usage_linkage, &pool->free);
+ }
+}
+
+/* release pool resources
+
+ Release all resources acquired by this pool, specifically, dynamically
+ allocated objects.
+
+*/
+void reiser4_done_pool(struct reiser4_pool *pool UNUSED_ARG)
+{
+}
+
+/* allocate carry object from @pool
+
+ First, try to get preallocated object. If this fails, resort to dynamic
+ allocation.
+
+*/
+static void *reiser4_pool_alloc(struct reiser4_pool *pool)
+{
+ struct reiser4_pool_header *result;
+
+ assert("nikita-959", pool != NULL);
+
+ if (!list_empty(&pool->free)) {
+ struct list_head *linkage;
+
+ linkage = pool->free.next;
+ list_del(linkage);
+ INIT_LIST_HEAD(linkage);
+ result = list_entry(linkage, struct reiser4_pool_header,
+ usage_linkage);
+ BUG_ON(!list_empty(&result->level_linkage) ||
+ !list_empty(&result->extra_linkage));
+ } else {
+		/* pool is empty. Extra allocations don't deserve a dedicated
+		   slab to be served from, as they are expected to be rare. */
+ result = kmalloc(pool->obj_size, reiser4_ctx_gfp_mask_get());
+ if (result != 0) {
+ reiser4_init_pool_obj(result);
+ list_add(&result->extra_linkage, &pool->extra);
+ } else
+ return ERR_PTR(RETERR(-ENOMEM));
+ BUG_ON(!list_empty(&result->usage_linkage) ||
+ !list_empty(&result->level_linkage));
+ }
+ ++pool->objs;
+ list_add(&result->usage_linkage, &pool->used);
+ memset(result + 1, 0, pool->obj_size - sizeof *result);
+ return result;
+}
+
+/* return object back to the pool */
+void reiser4_pool_free(struct reiser4_pool *pool,
+ struct reiser4_pool_header *h)
+{
+ assert("nikita-961", h != NULL);
+ assert("nikita-962", pool != NULL);
+
+ --pool->objs;
+ assert("nikita-963", pool->objs >= 0);
+
+ list_del_init(&h->usage_linkage);
+ list_del_init(&h->level_linkage);
+
+ if (list_empty(&h->extra_linkage))
+ /*
+ * pool header is not an extra one. Push it onto free list
+ * using usage_linkage
+ */
+ list_add(&h->usage_linkage, &pool->free);
+ else {
+ /* remove pool header from pool's extra list and kfree it */
+ list_del(&h->extra_linkage);
+ kfree(h);
+ }
+}
+
+/* add new object to the carry level list
+
+   The carry level is FIFO most of the time, but not always. Complications
+   arise when the make_space() function tries to go to the left neighbor and
+   thus adds a carry node before existing nodes, and also when, updating
+   delimiting keys after moving data between two nodes, we want the left node
+   to be locked before the right node.
+
+   The latter case is confusing at first glance. The problem is that the
+   COP_UPDATE operation that updates delimiting keys is sometimes called with
+   two nodes (when data are moved between two nodes) and sometimes with only
+   one node (when the leftmost item is deleted in a node). In any case the
+   operation is supplied with at least the node whose left delimiting key is
+   to be updated (that is, the "right" node).
+
+ @pool - from which to allocate new object;
+ @list - where to add object;
+ @reference - after (or before) which existing object to add
+*/
+struct reiser4_pool_header *reiser4_add_obj(struct reiser4_pool *pool,
+ struct list_head *list,
+ pool_ordering order,
+ struct reiser4_pool_header *reference)
+{
+ struct reiser4_pool_header *result;
+
+ assert("nikita-972", pool != NULL);
+
+ result = reiser4_pool_alloc(pool);
+ if (IS_ERR(result))
+ return result;
+
+ assert("nikita-973", result != NULL);
+
+ switch (order) {
+ case POOLO_BEFORE:
+ __list_add(&result->level_linkage,
+ reference->level_linkage.prev,
+ &reference->level_linkage);
+ break;
+ case POOLO_AFTER:
+ __list_add(&result->level_linkage,
+ &reference->level_linkage,
+ reference->level_linkage.next);
+ break;
+ case POOLO_LAST:
+ list_add_tail(&result->level_linkage, list);
+ break;
+ case POOLO_FIRST:
+ list_add(&result->level_linkage, list);
+ break;
+ default:
+ wrong_return_value("nikita-927", "order");
+ }
+ return result;
+}
+
+/* Make Linus happy.
+ Local variables:
+ c-indentation-style: "K&R"
+ mode-name: "LC"
+ c-basic-offset: 8
+ tab-width: 8
+ fill-column: 120
+ End:
+*/
diff -urN --no-dereference linux-5.10.2.orig/fs/reiser4/pool.h linux-5.10.2/fs/reiser4/pool.h
--- linux-5.10.2.orig/fs/reiser4/pool.h 1970-01-01 01:00:00.000000000 +0100
+++ linux-5.10.2/fs/reiser4/pool.h 2020-12-23 16:07:46.133813348 +0100
@@ -0,0 +1,57 @@
+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
+ * reiser4/README */
+
+/* Fast pool allocation */
+
+#ifndef __REISER4_POOL_H__
+#define __REISER4_POOL_H__
+
+#include <linux/types.h>
+
+struct reiser4_pool {
+ size_t obj_size;
+ int objs;
+ char *data;
+ struct list_head free;
+ struct list_head used;
+ struct list_head extra;
+};
+
+struct reiser4_pool_header {
+ /* object is either on free or "used" lists */
+ struct list_head usage_linkage;
+ struct list_head level_linkage;
+ struct list_head extra_linkage;
+};
+
+typedef enum {
+ POOLO_BEFORE,
+ POOLO_AFTER,
+ POOLO_LAST,
+ POOLO_FIRST
+} pool_ordering;
+
+/* pool manipulation functions */
+
+extern void reiser4_init_pool(struct reiser4_pool *pool, size_t obj_size,
+ int num_of_objs, char *data);
+extern void reiser4_done_pool(struct reiser4_pool *pool);
+extern void reiser4_pool_free(struct reiser4_pool *pool,
+ struct reiser4_pool_header *h);
+struct reiser4_pool_header *reiser4_add_obj(struct reiser4_pool *pool,
+ struct list_head *list,
+ pool_ordering order,
+ struct reiser4_pool_header *reference);
+
+/* __REISER4_POOL_H__ */
+#endif
+
+/* Make Linus happy.
+ Local variables:
+ c-indentation-style: "K&R"
+ mode-name: "LC"
+ c-basic-offset: 8
+ tab-width: 8
+ fill-column: 120
+ End:
+*/
diff -urN --no-dereference linux-5.10.2.orig/fs/reiser4/readahead.c linux-5.10.2/fs/reiser4/readahead.c
--- linux-5.10.2.orig/fs/reiser4/readahead.c 1970-01-01 01:00:00.000000000 +0100
+++ linux-5.10.2/fs/reiser4/readahead.c 2020-12-23 16:07:46.133813348 +0100
@@ -0,0 +1,140 @@
+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
+ * reiser4/README */
+
+#include "forward.h"
+#include "tree.h"
+#include "tree_walk.h"
+#include "super.h"
+#include "inode.h"
+#include "key.h"
+#include "znode.h"
+
+#include <linux/swap.h> /* for totalram_pages */
+
+void reiser4_init_ra_info(ra_info_t *rai)
+{
+ rai->key_to_stop = *reiser4_min_key();
+}
+
+/* global formatted node readahead parameter. It can be set by mount option
+ * -o readahead:NUM:1 */
+static inline int ra_adjacent_only(int flags)
+{
+ return flags & RA_ADJACENT_ONLY;
+}
+
+/* This is used by formatted_readahead() to decide whether a read for the
+ * right neighbor of a node should be issued. It returns 1 if the right
+ * neighbor's first key is less than or equal to the readahead's stop key */
+static int should_readahead_neighbor(znode * node, ra_info_t *info)
+{
+ int result;
+
+ read_lock_dk(znode_get_tree(node));
+ result = keyle(znode_get_rd_key(node), &info->key_to_stop);
+ read_unlock_dk(znode_get_tree(node));
+ return result;
+}
+
+#define LOW_MEM_PERCENTAGE (5)
+
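+/* Return true when the number of free pages drops below LOW_MEM_PERCENTAGE
+ * of total RAM; used to skip read-ahead under memory pressure */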
+static int low_on_memory(void)
+{
+ unsigned int freepages;
+
+ freepages = nr_free_pages();
+ return freepages < (totalram_pages() * LOW_MEM_PERCENTAGE / 100);
+}
+
+/* start read for @node and for a few of its right neighbors */
+void formatted_readahead(znode * node, ra_info_t *info)
+{
+ struct formatted_ra_params *ra_params;
+ znode *cur;
+ int i;
+ int grn_flags;
+ lock_handle next_lh;
+
+ /* do nothing if node block number has not been assigned to node (which
+ * means it is still in cache). */
+ if (reiser4_blocknr_is_fake(znode_get_block(node)))
+ return;
+
+ ra_params = get_current_super_ra_params();
+
+ if (znode_page(node) == NULL)
+ jstartio(ZJNODE(node));
+
+ if (znode_get_level(node) != LEAF_LEVEL)
+ return;
+
+ /* don't waste memory for read-ahead when low on memory */
+ if (low_on_memory())
+ return;
+
+	/* We can have locked nodes on upper tree levels; in this situation lock
+	   priorities do not help to resolve deadlocks, so we have to use
+	   TRY_LOCK here. */
+ grn_flags = (GN_CAN_USE_UPPER_LEVELS | GN_TRY_LOCK);
+
+ i = 0;
+ cur = zref(node);
+ init_lh(&next_lh);
+ while (i < ra_params->max) {
+ const reiser4_block_nr * nextblk;
+
+ if (!should_readahead_neighbor(cur, info))
+ break;
+
+ if (reiser4_get_right_neighbor
+ (&next_lh, cur, ZNODE_READ_LOCK, grn_flags))
+ break;
+
+ nextblk = znode_get_block(next_lh.node);
+ if (reiser4_blocknr_is_fake(nextblk) ||
+ (ra_adjacent_only(ra_params->flags)
+ && *nextblk != *znode_get_block(cur) + 1))
+ break;
+
+ zput(cur);
+ cur = zref(next_lh.node);
+ done_lh(&next_lh);
+ if (znode_page(cur) == NULL)
+ jstartio(ZJNODE(cur));
+ else
+ /* Do not scan read-ahead window if pages already
+ * allocated (and i/o already started). */
+ break;
+
+ i++;
+ }
+ zput(cur);
+ done_lh(&next_lh);
+}
+
+void reiser4_readdir_readahead_init(struct inode *dir, tap_t *tap)
+{
+ reiser4_key *stop_key;
+
+ assert("nikita-3542", dir != NULL);
+ assert("nikita-3543", tap != NULL);
+
+ stop_key = &tap->ra_info.key_to_stop;
+ /* initialize readdir readahead information: include into readahead
+ * stat data of all files of the directory */
+ set_key_locality(stop_key, get_inode_oid(dir));
+ set_key_type(stop_key, KEY_SD_MINOR);
+ set_key_ordering(stop_key, get_key_ordering(reiser4_max_key()));
+ set_key_objectid(stop_key, get_key_objectid(reiser4_max_key()));
+ set_key_offset(stop_key, get_key_offset(reiser4_max_key()));
+}
+
+/*
+ Local variables:
+ c-indentation-style: "K&R"
+ mode-name: "LC"
+ c-basic-offset: 8
+ tab-width: 8
+ fill-column: 80
+ End:
+*/
diff -urN --no-dereference linux-5.10.2.orig/fs/reiser4/readahead.h linux-5.10.2/fs/reiser4/readahead.h
--- linux-5.10.2.orig/fs/reiser4/readahead.h 1970-01-01 01:00:00.000000000 +0100
+++ linux-5.10.2/fs/reiser4/readahead.h 2020-12-23 16:07:46.133813348 +0100
@@ -0,0 +1,42 @@
+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
+ * reiser4/README */
+
+#ifndef __READAHEAD_H__
+#define __READAHEAD_H__
+
+#include "key.h"
+
+typedef enum {
+ RA_ADJACENT_ONLY = 1, /* only requests nodes which are adjacent.
+ Default is NO (not only adjacent) */
+} ra_global_flags;
+
+/* reiser4 super block has a field of this type.
+ It controls readahead during tree traversals */
+struct formatted_ra_params {
+ unsigned long max; /* request not more than this amount of nodes.
+ Default is totalram_pages() / 4 */
+ int flags;
+};
+
+typedef struct {
+ reiser4_key key_to_stop;
+} ra_info_t;
+
+void formatted_readahead(znode * , ra_info_t *);
+void reiser4_init_ra_info(ra_info_t *rai);
+
+extern void reiser4_readdir_readahead_init(struct inode *dir, tap_t *tap);
+
+/* __READAHEAD_H__ */
+#endif
+
+/*
+ Local variables:
+ c-indentation-style: "K&R"
+ mode-name: "LC"
+ c-basic-offset: 8
+ tab-width: 8
+ fill-column: 120
+ End:
+*/
diff -urN --no-dereference linux-5.10.2.orig/fs/reiser4/README linux-5.10.2/fs/reiser4/README
--- linux-5.10.2.orig/fs/reiser4/README 1970-01-01 01:00:00.000000000 +0100
+++ linux-5.10.2/fs/reiser4/README 2020-12-23 16:07:46.133813348 +0100
@@ -0,0 +1,128 @@
+[LICENSING]
+
+Reiser4 is hereby licensed under the GNU General
+Public License version 2.
+
+Source code files that contain the phrase "licensing governed by
+reiser4/README" are "governed files" throughout this file. Governed
+files are licensed under the GPL. The portions of them owned by Hans
+Reiser, or authorized to be licensed by him, have been in the past,
+and likely will be in the future, licensed to other parties under
+other licenses. If you add your code to governed files, and don't
+want it to be owned by Hans Reiser, put your copyright label on that
+code so the poor blight and his customers can keep things straight.
+All portions of governed files not labeled otherwise are owned by Hans
+Reiser, and by adding your code to it, widely distributing it to
+others or sending us a patch, and leaving the sentence in stating that
+licensing is governed by the statement in this file, you accept this.
+It will be a kindness if you identify whether Hans Reiser is allowed
+to license code labeled as owned by you on your behalf other than
+under the GPL, because he wants to know if it is okay to do so and put
+a check in the mail to you (for non-trivial improvements) when he
+makes his next sale. He makes no guarantees as to the amount if any,
+though he feels motivated to motivate contributors, and you can surely
+discuss this with him before or after contributing. You have the
+right to decline to allow him to license your code contribution other
+than under the GPL.
+
+Further licensing options are available for commercial and/or other
+interests directly from Hans Reiser: reiser@namesys.com. If you interpret
+the GPL as not allowing those additional licensing options, you read
+it wrongly, and Richard Stallman agrees with me, when carefully read
+you can see that those restrictions on additional terms do not apply
+to the owner of the copyright, and my interpretation of this shall
+govern for this license.
+
+[END LICENSING]
+
+Reiser4 is a file system based on dancing tree algorithms, and is
+described at http://www.namesys.com
+
+mkfs.reiser4 and other utilities are on our webpage or wherever your
+Linux provider put them. You really want to be running the latest
+version off the website if you use fsck.
+
+Yes, if you update your reiser4 kernel module you do have to
+recompile your kernel, most of the time. The errors you get will be
+quite cryptic if you forget to do so.
+
+Hideous Commercial Pitch: Spread your development costs across other OS
+vendors. Select from the best in the world, not the best in your
+building, by buying from third party OS component suppliers. Leverage
+the software component development power of the internet. Be the most
+aggressive in taking advantage of the commercial possibilities of
+decentralized internet development, and add value through your branded
+integration that you sell as an operating system. Let your competitors
+be the ones to compete against the entire internet by themselves. Be
+hip, get with the new economic trend, before your competitors do. Send
+email to reiser@namesys.com
+
+Hans Reiser was the primary architect of Reiser4, but a whole team
+chipped their ideas in. He invested everything he had into Namesys
+for 5.5 dark years of no money before Reiser3 finally started to work well
+enough to bring in money. He owns the copyright.
+
+DARPA was the primary sponsor of Reiser4. DARPA does not endorse
+Reiser4, it merely sponsors it. DARPA is, in solely Hans's personal
+opinion, unique in its willingness to invest into things more
+theoretical than the VC community can readily understand, and more
+longterm than allows them to be sure that they will be the ones to
+extract the economic benefits from. DARPA also integrated us into a
+security community that transformed our security worldview.
+
+Vladimir Saveliev is our lead programmer, with us from the beginning,
+and he worked long hours writing the cleanest code. This is why he is
+now the lead programmer after years of commitment to our work. He
+always made the effort to be the best he could be, and to make his
+code the best that it could be. What resulted was quite remarkable. I
+don't think that money can ever motivate someone to work the way he
+did, he is one of the most selfless men I know.
+
+Alexander Lyamin was our sysadmin, and helped to educate us in
+security issues. Moscow State University and IMT were very generous
+in the internet access they provided us, and in lots of other little
+ways that a generous institution can be.
+
+Alexander Zarochentcev (sometimes known as zam, or sasha), wrote the
+locking code, the block allocator, and finished the flushing code.
+His code is always crystal clean and well structured.
+
+Nikita Danilov wrote the core of the balancing code, the core of the
+plugins code, and the directory code. He worked a steady pace of long
+hours that produced a whole lot of well abstracted code. He is our
+senior computer scientist.
+
+Vladimir Demidov wrote the parser. Writing an in kernel parser is
+something very few persons have the skills for, and it is thanks to
+him that we can say that the parser is really not so big compared to
+various bits of our other code, and making a parser work in the kernel
+was not so complicated as everyone would imagine mainly because it was
+him doing it...
+
+Joshua McDonald wrote the transaction manager, and the flush code.
+The flush code unexpectedly turned out to be extremely hairy for reasons
+you can read about on our web page, and he did a great job on an
+extremely difficult task.
+
+Nina Reiser handled our accounting, government relations, and much
+more.
+
+Ramon Reiser developed our website.
+
+Beverly Palmer drew our graphics.
+
+Vitaly Fertman developed librepair, userspace plugins repair code, fsck
+and worked with Umka on developing libreiser4 and userspace plugins.
+
+Yury Umanets (aka Umka) developed libreiser4, userspace plugins and
+userspace tools (reiser4progs).
+
+Oleg Drokin (aka Green) is the release manager who fixes everything.
+It is so nice to have someone like that on the team. He (plus Chris
+and Jeff) make it possible for the entire rest of the Namesys team to
+focus on Reiser4, and he fixed a whole lot of Reiser4 bugs also. It
+is just amazing to watch his talent for spotting bugs in action.
+
+Edward Shishkin wrote the cryptcompress file plugin (which manages files
+built of encrypted and/or compressed bodies) and other plugins related
+to transparent encryption and compression support.
diff -urN --no-dereference linux-5.10.2.orig/fs/reiser4/reiser4.h linux-5.10.2/fs/reiser4/reiser4.h
--- linux-5.10.2.orig/fs/reiser4/reiser4.h 1970-01-01 01:00:00.000000000 +0100
+++ linux-5.10.2/fs/reiser4/reiser4.h 2020-12-23 16:07:46.133813348 +0100
@@ -0,0 +1,274 @@
+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
+ * reiser4/README */
+
+
+/* definitions of common constants used by reiser4 */
+
+#if !defined( __REISER4_H__ )
+#define __REISER4_H__
+
+#include <asm/param.h> /* for HZ */
+#include <linux/errno.h>
+#include <linux/types.h>
+#include <linux/fs.h>
+#include <linux/hardirq.h>
+#include <linux/sched.h>
+
+/*
+ * reiser4 compilation options.
+ */
+
+#if defined(CONFIG_REISER4_DEBUG)
+/* turn on assertion checks */
+#define REISER4_DEBUG (1)
+#else
+#define REISER4_DEBUG (0)
+#endif
+
+#define REISER4_SHA256 (0)
+
+#define REISER4_VERSION_PRINCIPAL (5)
+
+/*
+ * Turn on large keys mode. In this mode (which is the default), a reiser4 key
+ * has 4 8-byte components. In the old "small key" mode, it has 3 8-byte
+ * components. The additional component, referred to as "ordering", is used to
+ * order the items a given object is composed of. As such, ordering is
+ * placed between locality and objectid. For a directory item, ordering
+ * contains the initial prefix of the file name the item is for. This sorts all
+ * directory items within a given directory lexicographically (but see
+ * fibration.[ch]). For file bodies and stat-data, ordering contains the
+ * initial prefix of the name the file was initially created with. In the
+ * common case (files with a single name) this allows ordering file bodies and
+ * stat-data in the same order as their respective directory entries, thus
+ * speeding up readdir.
+ *
+ * Note that the kernel can only mount a file system with the same key size as
+ * the one it was compiled for, so flipping this option may render your data
+ * inaccessible.
+ */
+#define REISER4_LARGE_KEY (1)
+/*#define REISER4_LARGE_KEY (0)*/
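+
+/*
+ * Rough sketch of the two layouts described above, with one 8-byte component
+ * per field (the exact bit packing of the locality/type components lives in
+ * key.h and is not repeated here):
+ *
+ *	large keys: [ locality | ordering | objectid | offset ]
+ *	small keys: [ locality | objectid | offset ]
+ */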
+
+/*#define GUESS_EXISTS 1*/
+
+/*
+ * PLEASE update fs/reiser4/kattr.c:show_options() when adding new compilation
+ * option
+ */
+
+#define REISER4_SUPER_MAGIC_STRING "ReIsEr4"
+extern const int REISER4_MAGIC_OFFSET; /* offset to magic string from the
+ * beginning of device */
+
+/* here go tunable parameters that are not worth special entry in kernel
+ configuration */
+
+/* default number of slots in coord-by-key caches */
+#define CBK_CACHE_SLOTS (16)
+/* how many elementary tree operations to carry on the next level */
+#define CARRIES_POOL_SIZE (5)
+/* size of pool of preallocated nodes for carry process. */
+#define NODES_LOCKED_POOL_SIZE (5)
+
+#define REISER4_NEW_NODE_FLAGS (COPI_LOAD_LEFT | COPI_LOAD_RIGHT | COPI_GO_LEFT)
+#define REISER4_NEW_EXTENT_FLAGS (COPI_LOAD_LEFT | COPI_LOAD_RIGHT | COPI_GO_LEFT)
+#define REISER4_PASTE_FLAGS (COPI_GO_LEFT)
+#define REISER4_INSERT_FLAGS (COPI_GO_LEFT)
+
+/* we are supporting reservation of disk space on uid basis */
+#define REISER4_SUPPORT_UID_SPACE_RESERVATION (0)
+/* we are supporting reservation of disk space for groups */
+#define REISER4_SUPPORT_GID_SPACE_RESERVATION (0)
+/* we are supporting reservation of disk space for root */
+#define REISER4_SUPPORT_ROOT_SPACE_RESERVATION (0)
+/* we use rapid flush mode, see flush.c for comments. */
+#define REISER4_USE_RAPID_FLUSH (1)
+
+/*
+ * set this to 0 if you don't want to use wait-for-flush in ->writepage().
+ */
+#define REISER4_USE_ENTD (1)
+
+#if defined(CONFIG_REISER4_OLD)
+/* key allocation is Plan-A */
+#define REISER4_PLANA_KEY_ALLOCATION (1)
+#define REISER4_PLANB_KEY_ALLOCATION (0)
+#else
+/* key allocation is Plan-B */
+#define REISER4_PLANA_KEY_ALLOCATION (0)
+#define REISER4_PLANB_KEY_ALLOCATION (1)
+#endif
+
+/* key allocation follows good old 3.x scheme */
+#define REISER4_3_5_KEY_ALLOCATION (0)
+
+/* size of hash-table for znodes */
+#define REISER4_ZNODE_HASH_TABLE_SIZE (1 << 13)
+
+/* number of buckets in lnode hash-table */
+#define LNODE_HTABLE_BUCKETS (1024)
+
+/* some ridiculously high maximal limit on the height of the znode tree. This
+   is used in declarations of various per-level arrays and
+   to allocate a statistics gathering array for per-level stats. */
+#define REISER4_MAX_ZTREE_HEIGHT (8)
+
+#define REISER4_PANIC_MSG_BUFFER_SIZE (1024)
+
+/* If an array contains fewer than REISER4_SEQ_SEARCH_BREAK elements, then
+   sequential search is on average faster than binary search. This is because
+   of better optimization and because sequential search is more CPU
+   cache friendly. This number (25) was found by experiments on a dual AMD
+   Athlon(tm), 1400MHz.
+
+   NOTE: testing in the kernel has shown that binary search is more effective
+   than implied by the results of the user-level benchmarking, probably because
+   in the node the keys are separated by other data. So the value was adjusted
+   after a few tests. More thorough tuning is needed.
+*/
+#define REISER4_SEQ_SEARCH_BREAK (3)
+static_assert(REISER4_SEQ_SEARCH_BREAK > 2);
+
+/* don't allow tree to be lower than this */
+#define REISER4_MIN_TREE_HEIGHT (TWIG_LEVEL)
+
+/* NOTE NIKITA this is no longer used: maximal atom size is auto-adjusted to
+ * available memory. */
+/* Default value of maximal atom size. Can be overwritten by the
+   tmgr.atom_max_size mount option. By default infinity. */
+#define REISER4_ATOM_MAX_SIZE ((unsigned)(~0))
+
+/* Default value of maximal atom age (in jiffies). After reaching this age
+ atom will be forced to commit, either synchronously or asynchronously. Can
+ be overwritten by tmgr.atom_max_age mount option. */
+#define REISER4_ATOM_MAX_AGE (600 * HZ)
+
+/* sleeping period for ktxnmgrd */
+#define REISER4_TXNMGR_TIMEOUT (5 * HZ)
+
+/* timeout to wait for ent thread in writepage. Default: 3 milliseconds. */
+#define REISER4_ENTD_TIMEOUT (3 * HZ / 1000)
+
+/* start complaining after that many restarts in coord_by_key().
+
+ This either means incredibly heavy contention for this part of a tree, or
+ some corruption or bug.
+*/
+#define REISER4_CBK_ITERATIONS_LIMIT (100)
+
+/* return -EIO after that many iterations in coord_by_key().
+
+ I have witnessed more than 800 iterations (in 30 thread test) before cbk
+ finished. --nikita
+*/
+#define REISER4_MAX_CBK_ITERATIONS 500000
+
+/* put a per-inode limit on maximal number of directory entries with identical
+ keys in hashed directory.
+
+ Disable this until inheritance interfaces stabilize: we need some way to
+ set per directory limit.
+*/
+#define REISER4_USE_COLLISION_LIMIT (0)
+
+/* If flush finds more than FLUSH_RELOCATE_THRESHOLD adjacent dirty leaf-level
+ blocks it will force them to be relocated. */
+#define FLUSH_RELOCATE_THRESHOLD 64
+/* If flush can find a block allocation closer than at most
+   FLUSH_RELOCATE_DISTANCE from the preceder, it will relocate to that
+   position. */
+#define FLUSH_RELOCATE_DISTANCE 64
+
+/* If we have written this many blocks or more before encountering a busy
+   jnode in the flush list - abort flushing, hoping that next time we get
+   called this jnode will already be clean and we will save some seeks. */
+#define FLUSH_WRITTEN_THRESHOLD 50
+
+/* The maximum number of nodes to scan left on a level during flush. */
+#define FLUSH_SCAN_MAXNODES 10000
+
+/* per-atom limit of flushers */
+#define ATOM_MAX_FLUSHERS (1)
+
+/* default tracing buffer size */
+#define REISER4_TRACE_BUF_SIZE (1 << 15)
+
+/* what size units of IO we would like cp, etc., to use, in writing to
+ reiser4. In bytes.
+
+ Can be overwritten by optimal_io_size mount option.
+*/
+#define REISER4_OPTIMAL_IO_SIZE (64 * 1024)
+
+/* see comments in inode.c:oid_to_uino() */
+#define REISER4_UINO_SHIFT (1 << 30)
+
+/* Mark function argument as unused to avoid compiler warnings. */
+#define UNUSED_ARG __attribute__((unused))
+
+#if ((__GNUC__ == 3) && (__GNUC_MINOR__ >= 3)) || (__GNUC__ > 3)
+#define NONNULL __attribute__((nonnull))
+#else
+#define NONNULL
+#endif
+
+/* master super block offset in bytes.*/
+#define REISER4_MASTER_OFFSET 65536
+
+/* size of VFS block */
+#define VFS_BLKSIZE 512
+/* number of bits in size of VFS block (512==2^9) */
+#define VFS_BLKSIZE_BITS 9
+
+#define REISER4_I reiser4_inode_data
+
+/* implication */
+#define ergo(antecedent, consequent) (!(antecedent) || (consequent))
+/* logical equivalence */
+#define equi(p1, p2) (ergo((p1), (p2)) && ergo((p2), (p1)))
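+/*
+ * For example, ergo(ptr != NULL, ptr_is_valid(ptr)) reads "if ptr is set, it
+ * is valid" and holds vacuously when ptr is NULL; these macros are used
+ * heavily in assertions throughout the tree code (ptr_is_valid is a made-up
+ * predicate, used here for illustration only).
+ */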
+
+#define sizeof_array(x) ((int) (sizeof(x) / sizeof(x[0])))
+
+#define NOT_YET (0)
+
+/** Reiser4 specific error codes **/
+
+#define REISER4_ERROR_CODE_BASE 10000
+
+/* Neighbor is not available (side neighbor or parent) */
+#define E_NO_NEIGHBOR (REISER4_ERROR_CODE_BASE)
+
+/* Node was not found in cache */
+#define E_NOT_IN_CACHE (REISER4_ERROR_CODE_BASE + 1)
+
+/* node does not have enough free space to complete the balancing operation */
+#define E_NODE_FULL (REISER4_ERROR_CODE_BASE + 2)
+
+/* repeat operation */
+#define E_REPEAT (REISER4_ERROR_CODE_BASE + 3)
+
+/* deadlock happens */
+#define E_DEADLOCK (REISER4_ERROR_CODE_BASE + 4)
+
+/* operation cannot be performed, because it would block and non-blocking mode
+ * was requested. */
+#define E_BLOCK (REISER4_ERROR_CODE_BASE + 5)
+
+/* wait some event (depends on context), then repeat */
+#define E_WAIT (REISER4_ERROR_CODE_BASE + 6)
+
+/* going beyond something */
+#define E_OUTSTEP (REISER4_ERROR_CODE_BASE + 7)
+
+#endif /* __REISER4_H__ */
+
+/* Make Linus happy.
+ Local variables:
+ c-indentation-style: "K&R"
+ mode-name: "LC"
+ c-basic-offset: 8
+ tab-width: 8
+ fill-column: 120
+ End:
+*/
diff -urN --no-dereference linux-5.10.2.orig/fs/reiser4/safe_link.c linux-5.10.2/fs/reiser4/safe_link.c
--- linux-5.10.2.orig/fs/reiser4/safe_link.c 1970-01-01 01:00:00.000000000 +0100
+++ linux-5.10.2/fs/reiser4/safe_link.c 2020-12-23 16:07:46.133813348 +0100
@@ -0,0 +1,355 @@
+/* Copyright 2003, 2004 by Hans Reiser, licensing governed by
+ * reiser4/README */
+
+/* Safe-links. */
+
+/*
+ * Safe-links are used to maintain file system consistency during operations
+ * that spawn multiple transactions. For example:
+ *
+ * 1. Unlink. UNIX supports "open-but-unlinked" files, that is, files
+ * without user-visible names in the file system, but still opened by some
+ * active process. What happens here is that unlink proper (i.e., removal
+ * of the last file name) and file deletion (truncate of the file body to
+ * zero and deletion of stat-data, which happens when the last file
+ * descriptor is closed) may belong to different transactions T1 and T2. If
+ * a crash happens after T1 commits, but before T2 commits, the on-disk file
+ * system has a file without a name, that is, a disk space leak.
+ *
+ * 2. Truncate. Truncate of a large file may spawn multiple transactions. If
+ * the system crashes while a truncate was in progress, the file is left
+ * partially truncated, which violates the "atomicity guarantees" of reiser4,
+ * viz. that every system call is atomic.
+ *
+ * Safe-links address both of the above cases. Basically, a safe-link is a way
+ * to post some operation to be executed during commit of some transaction
+ * other than the current one. (Another way to look at the safe-link is to
+ * interpret it as logical logging.)
+ *
+ * Specifically, at the beginning of unlink a safe-link is inserted into the
+ * tree. This safe-link is normally removed by the file deletion code (during
+ * transaction T2 in the above terms). Truncate also inserts a safe-link that
+ * is normally removed when the truncate operation finishes.
+ *
+ * This means that in the case of a "clean umount" there are no safe-links in
+ * the tree. If safe-links are observed during mount, it means that (a) the
+ * system was terminated abnormally, and (b) the safe-links correspond to
+ * "pending" (i.e., not finished) operations that were in progress during
+ * system termination. Each safe-link records enough information to complete
+ * the corresponding operation, and mount simply "replays" them (hence the
+ * analogy with logical logging).
+ *
+ * Safe-links are implemented as blackbox items (see
+ * plugin/item/blackbox.[ch]).
+ *
+ * For the reference: ext3 also has similar mechanism, it's called "an orphan
+ * list" there.
+ */
+
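+/*
+ * Illustrative sketch (not from the original code): the rough order of calls
+ * around a multi-transaction truncate, using the helpers declared in
+ * safe_link.h. Real callers live in the file plugins; error handling and the
+ * actual truncate work are omitted here:
+ *
+ *	if (safe_link_grab(super, BA_CAN_COMMIT, subv) == 0)
+ *		safe_link_add(inode, SAFE_TRUNCATE);
+ *	safe_link_release(super);
+ *
+ *	... truncate proper, possibly spanning several transactions ...
+ *
+ *	if (safe_link_grab(super, BA_CAN_COMMIT, subv) == 0)
+ *		safe_link_del(subv, get_inode_oid(inode), SAFE_TRUNCATE);
+ *	safe_link_release(super);
+ */
+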
+#include "safe_link.h"
+#include "debug.h"
+#include "inode.h"
+
+#include "plugin/item/blackbox.h"
+
+#include <linux/fs.h>
+
+/*
+ * On-disk format of safe-link.
+ */
+typedef struct safelink {
+ reiser4_key sdkey; /* key of stat-data for the file safe-link is
+ * for */
+ d64 size; /* size to which file should be truncated */
+} safelink_t;
+
+/*
+ * locality where safe-link items are stored. Next to the objectid of root
+ * directory.
+ */
+static oid_t safe_link_locality(reiser4_subvol *subv)
+{
+ return get_key_objectid(subv->df_plug->root_dir_key(NULL)) + 1;
+}
+
+/*
+ Construct a key for the safe-link. Key has the following format:
+
+| 60 | 4 | 64 | 4 | 60 | 64 |
++---------------+---+------------------+---+---------------+------------------+
+| locality | 0 | 0 | 0 | objectid | link type |
++---------------+---+------------------+---+---------------+------------------+
+| | | | |
+| 8 bytes | 8 bytes | 8 bytes | 8 bytes |
+
+ This is in large keys format. In small keys format second 8 byte chunk is
+ out. Locality is a constant returned by safe_link_locality(). objectid is
+ an oid of a file on which operation protected by this safe-link is
+ performed. link-type is used to distinguish safe-links for different
+ operations.
+
+ */
+static reiser4_key *build_link_key(reiser4_subvol *subv, oid_t oid,
+ reiser4_safe_link_t link, reiser4_key * key)
+{
+ reiser4_key_init(key);
+ set_key_locality(key, safe_link_locality(subv));
+ set_key_objectid(key, oid);
+ set_key_offset(key, link);
+ return key;
+}
+
+/*
+ * how much disk space is necessary to insert and remove (in the
+ * error-handling path) safe-link.
+ */
+static __u64 safe_link_tograb(reiser4_tree * tree)
+{
+ return
+ /* insert safe link */
+ estimate_one_insert_item(tree) +
+ /* remove safe link */
+ estimate_one_item_removal(tree) +
+ /* drill to the leaf level during insertion */
+ 1 + estimate_one_insert_item(tree) +
+ /*
+ * possible update of existing safe-link. Actually, if
+ * safe-link existed already (we failed to remove it), then no
+ * insertion is necessary, so this term is already "covered",
+ * but for simplicity let's left it.
+ */
+ 1;
+}
+
+/*
+ * grab enough disk space to insert and remove (in the error-handling path)
+ * safe-link.
+ */
+int safe_link_grab(struct super_block *super,
+ reiser4_ba_flags_t flags, reiser4_subvol *subv)
+{
+ int result;
+
+ grab_space_enable();
+ /*
+ * The sbinfo->delete_mutex can be taken here.
+ * safe_link_release() should be called before leaving
+ * reiser4 context
+ */
+ result = reiser4_grab_reserved(super, safe_link_tograb(&subv->tree),
+ flags, subv);
+ grab_space_enable();
+ return result;
+}
+
+/*
+ * release unused disk space reserved by safe_link_grab().
+ */
+void safe_link_release(struct super_block *super)
+{
+ reiser4_release_reserved(super);
+}
+
+/*
+ * insert into tree safe-link for operation @link on inode @inode.
+ */
+int safe_link_add(struct inode *inode, reiser4_safe_link_t link)
+{
+ reiser4_key key;
+ safelink_t sl;
+ int length;
+ int result;
+ reiser4_subvol *subv = get_meta_subvol();
+
+ build_sd_key(inode, &sl.sdkey);
+ length = sizeof sl.sdkey;
+
+ if (link == SAFE_TRUNCATE) {
+ /*
+ * for truncate we have to store final file length also,
+ * expand item.
+ */
+ length += sizeof(sl.size);
+ put_unaligned(cpu_to_le64(inode->i_size), &sl.size);
+ }
+ build_link_key(subv, get_inode_oid(inode), link, &key);
+
+ result = store_black_box(&subv->tree, &key, &sl, length);
+ if (result == -EEXIST)
+ result = update_black_box(&subv->tree, &key, &sl, length);
+ return result;
+}
+
+/*
+ * remove safe-link corresponding to the operation @link on inode @inode from
+ * the tree.
+ */
+int safe_link_del(reiser4_subvol *subv, oid_t oid, reiser4_safe_link_t link)
+{
+ reiser4_key key;
+
+ return kill_black_box(&subv->tree,
+ build_link_key(subv, oid, link, &key));
+}
+
+/*
+ * in-memory structure to keep information extracted from safe-link. This is
+ * used to iterate over all safe-links.
+ */
+struct safe_link_context {
+ reiser4_tree *tree; /* internal tree */
+ reiser4_key key; /* safe-link key */
+ reiser4_key sdkey; /* key of object stat-data */
+ reiser4_safe_link_t link; /* safe-link type */
+ oid_t oid; /* object oid */
+ __u64 size; /* final size for truncate */
+};
+
+/*
+ * start iterating over all safe-links.
+ */
+static void safe_link_iter_begin(reiser4_subvol *subv,
+ struct safe_link_context *ctx)
+{
+ reiser4_key_init(&ctx->key);
+ set_key_locality(&ctx->key, safe_link_locality(subv));
+ set_key_objectid(&ctx->key, get_key_objectid(reiser4_max_key()));
+ set_key_offset(&ctx->key, get_key_offset(reiser4_max_key()));
+}
+
+/*
+ * return next safe-link.
+ */
+static int safe_link_iter_next(reiser4_subvol *subv,
+ struct safe_link_context *ctx)
+{
+ int result;
+ safelink_t sl;
+
+ result = load_black_box(&subv->tree, &ctx->key, &sl, sizeof sl, 0);
+ if (result == 0) {
+ ctx->oid = get_key_objectid(&ctx->key);
+ ctx->link = get_key_offset(&ctx->key);
+ ctx->sdkey = sl.sdkey;
+ if (ctx->link == SAFE_TRUNCATE)
+ ctx->size = le64_to_cpu(get_unaligned(&sl.size));
+ }
+ return result;
+}
+
+/*
+ * check whether there are any more safe-links left in the tree.
+ */
+static int safe_link_iter_finished(reiser4_subvol *subv,
+ struct safe_link_context *ctx)
+{
+ return get_key_locality(&ctx->key) != safe_link_locality(subv);
+}
+
+/*
+ * finish safe-link iteration.
+ */
+static void safe_link_iter_end(struct safe_link_context *ctx)
+{
+ /* nothing special */
+}
+
+/*
+ * process single safe-link.
+ */
+static int process_safelink(struct super_block *super, reiser4_subvol *subv,
+ reiser4_safe_link_t link, reiser4_key *sdkey,
+ oid_t oid, __u64 size)
+{
+ int result;
+ struct inode *inode;
+
+ /*
+ * obtain object inode by reiser4_iget(), then call object plugin
+ * ->safelink() method to do actual work, then delete safe-link on
+ * success.
+ */
+ inode = reiser4_iget(super, sdkey, 1, 1);
+ if (!IS_ERR(inode)) {
+ file_plugin *fplug;
+
+ fplug = inode_file_plugin(inode);
+ assert("nikita-3428", fplug != NULL);
+ assert("", oid == get_inode_oid(inode));
+ if (fplug->safelink != NULL) {
+ /* reiser4_txn_restart_current is not necessary because
+ * mounting is signle thread. However, without it
+ * deadlock detection code will complain (see
+ * nikita-3361). */
+ reiser4_txn_restart_current();
+ result = fplug->safelink(inode, link, size);
+ } else {
+ warning("nikita-3430",
+ "Cannot handle safelink for %lli",
+ (unsigned long long)oid);
+ reiser4_print_key("key", sdkey);
+ result = 0;
+ }
+ if (result != 0) {
+ warning("nikita-3431",
+ "Error processing safelink for %lli: %i",
+ (unsigned long long)oid, result);
+ }
+ reiser4_iget_complete(inode);
+ iput(inode);
+ if (result == 0) {
+ result = safe_link_grab(super, BA_CAN_COMMIT, subv);
+ if (result == 0)
+ result = safe_link_del(subv, oid, link);
+ safe_link_release(super);
+ /*
+ * restart transaction: if there was large number of
+ * safe-links, their processing may fail to fit into
+ * single transaction.
+ */
+ if (result == 0)
+ reiser4_txn_restart_current();
+ }
+ } else
+ result = PTR_ERR(inode);
+ return result;
+}
+
+/*
+ * iterate over all safe-links in the file-system processing them one by one.
+ */
+int process_safelinks(struct super_block *super, reiser4_subvol *subv)
+{
+ int result;
+ struct safe_link_context ctx;
+
+ if (sb_rdonly(super))
+ /* do nothing on the read-only file system */
+ return 0;
+ safe_link_iter_begin(subv, &ctx);
+ result = 0;
+ do {
+ result = safe_link_iter_next(subv, &ctx);
+ if (safe_link_iter_finished(subv, &ctx) || result == -ENOENT) {
+ result = 0;
+ break;
+ }
+ if (result == 0)
+ result = process_safelink(super, subv, ctx.link,
+ &ctx.sdkey, ctx.oid,
+ ctx.size);
+ } while (result == 0);
+ safe_link_iter_end(&ctx);
+ return result;
+}
+
+/* Make Linus happy.
+ Local variables:
+ c-indentation-style: "K&R"
+ mode-name: "LC"
+ c-basic-offset: 8
+ tab-width: 8
+ fill-column: 120
+ scroll-step: 1
+ End:
+*/
diff -urN --no-dereference linux-5.10.2.orig/fs/reiser4/safe_link.h linux-5.10.2/fs/reiser4/safe_link.h
--- linux-5.10.2.orig/fs/reiser4/safe_link.h 1970-01-01 01:00:00.000000000 +0100
+++ linux-5.10.2/fs/reiser4/safe_link.h 2020-12-23 16:07:46.133813348 +0100
@@ -0,0 +1,28 @@
+/* Copyright 2003 by Hans Reiser, licensing governed by
+ * reiser4/README */
+
+/* Safe-links. See safe_link.c for details. */
+
+#if !defined(__FS_SAFE_LINK_H__)
+#define __FS_SAFE_LINK_H__
+
+#include "tree.h"
+int safe_link_grab(struct super_block *super,
+ reiser4_ba_flags_t flags, reiser4_subvol *subv);
+void safe_link_release(struct super_block *super);
+int safe_link_add(struct inode *inode, reiser4_safe_link_t link);
+int safe_link_del(reiser4_subvol *, oid_t oid, reiser4_safe_link_t link);
+int process_safelinks(struct super_block *super, reiser4_subvol *subv);
+
+/* __FS_SAFE_LINK_H__ */
+#endif
+
+/* Make Linus happy.
+ Local variables:
+ c-indentation-style: "K&R"
+ mode-name: "LC"
+ c-basic-offset: 8
+ tab-width: 8
+ fill-column: 120
+ End:
+*/
diff -urN --no-dereference linux-5.10.2.orig/fs/reiser4/seal.c linux-5.10.2/fs/reiser4/seal.c
--- linux-5.10.2.orig/fs/reiser4/seal.c 1970-01-01 01:00:00.000000000 +0100
+++ linux-5.10.2/fs/reiser4/seal.c 2020-12-23 16:07:46.133813348 +0100
@@ -0,0 +1,223 @@
+/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
+/* Seals implementation. */
+/* Seals are "weak" tree pointers. They are analogous to tree coords in that
+   they allow bypassing tree traversal. But normal usage of coords implies
+   that the node pointed to by a coord is locked, whereas seals don't keep a
+   lock (or even a reference) to the znode. Instead, each znode contains a
+   version number, increased on each znode modification. This version number
+   is copied into a seal when the seal is created. Later, one can "validate"
+   the seal by calling reiser4_seal_validate(). If the znode is in cache and
+   its version number is still the same, the seal is "pristine" and the coord
+   associated with it can be re-used immediately.
+
+   If, on the other hand, the znode is out of cache, or it is obviously a
+   different one from the znode the seal was initially attached to (for
+   example, it is on a different level, or is being removed from the tree),
+   the seal is irreparably invalid ("burned") and tree traversal has to be
+   repeated.
+
+   Otherwise, there is some hope that while the znode was modified (and the
+   seal was "broken" as a result), the key attached to the seal is still in
+   the node. This is checked by first comparing this key with the delimiting
+   keys of the node and, if the key is ok, doing an intra-node lookup.
+
+   Znode version is maintained in the following way:
+
+   there is a reiser4_tree.znode_epoch counter. Whenever a new znode is
+   created, znode_epoch is incremented and its new value is stored in the
+   ->version field of the new znode. Whenever a znode is dirtied (which means
+   it was probably modified), znode_epoch is also incremented and its new
+   value is stored in znode->version. This is done because just incrementing
+   znode->version on each update is not enough: it may happen that a znode
+   gets deleted, a new znode is allocated for the same disk block and gets
+   the same version counter, tricking the seal code into a false positive.
+*/
+
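+/*
+ * Illustrative sketch (not from the original code): the usual pattern is to
+ * remember a position found by coord_by_key() in a seal, and to try the seal
+ * before paying for another full tree traversal. -E_REPEAT from
+ * reiser4_seal_validate() means "redo the lookup":
+ *
+ *	reiser4_seal_init(&seal, &coord, &key);
+ *	...
+ *	result = reiser4_seal_validate(&seal, tree, &coord, &key, &lh,
+ *				       ZNODE_READ_LOCK, ZNODE_LOCK_LOPRI);
+ *	if (result == -E_REPEAT)
+ *		result = coord_by_key(tree, &key, &coord, &lh, ZNODE_READ_LOCK,
+ *				      FIND_EXACT, LEAF_LEVEL, LEAF_LEVEL,
+ *				      CBK_UNIQUE, NULL);
+ */
+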
+#include "forward.h"
+#include "debug.h"
+#include "key.h"
+#include "coord.h"
+#include "seal.h"
+#include "plugin/item/item.h"
+#include "plugin/node/node.h"
+#include "jnode.h"
+#include "znode.h"
+#include "super.h"
+
+static znode *seal_node(const seal_t *seal, reiser4_tree *tree);
+static int seal_matches(const seal_t *seal, znode * node);
+
+/**
+ * Initialise seal.
+ * This can be called several times on the same seal.
+ * @coord: coord @seal will be attached to. Can be NULL.
+ * @key: key @seal will be attached to. Can be NULL.
+ */
+void reiser4_seal_init(seal_t *seal, const coord_t *coord,
+ const reiser4_key *key )
+{
+ assert("nikita-1886", seal != NULL);
+ memset(seal, 0, sizeof *seal);
+ if (coord != NULL) {
+ znode *node;
+
+ node = coord->node;
+ assert("nikita-1987", node != NULL);
+ spin_lock_znode(node);
+ seal->version = node->version;
+ assert("nikita-1988", seal->version != 0);
+ seal->block = *znode_get_block(node);
+#if REISER4_DEBUG
+ seal->coord1 = *coord;
+ if (key != NULL)
+ seal->key = *key;
+#endif
+ spin_unlock_znode(node);
+ }
+}
+
+/* finish with seal */
+void reiser4_seal_done(seal_t *seal/* seal to clear */)
+{
+ assert("nikita-1887", seal != NULL);
+ seal->version = 0;
+}
+
+/* true if seal was initialised */
+int reiser4_seal_is_set(const seal_t *seal/* seal to query */)
+{
+ assert("nikita-1890", seal != NULL);
+ return seal->version != 0;
+}
+
+#if REISER4_DEBUG
+/* helper function for reiser4_seal_validate(). It checks that the item at
+ * @coord has the expected key. This is to detect cases where a node was
+ * modified but wasn't marked dirty. */
+static inline int check_seal_match(const coord_t *coord /* coord to check */ ,
+ const reiser4_key *k__/* expected key */)
+{
+ reiser4_key ukey;
+
+ /* FIXME-VS: we only can compare keys for items whose units
+ represent exactly one key */
+ if (coord->between != AT_UNIT)
+ return 1;
+ if (!coord_is_existing_unit(coord))
+ return 0;
+ if (item_is_extent(coord))
+ return 1;
+ if (item_is_ctail(coord))
+ return keyge(k__, unit_key_by_coord(coord, &ukey));
+ return keyeq(k__, unit_key_by_coord(coord, &ukey));
+}
+#endif
+
+/* this is used by reiser4_seal_validate. It accepts the return value of
+ * longterm_lock_znode and returns 1 if it can be interpreted as a seal
+ * validation failure. For instance, when longterm_lock_znode returns -EINVAL,
+ * reiser4_seal_validate returns -E_REPEAT and the caller will redo the tree
+ * search. We cannot do this in longterm_lock_znode(), because sometimes we
+ * want to distinguish between -EINVAL and -E_REPEAT. */
+static int should_repeat(int return_code)
+{
+ return return_code == -EINVAL;
+}
+
+/* (re-)validate seal.
+
+   Checks whether the seal is pristine, and tries to revalidate it if possible.
+
+   If the seal was burned, or broken irreparably, return -E_REPEAT.
+
+   NOTE-NIKITA currently reiser4_seal_validate() returns -E_REPEAT if the key
+   we are looking for is in the range of keys covered by the sealed node, but
+   the item wasn't found by the node's ->lookup() method. An alternative is to
+   return -ENOENT in this case, but this would complicate the callers' logic.
+
+*/
+int reiser4_seal_validate(seal_t *seal /* seal to validate */,
+ reiser4_tree *tree,
+ coord_t *coord /* coord to validate against */,
+ const reiser4_key * key /* key to validate against */,
+ lock_handle * lh /* resulting lock handle */,
+ znode_lock_mode mode /* lock node */,
+ znode_lock_request request/* locking priority */)
+{
+ znode *node;
+ int result;
+
+ assert("nikita-1889", seal != NULL);
+ assert("nikita-1881", reiser4_seal_is_set(seal));
+ assert("nikita-1882", key != NULL);
+ assert("nikita-1883", coord != NULL);
+ assert("nikita-1884", lh != NULL);
+ assert("nikita-1885", all_but_ordering_keyeq(&seal->key, key));
+ assert("nikita-1989", coords_equal(&seal->coord1, coord));
+
+ /* obtain znode by block number */
+ node = seal_node(seal, tree);
+ if (!node)
+ /* znode wasn't in cache */
+ return RETERR(-E_REPEAT);
+ /* znode was in cache, lock it */
+ result = longterm_lock_znode(lh, node, mode, request);
+ zput(node);
+ if (result == 0) {
+ if (seal_matches(seal, node)) {
+ /* if seal version and znode version
+ coincide */
+ ON_DEBUG(coord_update_v(coord));
+ assert("nikita-1990",
+ node == seal->coord1.node);
+ assert("nikita-1898",
+ WITH_DATA_RET(coord->node, 1,
+ check_seal_match(coord,
+ key)));
+ } else
+ result = RETERR(-E_REPEAT);
+ }
+ if (result != 0) {
+ if (should_repeat(result))
+ result = RETERR(-E_REPEAT);
+ /* unlock node on failure */
+ done_lh(lh);
+ }
+ return result;
+}
+
+/* helpers functions */
+
+/* obtain reference to znode seal points to, if in cache */
+static znode *seal_node(const seal_t *seal, reiser4_tree *tree)
+{
+ assert("nikita-1891", seal != NULL);
+ assert("edward-1734", tree != NULL);
+
+ return zlook(tree, &seal->block);
+}
+
+/* true if @seal version and @node version coincide */
+static int seal_matches(const seal_t *seal /* seal to check */ ,
+ znode * node/* node to check */)
+{
+ int result;
+
+ assert("nikita-1991", seal != NULL);
+ assert("nikita-1993", node != NULL);
+
+ spin_lock_znode(node);
+ result = (seal->version == node->version);
+ spin_unlock_znode(node);
+ return result;
+}
+
+/* Make Linus happy.
+ Local variables:
+ c-indentation-style: "K&R"
+ mode-name: "LC"
+ c-basic-offset: 8
+ tab-width: 8
+ fill-column: 120
+ scroll-step: 1
+ End:
+*/
diff -urN --no-dereference linux-5.10.2.orig/fs/reiser4/seal.h linux-5.10.2/fs/reiser4/seal.h
--- linux-5.10.2.orig/fs/reiser4/seal.h 1970-01-01 01:00:00.000000000 +0100
+++ linux-5.10.2/fs/reiser4/seal.h 2020-12-23 16:07:46.133813348 +0100
@@ -0,0 +1,49 @@
+/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
+
+/* Declaration of seals: "weak" tree pointers. See seal.c for comments. */
+
+#ifndef __SEAL_H__
+#define __SEAL_H__
+
+#include "forward.h"
+#include "debug.h"
+#include "dformat.h"
+#include "key.h"
+#include "coord.h"
+
+/* for __u?? types */
+/*#include <linux/types.h>*/
+
+/* seal. See comment at the top of seal.c */
+typedef struct seal_s {
+	/* version of the znode recorded at the time of seal creation */
+ __u64 version;
+ /* block number of znode attached to this seal */
+ reiser4_block_nr block;
+#if REISER4_DEBUG
+ /* coord this seal is attached to. For debugging. */
+ coord_t coord1;
+ /* key this seal is attached to. For debugging. */
+ reiser4_key key;
+#endif
+} seal_t;
+
+extern void reiser4_seal_init(seal_t *, const coord_t *, const reiser4_key *);
+extern void reiser4_seal_done(seal_t *);
+extern int reiser4_seal_is_set(const seal_t *);
+extern int reiser4_seal_validate(seal_t *, reiser4_tree *, coord_t *,
+ const reiser4_key *, lock_handle * ,
+ znode_lock_mode mode, znode_lock_request request);
+
+/* __SEAL_H__ */
+#endif
+
+/* Make Linus happy.
+ Local variables:
+ c-indentation-style: "K&R"
+ mode-name: "LC"
+ c-basic-offset: 8
+ tab-width: 8
+ fill-column: 120
+ End:
+*/
diff -urN --no-dereference linux-5.10.2.orig/fs/reiser4/search.c linux-5.10.2/fs/reiser4/search.c
--- linux-5.10.2.orig/fs/reiser4/search.c 1970-01-01 01:00:00.000000000 +0100
+++ linux-5.10.2/fs/reiser4/search.c 2020-12-23 16:07:46.134813363 +0100
@@ -0,0 +1,1603 @@
+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
+ * reiser4/README */
+
+#include "forward.h"
+#include "debug.h"
+#include "dformat.h"
+#include "key.h"
+#include "coord.h"
+#include "seal.h"
+#include "plugin/item/item.h"
+#include "plugin/node/node.h"
+#include "plugin/plugin.h"
+#include "jnode.h"
+#include "znode.h"
+#include "block_alloc.h"
+#include "tree_walk.h"
+#include "tree.h"
+#include "reiser4.h"
+#include "super.h"
+#include "inode.h"
+
+#include <linux/slab.h>
+
+static const char *bias_name(lookup_bias bias);
+
+/* tree searching algorithm, intranode searching algorithms are in
+ plugin/node/ */
+
+/* tree lookup cache
+ *
+ * The coord-by-key cache consists of a small list of recently accessed nodes
+ * maintained according to the LRU discipline. Before doing a real top-to-down
+ * tree traversal this cache is scanned for nodes that can contain the
+ * requested key.
+ *
+ * The efficiency of the coord cache depends heavily on locality of reference
+ * for tree accesses. Our user-level simulations show reasonably good hit
+ * ratios for the coord cache under most loads so far.
+ */
+
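+/*
+ * In code terms (see coord_by_handle() below): cbk_cache_search(handle) is
+ * tried first and, only if it misses, traverse_tree(handle) performs the
+ * real top-to-bottom descent.
+ */
+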
+/* Initialise coord cache slot */
+static void cbk_cache_init_slot(cbk_cache_slot *slot)
+{
+ assert("nikita-345", slot != NULL);
+
+ INIT_LIST_HEAD(&slot->lru);
+ slot->node = NULL;
+}
+
+/* Initialize coord cache */
+int cbk_cache_init(cbk_cache * cache/* cache to init */)
+{
+ int i;
+
+ assert("nikita-346", cache != NULL);
+
+ cache->slot =
+ kmalloc(sizeof(cbk_cache_slot) * cache->nr_slots,
+ reiser4_ctx_gfp_mask_get());
+ if (cache->slot == NULL)
+ return RETERR(-ENOMEM);
+
+ INIT_LIST_HEAD(&cache->lru);
+ for (i = 0; i < cache->nr_slots; ++i) {
+ cbk_cache_init_slot(cache->slot + i);
+ list_add_tail(&((cache->slot + i)->lru), &cache->lru);
+ }
+ rwlock_init(&cache->guard);
+ return 0;
+}
+
+/* free cbk cache data */
+void cbk_cache_done(cbk_cache * cache/* cache to release */)
+{
+ assert("nikita-2493", cache != NULL);
+ if (cache->slot != NULL) {
+ kfree(cache->slot);
+ cache->slot = NULL;
+ }
+}
+
+/* macro to iterate over all cbk cache slots */
+#define for_all_slots(cache, slot) \
+ for ((slot) = list_entry((cache)->lru.next, cbk_cache_slot, lru); \
+ &(cache)->lru != &(slot)->lru; \
+ (slot) = list_entry(slot->lru.next, cbk_cache_slot, lru))
+
+#if REISER4_DEBUG
+/* this function assures that [cbk-cache-invariant] invariant holds */
+static int cbk_cache_invariant(const cbk_cache * cache)
+{
+ cbk_cache_slot *slot;
+ int result;
+ int unused;
+
+ if (cache->nr_slots == 0)
+ return 1;
+
+ assert("nikita-2469", cache != NULL);
+ unused = 0;
+ result = 1;
+ read_lock(&((cbk_cache *)cache)->guard);
+ for_all_slots(cache, slot) {
+ /* in LRU first go all `used' slots followed by `unused' */
+ if (unused && (slot->node != NULL))
+ result = 0;
+ if (slot->node == NULL)
+ unused = 1;
+ else {
+ cbk_cache_slot *scan;
+
+ /* all cached nodes are different */
+ scan = slot;
+ while (result) {
+ scan = list_entry(scan->lru.next,
+ cbk_cache_slot, lru);
+ if (&cache->lru == &scan->lru)
+ break;
+ if (slot->node == scan->node)
+ result = 0;
+ }
+ }
+ if (!result)
+ break;
+ }
+ read_unlock(&((cbk_cache *)cache)->guard);
+ return result;
+}
+
+#endif
+
+/* Remove references, if any, to @node from coord cache */
+void cbk_cache_invalidate(const znode * node /* node to remove from cache */ ,
+ reiser4_tree * tree/* tree to remove node from */)
+{
+ cbk_cache_slot *slot;
+ cbk_cache *cache;
+ int i;
+
+ assert("nikita-350", node != NULL);
+ assert("nikita-1479", LOCK_CNT_GTZ(rw_locked_tree));
+
+ cache = &tree->cbk_cache;
+ assert("nikita-2470", cbk_cache_invariant(cache));
+
+ write_lock(&(cache->guard));
+ for (i = 0, slot = cache->slot; i < cache->nr_slots; ++i, ++slot) {
+ if (slot->node == node) {
+ list_move_tail(&slot->lru, &cache->lru);
+ slot->node = NULL;
+ break;
+ }
+ }
+ write_unlock(&(cache->guard));
+ assert("nikita-2471", cbk_cache_invariant(cache));
+}
+
+/* add information about "node" to the cbk-cache of the "tree". This
+   can actually be an update of an existing slot in the cache. */
+static void cbk_cache_add(const znode * node/* node to add to the cache */)
+{
+ cbk_cache *cache;
+
+ cbk_cache_slot *slot;
+ int i;
+
+ assert("nikita-352", node != NULL);
+
+ cache = &znode_get_tree(node)->cbk_cache;
+ assert("nikita-2472", cbk_cache_invariant(cache));
+
+ if (cache->nr_slots == 0)
+ return;
+
+ write_lock(&(cache->guard));
+ /* find slot to update/add */
+ for (i = 0, slot = cache->slot; i < cache->nr_slots; ++i, ++slot) {
+ /* oops, this node is already in a cache */
+ if (slot->node == node)
+ break;
+ }
+ /* if all slots are used, reuse least recently used one */
+ if (i == cache->nr_slots) {
+ slot = list_entry(cache->lru.prev, cbk_cache_slot, lru);
+ slot->node = (znode *) node;
+ }
+ list_move(&slot->lru, &cache->lru);
+ write_unlock(&(cache->guard));
+ assert("nikita-2473", cbk_cache_invariant(cache));
+}
+
+static int setup_delimiting_keys(cbk_handle * h);
+static lookup_result coord_by_handle(cbk_handle * handle);
+static lookup_result traverse_tree(cbk_handle * h);
+static int cbk_cache_search(cbk_handle * h);
+
+static level_lookup_result cbk_level_lookup(cbk_handle * h);
+static level_lookup_result cbk_node_lookup(cbk_handle * h);
+
+/* helper functions */
+
+static void update_stale_dk(reiser4_tree * tree, znode * node);
+
+/* release parent node during traversal */
+static void put_parent(cbk_handle * h);
+/* check consistency of fields */
+static int sanity_check(cbk_handle * h);
+/* release resources in handle */
+static void hput(cbk_handle * h);
+
+static level_lookup_result search_to_left(cbk_handle * h);
+
+/* pack numerous (numberous I should say) arguments of coord_by_key() into
+ * cbk_handle */
+static cbk_handle *cbk_pack(cbk_handle * handle,
+ reiser4_tree * tree,
+ const reiser4_key * key,
+ coord_t *coord,
+ lock_handle * active_lh,
+ lock_handle * parent_lh,
+ znode_lock_mode lock_mode,
+ lookup_bias bias,
+ tree_level lock_level,
+ tree_level stop_level,
+ __u32 flags, ra_info_t *info)
+{
+ memset(handle, 0, sizeof *handle);
+
+ handle->tree = tree;
+ handle->key = key;
+ handle->lock_mode = lock_mode;
+ handle->bias = bias;
+ handle->lock_level = lock_level;
+ handle->stop_level = stop_level;
+ handle->coord = coord;
+ /* set flags. See comment in tree.h:cbk_flags */
+ handle->flags = flags | CBK_TRUST_DK | CBK_USE_CRABLOCK;
+
+ handle->active_lh = active_lh;
+ handle->parent_lh = parent_lh;
+ handle->ra_info = info;
+ return handle;
+}
+
+/* main tree lookup procedure
+
+ Check coord cache. If key we are looking for is not found there, call cbk()
+ to do real tree traversal.
+
+ As we have extents on the twig level, @lock_level and @stop_level can
+ be different from LEAF_LEVEL and each other.
+
+ Thread cannot keep any reiser4 locks (tree, znode, dk spin-locks, or znode
+ long term locks) while calling this.
+*/
+lookup_result coord_by_key(reiser4_tree * tree /* tree to perform search
+ * in. Usually this tree is
+ * part of file-system
+ * super-block */ ,
+ const reiser4_key * key /* key to look for */ ,
+ coord_t *coord /* where to store found
+ * position in a tree. Fields
+ * in "coord" are only valid if
+ * coord_by_key() returned
+ * "CBK_COORD_FOUND" */ ,
+ lock_handle * lh, /* resulting lock handle */
+ znode_lock_mode lock_mode /* type of lookup we
+ * want on node. Pass
+ * ZNODE_READ_LOCK here
+ * if you only want to
+ * read item found and
+ * ZNODE_WRITE_LOCK if
+ * you want to modify
+ * it */ ,
+ lookup_bias bias /* what to return if coord
+ * with exactly the @key is
+ * not in the tree */ ,
+ tree_level lock_level/* tree level where to start
+ * taking @lock type of
+ * locks */ ,
+ tree_level stop_level/* tree level to stop. Pass
+ * LEAF_LEVEL or TWIG_LEVEL
+ * here Item being looked
+ * for has to be between
+ * @lock_level and
+ * @stop_level, inclusive */ ,
+ __u32 flags /* search flags */ ,
+ ra_info_t *
+ info
+ /* information about desired tree traversal
+ * readahead */
+ )
+{
+ cbk_handle handle;
+ lock_handle parent_lh;
+ lookup_result result;
+
+ init_lh(lh);
+ init_lh(&parent_lh);
+
+ assert("nikita-3023", reiser4_schedulable());
+
+ assert("nikita-353", tree != NULL);
+ assert("nikita-354", key != NULL);
+ assert("nikita-355", coord != NULL);
+ assert("nikita-356", (bias == FIND_EXACT)
+ || (bias == FIND_MAX_NOT_MORE_THAN));
+ assert("nikita-357", stop_level >= LEAF_LEVEL);
+ /* no locks can be held during tree traversal */
+ assert("nikita-2104", lock_stack_isclean(get_current_lock_stack()));
+
+ cbk_pack(&handle,
+ tree,
+ key,
+ coord,
+ lh,
+ &parent_lh,
+ lock_mode, bias, lock_level, stop_level, flags, info);
+
+ result = coord_by_handle(&handle);
+ assert("nikita-3247",
+ ergo(!IS_CBKERR(result), coord->node == lh->node));
+ return result;
+}
+
+/* like coord_by_key(), but starts traversal from vroot of @object rather than
+ * from tree root. */
+lookup_result reiser4_object_lookup(reiser4_tree *tree,
+ struct inode *object,
+ const reiser4_key * key,
+ coord_t *coord,
+ lock_handle * lh,
+ znode_lock_mode lock_mode,
+ lookup_bias bias,
+ tree_level lock_level,
+ tree_level stop_level, __u32 flags,
+ ra_info_t *info)
+{
+ cbk_handle handle;
+ lock_handle parent_lh;
+ lookup_result result;
+
+ init_lh(lh);
+ init_lh(&parent_lh);
+
+ assert("nikita-3023", reiser4_schedulable());
+
+ assert("nikita-354", key != NULL);
+ assert("nikita-355", coord != NULL);
+ assert("nikita-356", (bias == FIND_EXACT)
+ || (bias == FIND_MAX_NOT_MORE_THAN));
+ assert("nikita-357", stop_level >= LEAF_LEVEL);
+ /* no locks can be held during tree search by key */
+ assert("nikita-2104", lock_stack_isclean(get_current_lock_stack()));
+
+ cbk_pack(&handle,
+ tree,
+ key,
+ coord,
+ lh,
+ &parent_lh,
+ lock_mode, bias, lock_level, stop_level, flags, info);
+ handle.object = object;
+
+ result = coord_by_handle(&handle);
+ assert("nikita-3247",
+ ergo(!IS_CBKERR(result), coord->node == lh->node));
+ return result;
+}
+
+/* lookup by cbk_handle. Common part of coord_by_key() and
+ reiser4_object_lookup(). */
+static lookup_result coord_by_handle(cbk_handle * handle)
+{
+ /*
+ * first check cbk_cache (which is look-aside cache for our tree) and
+ * of this fails, start traversal.
+ */
+ /* first check whether "key" is in cache of recent lookups. */
+ if (cbk_cache_search(handle) == 0)
+ return handle->result;
+ else
+ return traverse_tree(handle);
+}
+
+/* Execute actor for each item (or unit, depending on @through_units_p),
+   starting from @coord, right-ward, until either:
+
+   - end of the tree is reached
+   - an unformatted node is met
+   - an error occurs
+   - @actor returns 0 or less
+
+   The error code, or the last actor return value, is returned.
+
+   This is used by plugin/dir/hashed_dir.c:reiser4_find_entry() to move
+   through sequences of entries with identical keys and the like.
+*/
+int reiser4_iterate_tree(reiser4_tree * tree /* tree to scan */ ,
+ coord_t *coord /* coord to start from */ ,
+ lock_handle * lh /* lock handle to start with and to
+ * update along the way */ ,
+ tree_iterate_actor_t actor /* function to call on each
+ * item/unit */ ,
+ void *arg /* argument to pass to @actor */ ,
+ znode_lock_mode mode /* lock mode on scanned nodes */ ,
+ int through_units_p /* call @actor on each item or on
+ * each unit */ )
+{
+ int result;
+
+ assert("nikita-1143", tree != NULL);
+ assert("nikita-1145", coord != NULL);
+ assert("nikita-1146", lh != NULL);
+ assert("nikita-1147", actor != NULL);
+
+ result = zload(coord->node);
+ coord_clear_iplug(coord);
+ if (result != 0)
+ return result;
+ if (!coord_is_existing_unit(coord)) {
+ zrelse(coord->node);
+ return -ENOENT;
+ }
+ while ((result = actor(tree, coord, lh, arg)) > 0) {
+ /* move further */
+ if ((through_units_p && coord_next_unit(coord)) ||
+ (!through_units_p && coord_next_item(coord))) {
+ do {
+ lock_handle couple;
+
+ /* move to the next node */
+ init_lh(&couple);
+ result =
+ reiser4_get_right_neighbor(&couple,
+ coord->node,
+ (int)mode,
+ GN_CAN_USE_UPPER_LEVELS);
+ zrelse(coord->node);
+ if (result == 0) {
+
+ result = zload(couple.node);
+ if (result != 0) {
+ done_lh(&couple);
+ return result;
+ }
+
+ coord_init_first_unit(coord,
+ couple.node);
+ done_lh(lh);
+ move_lh(lh, &couple);
+ } else
+ return result;
+ } while (node_is_empty(coord->node));
+ }
+
+ assert("nikita-1149", coord_is_existing_unit(coord));
+ }
+ zrelse(coord->node);
+ return result;
+}
+
+/**
+ * Return locked uber znode for @tree
+ */
+int get_uber_znode(reiser4_tree * tree, znode_lock_mode mode,
+ znode_lock_request pri, lock_handle * lh)
+{
+ int result;
+
+ result = longterm_lock_znode(lh, tree->uber, mode, pri);
+ return result;
+}
+
+/* true if @key is strictly within @node
+
+   we are looking for a possibly non-unique key and its item is at the edge of
+   @node. Maybe it is in the neighbor.
+*/
+static int znode_contains_key_strict(znode * node /* node to check key
+ * against */ ,
+ const reiser4_key *
+ key /* key to check */ ,
+ int isunique)
+{
+ int answer;
+
+ assert("nikita-1760", node != NULL);
+ assert("nikita-1722", key != NULL);
+
+ if (keyge(key, &node->rd_key))
+ return 0;
+
+ answer = keycmp(&node->ld_key, key);
+
+ if (isunique)
+ return answer != GREATER_THAN;
+ else
+ return answer == LESS_THAN;
+}
+
+/*
+ * Virtual Root (vroot) code.
+ *
+ * For a given file system object (e.g., a regular file or directory) let's
+ * define its "virtual root" (vroot) as the lowest node in the tree (that is,
+ * the node furthest from the tree root) such that all body items of said
+ * object are located in the tree rooted at this node.
+ *
+ * Once the vroot of an object is found, all tree lookups for items within the
+ * body of this object ("object lookups") can be started from its vroot
+ * rather than from the real root. This has the following advantages:
+ *
+ * 1. the number of nodes traversed during a lookup (and, hence, the number of
+ * key comparisons made) decreases, and
+ *
+ * 2. contention on the tree root is decreased. The latter was actually the
+ * motivating reason behind vroot, because the spin lock of the root node,
+ * which is taken when acquiring a long-term lock on the root node, is the
+ * hottest lock in reiser4.
+ *
+ * How to find vroot.
+ *
+ * When the vroot of object F is not yet determined, all object lookups start
+ * from the root of the tree. At each tree level during traversal we have
+ * a node N such that the key we are looking for (which is a key inside the
+ * object's body) is located within N. In the function handle_vroot(), called
+ * from cbk_level_lookup(), we check whether N is a possible vroot for
+ * F. The check is trivial---if neither the leftmost nor the rightmost item
+ * of N belongs to F (and we already have the helpful ->owns_item() method of
+ * the object plugin for this), then N is a possible vroot of F. This, of
+ * course, relies on the assumption that each object occupies a contiguous
+ * range of keys in the tree.
+ *
+ * Thus, traversing the tree downward and checking each node as we go, we can
+ * find the lowest such node, which, by definition, is the vroot.
+ *
+ * How to track vroot.
+ *
+ * Nohow. If the actual vroot changes, the next object lookup will just
+ * restart from the actual tree root, refreshing the object's vroot along
+ * the way.
+ *
+ */
+
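+/*
+ * Illustrative sketch (not from the original code): object lookups that can
+ * benefit from a vroot go through reiser4_object_lookup() defined above; the
+ * argument values below are made up for illustration:
+ *
+ *	result = reiser4_object_lookup(tree, inode, &key, &coord, &lh,
+ *				       ZNODE_READ_LOCK, FIND_EXACT,
+ *				       LEAF_LEVEL, LEAF_LEVEL, CBK_UNIQUE, NULL);
+ *
+ * traverse_tree() then calls prepare_object_lookup() below to start the
+ * descent from the cached vroot of the inode instead of the tree root.
+ */
+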
+/*
+ * Check whether @node is possible vroot of @object.
+ */
+static void handle_vroot(struct inode *object, znode * node)
+{
+ file_plugin *fplug;
+ coord_t coord;
+
+ fplug = inode_file_plugin(object);
+ assert("nikita-3353", fplug != NULL);
+ assert("nikita-3354", fplug->owns_item != NULL);
+
+ if (unlikely(node_is_empty(node)))
+ return;
+
+ coord_init_first_unit(&coord, node);
+ /*
+ * if leftmost item of @node belongs to @object, we cannot be sure
+ * that @node is vroot of @object, because, some items of @object are
+ * probably in the sub-tree rooted at the left neighbor of @node.
+ */
+ if (fplug->owns_item(object, &coord))
+ return;
+ coord_init_last_unit(&coord, node);
+ /* mutatis mutandis for the rightmost item */
+ if (fplug->owns_item(object, &coord))
+ return;
+ /* otherwise, @node is possible vroot of @object */
+ inode_set_vroot(object, node);
+}
+
+/*
+ * helper function used by traverse_tree() to start tree traversal not from
+ * the tree root, but from @h->object's vroot, if possible.
+ */
+static int prepare_object_lookup(cbk_handle * h)
+{
+ znode *vroot;
+ int result;
+
+ vroot = inode_get_vroot(h->object);
+ if (vroot == NULL) {
+ /*
+ * object doesn't have known vroot, start from real tree root.
+ */
+ return LOOKUP_CONT;
+ }
+
+ h->level = znode_get_level(vroot);
+ /* take a long-term lock on vroot */
+ h->result = longterm_lock_znode(h->active_lh, vroot,
+ cbk_lock_mode(h->level, h),
+ ZNODE_LOCK_LOPRI);
+ result = LOOKUP_REST;
+ if (h->result == 0) {
+ int isunique;
+ int inside;
+
+ isunique = h->flags & CBK_UNIQUE;
+ /* check that key is inside vroot */
+ read_lock_dk(h->tree);
+ inside = (znode_contains_key_strict(vroot, h->key, isunique) &&
+ !ZF_ISSET(vroot, JNODE_HEARD_BANSHEE));
+ read_unlock_dk(h->tree);
+ if (inside) {
+ h->result = zload(vroot);
+ if (h->result == 0) {
+ /* search for key in vroot. */
+ result = cbk_node_lookup(h);
+ zrelse(vroot); /*h->active_lh->node); */
+ if (h->active_lh->node != vroot) {
+ result = LOOKUP_REST;
+ } else if (result == LOOKUP_CONT) {
+ move_lh(h->parent_lh, h->active_lh);
+ h->flags &= ~CBK_DKSET;
+ }
+ }
+ }
+ }
+
+ zput(vroot);
+
+ if (IS_CBKERR(h->result) || result == LOOKUP_REST)
+ hput(h);
+ return result;
+}
+
+/* main function that handles common parts of tree traversal: starting
+ (fake znode handling), restarts, error handling, completion */
+static lookup_result traverse_tree(cbk_handle * h/* search handle */)
+{
+ int done;
+ int iterations;
+ int vroot_used;
+
+ assert("nikita-365", h != NULL);
+ assert("nikita-366", h->tree != NULL);
+ assert("nikita-367", h->key != NULL);
+ assert("nikita-368", h->coord != NULL);
+ assert("nikita-369", (h->bias == FIND_EXACT)
+ || (h->bias == FIND_MAX_NOT_MORE_THAN));
+ assert("nikita-370", h->stop_level >= LEAF_LEVEL);
+ assert("nikita-2949", !(h->flags & CBK_DKSET));
+ assert("zam-355", lock_stack_isclean(get_current_lock_stack()));
+
+ done = 0;
+ iterations = 0;
+ vroot_used = 0;
+
+ /* loop for restarts */
+restart:
+
+ assert("nikita-3024", reiser4_schedulable());
+
+ h->result = CBK_COORD_FOUND;
+ /* connect_znode() needs it */
+ h->ld_key = *reiser4_min_key();
+ h->rd_key = *reiser4_max_key();
+ h->flags |= CBK_DKSET;
+ h->error = NULL;
+
+ if (!vroot_used && h->object != NULL) {
+ vroot_used = 1;
+ done = prepare_object_lookup(h);
+ if (done == LOOKUP_REST)
+ goto restart;
+ else if (done == LOOKUP_DONE)
+ return h->result;
+ }
+ if (h->parent_lh->node == NULL) {
+ done =
+ get_uber_znode(h->tree, ZNODE_READ_LOCK, ZNODE_LOCK_LOPRI,
+ h->parent_lh);
+
+ assert("nikita-1637", done != -E_DEADLOCK);
+
+ h->block = h->tree->root_block;
+ h->level = h->tree->height;
+ h->coord->node = h->parent_lh->node;
+
+ if (done != 0)
+ return done;
+ }
+
+ /* loop descending a tree */
+ while (!done) {
+
+ if (unlikely((iterations > REISER4_CBK_ITERATIONS_LIMIT) &&
+ IS_POW(iterations))) {
+ warning("nikita-1481", "Too many iterations: %i",
+ iterations);
+ reiser4_print_key("key", h->key);
+ ++iterations;
+ } else if (unlikely(iterations > REISER4_MAX_CBK_ITERATIONS)) {
+ h->error =
+ "reiser-2018: Too many iterations. Tree corrupted, or (less likely) starvation occurring.";
+ h->result = RETERR(-EIO);
+ break;
+ }
+ switch (cbk_level_lookup(h)) {
+ case LOOKUP_CONT:
+ move_lh(h->parent_lh, h->active_lh);
+ continue;
+ default:
+ wrong_return_value("nikita-372", "cbk_level");
+ case LOOKUP_DONE:
+ done = 1;
+ break;
+ case LOOKUP_REST:
+ hput(h);
+ /* deadlock avoidance is normal case. */
+ if (h->result != -E_DEADLOCK)
+ ++iterations;
+ reiser4_preempt_point();
+ goto restart;
+ }
+ }
+ /* that's all. The rest is error handling */
+ if (unlikely(h->error != NULL)) {
+ warning("nikita-373", "%s: level: %i, "
+ "lock_level: %i, stop_level: %i "
+ "lock_mode: %s, bias: %s",
+ h->error, h->level, h->lock_level, h->stop_level,
+ lock_mode_name(h->lock_mode), bias_name(h->bias));
+ reiser4_print_address("block", &h->block);
+ reiser4_print_key("key", h->key);
+ print_coord_content("coord", h->coord);
+ }
+ /* `unlikely' error case */
+ if (unlikely(IS_CBKERR(h->result))) {
+ /* failure. do cleanup */
+ hput(h);
+ } else {
+ assert("nikita-1605", WITH_DATA_RET
+ (h->coord->node, 1,
+ ergo((h->result == CBK_COORD_FOUND) &&
+ (h->bias == FIND_EXACT) &&
+ (!node_is_empty(h->coord->node)),
+ coord_is_existing_item(h->coord))));
+ }
+ return h->result;
+}
+
+/* find delimiting keys of child
+
+ Determine left and right delimiting keys for child pointed to by
+ @parent_coord.
+
+*/
+static void find_child_delimiting_keys(znode * parent /* parent znode, passed
+ * locked */ ,
+ const coord_t *parent_coord
+ /* coord where pointer
+ * to child is stored
+ */ ,
+ reiser4_key * ld /* where to store left
+ * delimiting key */ ,
+ reiser4_key * rd /* where to store right
+ * delimiting key */ )
+{
+ coord_t neighbor;
+
+ assert("nikita-1484", parent != NULL);
+ assert_rw_locked(&(znode_get_tree(parent)->dk_lock));
+
+ coord_dup(&neighbor, parent_coord);
+
+ if (neighbor.between == AT_UNIT)
+ /* imitate item ->lookup() behavior. */
+ neighbor.between = AFTER_UNIT;
+
+ if (coord_set_to_left(&neighbor) == 0)
+ unit_key_by_coord(&neighbor, ld);
+ else {
+ assert("nikita-14851", 0);
+ *ld = *znode_get_ld_key(parent);
+ }
+
+ coord_dup(&neighbor, parent_coord);
+ if (neighbor.between == AT_UNIT)
+ neighbor.between = AFTER_UNIT;
+ if (coord_set_to_right(&neighbor) == 0)
+ unit_key_by_coord(&neighbor, rd);
+ else
+ *rd = *znode_get_rd_key(parent);
+}
+
+/*
+ * setup delimiting keys for a child
+ *
+ * @parent parent node
+ *
+ * @coord location in @parent where pointer to @child is
+ *
+ * @child child node
+ */
+int
+set_child_delimiting_keys(znode * parent, const coord_t *coord, znode * child)
+{
+ reiser4_tree *tree;
+
+ assert("nikita-2952",
+ znode_get_level(parent) == znode_get_level(coord->node));
+
+	/* fast check without taking dk lock. This is safe, because
+	 * JNODE_DKSET is never cleared once set (see the illustrative
+	 * sketch of this pattern after this function). */
+ if (!ZF_ISSET(child, JNODE_DKSET)) {
+ tree = znode_get_tree(parent);
+ write_lock_dk(tree);
+ if (likely(!ZF_ISSET(child, JNODE_DKSET))) {
+ find_child_delimiting_keys(parent, coord,
+ &child->ld_key,
+ &child->rd_key);
+ ON_DEBUG(child->ld_key_version =
+ atomic_inc_return(&delim_key_version);
+ child->rd_key_version =
+ atomic_inc_return(&delim_key_version););
+ ZF_SET(child, JNODE_DKSET);
+ }
+ write_unlock_dk(tree);
+ return 1;
+ }
+ return 0;
+}
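+
+/*
+ * Illustrative sketch, not part of the code above and not built: the
+ * unlocked JNODE_DKSET test in set_child_delimiting_keys() is an instance
+ * of the double-checked pattern, which is safe only because the flag is
+ * monotonic---once set, it is never cleared. The generic shape, with a
+ * hypothetical struct foo and init_foo() helper, looks like this:
+ */
+#if 0
+struct foo {
+	unsigned long flags;	/* bit FOO_INITIALIZED is set exactly once */
+	spinlock_t lock;
+};
+#define FOO_INITIALIZED 0
+
+static void ensure_initialized(struct foo *f)
+{
+	/* cheap unlocked check; a stale 0 only costs us taking the lock */
+	if (!test_bit(FOO_INITIALIZED, &f->flags)) {
+		spin_lock(&f->lock);
+		/* recheck under the lock: another thread may have won */
+		if (!test_bit(FOO_INITIALIZED, &f->flags)) {
+			init_foo(f);	/* hypothetical initializer */
+			set_bit(FOO_INITIALIZED, &f->flags);
+		}
+		spin_unlock(&f->lock);
+	}
+}
+#endif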
+
+/* Perform tree lookup at one level. This is called from cbk_traverse()
+   function that drives lookup through the tree and calls cbk_node_lookup() to
+ perform lookup within one node.
+
+   See comments in the code.
+*/
+static level_lookup_result cbk_level_lookup(cbk_handle * h/* search handle */)
+{
+ int ret;
+ int setdk;
+ int ldkeyset = 0;
+ reiser4_key ldkey;
+ reiser4_key key;
+ znode *active;
+
+ assert("nikita-3025", reiser4_schedulable());
+
+ /* acquire reference to @active node */
+ active = zget(h->tree->subvol, &h->block, h->parent_lh->node, h->level,
+ reiser4_ctx_gfp_mask_get());
+
+ if (IS_ERR(active)) {
+ h->result = PTR_ERR(active);
+ return LOOKUP_DONE;
+ }
+
+ /* lock @active */
+ h->result = longterm_lock_znode(h->active_lh,
+ active,
+ cbk_lock_mode(h->level, h),
+ ZNODE_LOCK_LOPRI);
+ /* longterm_lock_znode() acquires additional reference to znode (which
+ will be later released by longterm_unlock_znode()). Release
+ reference acquired by zget().
+ */
+ zput(active);
+ if (unlikely(h->result != 0))
+ goto fail_or_restart;
+
+ setdk = 0;
+ /* if @active is accessed for the first time, setup delimiting keys on
+ it. Delimiting keys are taken from the parent node. See
+ setup_delimiting_keys() for details.
+ */
+ if (h->flags & CBK_DKSET) {
+ setdk = setup_delimiting_keys(h);
+ h->flags &= ~CBK_DKSET;
+ } else {
+ znode *parent;
+
+ parent = h->parent_lh->node;
+ h->result = zload(parent);
+ if (unlikely(h->result != 0))
+ goto fail_or_restart;
+
+ if (!ZF_ISSET(active, JNODE_DKSET))
+ setdk = set_child_delimiting_keys(parent,
+ h->coord, active);
+ else {
+ read_lock_dk(h->tree);
+ find_child_delimiting_keys(parent, h->coord, &ldkey,
+ &key);
+ read_unlock_dk(h->tree);
+ ldkeyset = 1;
+ }
+ zrelse(parent);
+ }
+
+ /* this is ugly kludge. Reminder: this is necessary, because
+ ->lookup() method returns coord with ->between field probably set
+ to something different from AT_UNIT.
+ */
+ h->coord->between = AT_UNIT;
+
+ if (znode_just_created(active) && (h->coord->node != NULL)) {
+ write_lock_tree();
+ /* if we are going to load znode right now, setup
+ ->in_parent: coord where pointer to this node is stored in
+ parent.
+ */
+ coord_to_parent_coord(h->coord, &active->in_parent);
+ write_unlock_tree();
+ }
+
+ /* check connectedness without holding tree lock---false negatives
+ * will be re-checked by connect_znode(), and false positives are
+ * impossible---@active cannot suddenly turn into unconnected
+ * state. */
+ if (!znode_is_connected(active)) {
+ h->result = connect_znode(h->coord, active);
+ if (unlikely(h->result != 0)) {
+ put_parent(h);
+ goto fail_or_restart;
+ }
+ }
+
+ jload_prefetch(ZJNODE(active));
+
+ if (setdk)
+ update_stale_dk(h->tree, active);
+
+ /* put_parent() cannot be called earlier, because connect_znode()
+ assumes parent node is referenced; */
+ put_parent(h);
+
+ if ((!znode_contains_key_lock(active, h->key) &&
+ (h->flags & CBK_TRUST_DK))
+ || ZF_ISSET(active, JNODE_HEARD_BANSHEE)) {
+ /* 1. key was moved out of this node while this thread was
+ waiting for the lock. Restart. More elaborate solution is
+ to determine where key moved (to the left, or to the right)
+ and try to follow it through sibling pointers.
+
+ 2. or, node itself is going to be removed from the
+ tree. Release lock and restart.
+ */
+ h->result = -E_REPEAT;
+ }
+ if (h->result == -E_REPEAT)
+ return LOOKUP_REST;
+
+ h->result = zload_ra(active, h->ra_info);
+ if (h->result)
+ return LOOKUP_DONE;
+
+ /* sanity checks */
+ if (sanity_check(h)) {
+ zrelse(active);
+ return LOOKUP_DONE;
+ }
+
+ /* check that key of leftmost item in the @active is the same as in
+ * its parent */
+ if (ldkeyset && !node_is_empty(active) &&
+ !keyeq(leftmost_key_in_node(active, &key), &ldkey)) {
+ warning("vs-3533", "Keys are inconsistent. Fsck?");
+ reiser4_print_key("inparent", &ldkey);
+ reiser4_print_key("inchild", &key);
+ h->result = RETERR(-EIO);
+ zrelse(active);
+ return LOOKUP_DONE;
+ }
+
+ if (h->object != NULL)
+ handle_vroot(h->object, active);
+
+ ret = cbk_node_lookup(h);
+
+ /* h->active_lh->node might change, but active is yet to be zrelsed */
+ zrelse(active);
+
+ return ret;
+
+fail_or_restart:
+ if (h->result == -E_DEADLOCK)
+ return LOOKUP_REST;
+ return LOOKUP_DONE;
+}
+
+#if REISER4_DEBUG
+/* check left and right delimiting keys of a znode */
+void check_dkeys(znode * node)
+{
+ znode *left;
+ znode *right;
+ reiser4_tree *tree = znode_get_tree(node);
+
+ read_lock_tree();
+ read_lock_dk(tree);
+
+ assert("vs-1710", znode_is_any_locked(node));
+ assert("vs-1197",
+ !keygt(znode_get_ld_key(node), znode_get_rd_key(node)));
+
+ left = node->left;
+ right = node->right;
+
+ if (ZF_ISSET(node, JNODE_LEFT_CONNECTED) && ZF_ISSET(node, JNODE_DKSET)
+ && left != NULL && ZF_ISSET(left, JNODE_DKSET))
+		/* check left neighbor. Note that left neighbor is not locked,
+		   so its delimiting keys might therefore be wrong */
+ assert("vs-1198",
+ (keyeq(znode_get_rd_key(left), znode_get_ld_key(node))
+ || ZF_ISSET(left, JNODE_HEARD_BANSHEE)));
+
+ if (ZF_ISSET(node, JNODE_RIGHT_CONNECTED) && ZF_ISSET(node, JNODE_DKSET)
+ && right != NULL && ZF_ISSET(right, JNODE_DKSET))
+		/* check right neighbor. Note that right neighbor is not
+		   locked, so its delimiting keys might therefore be wrong */
+ assert("vs-1199",
+ (keyeq(znode_get_rd_key(node), znode_get_ld_key(right))
+ || ZF_ISSET(right, JNODE_HEARD_BANSHEE)));
+
+ read_unlock_dk(tree);
+ read_unlock_tree();
+}
+#endif
+
+/* true if @key is left delimiting key of @node */
+static int key_is_ld(znode * node, const reiser4_key * key)
+{
+ int ld;
+
+ assert("nikita-1716", node != NULL);
+ assert("nikita-1758", key != NULL);
+
+ read_lock_dk(znode_get_tree(node));
+ assert("nikita-1759", znode_contains_key(node, key));
+ ld = keyeq(znode_get_ld_key(node), key);
+ read_unlock_dk(znode_get_tree(node));
+ return ld;
+}
+
+/* Process one node during tree traversal.
+
+ This is called by cbk_level_lookup(). */
+static level_lookup_result cbk_node_lookup(cbk_handle * h/* search handle */)
+{
+ /* node plugin of @active */
+ node_plugin *nplug;
+ /* item plugin of item that was found */
+ item_plugin *iplug;
+ /* search bias */
+ lookup_bias node_bias;
+ /* node we are operating upon */
+ znode *active;
+ /* tree we are searching in */
+ reiser4_tree *tree;
+ /* result */
+ int result;
+
+ assert("nikita-379", h != NULL);
+
+ active = h->active_lh->node;
+ tree = h->tree;
+
+ nplug = active->nplug;
+ assert("nikita-380", nplug != NULL);
+
+ //ON_DEBUG(check_dkeys(active));
+
+ /* return item from "active" node with maximal key not greater than
+ "key" */
+ node_bias = h->bias;
+ result = nplug->lookup(active, h->key, node_bias, h->coord);
+ if (unlikely(result != NS_FOUND && result != NS_NOT_FOUND)) {
+ /* error occurred */
+ h->result = result;
+ return LOOKUP_DONE;
+ }
+ if (h->level == h->stop_level) {
+ /* welcome to the stop level */
+ assert("nikita-381", h->coord->node == active);
+ if (result == NS_FOUND) {
+ /* success of tree lookup */
+ if (!(h->flags & CBK_UNIQUE)
+ && key_is_ld(active, h->key))
+ return search_to_left(h);
+ else
+ h->result = CBK_COORD_FOUND;
+ } else {
+ h->result = CBK_COORD_NOTFOUND;
+ }
+ if (!(h->flags & CBK_IN_CACHE))
+ cbk_cache_add(active);
+ return LOOKUP_DONE;
+ }
+
+ if (h->level > TWIG_LEVEL && result == NS_NOT_FOUND) {
+ h->error = "not found on internal node";
+ h->result = result;
+ return LOOKUP_DONE;
+ }
+
+ assert("vs-361", h->level > h->stop_level);
+
+ if (handle_eottl(h, &result)) {
+ assert("vs-1674", (result == LOOKUP_DONE ||
+ result == LOOKUP_REST));
+ return result;
+ }
+
+ /* go down to next level */
+ check_me("vs-12", zload(h->coord->node) == 0);
+ assert("nikita-2116", item_is_internal(h->coord));
+ iplug = item_plugin_by_coord(h->coord);
+ iplug->s.internal.down_link(h->coord, h->key, &h->block);
+ zrelse(h->coord->node);
+ --h->level;
+ return LOOKUP_CONT; /* continue */
+}
+
+/* scan cbk_cache slots looking for a match for @h */
+static int cbk_cache_scan_slots(cbk_handle * h/* cbk handle */)
+{
+ level_lookup_result llr;
+ znode *node;
+ reiser4_tree *tree;
+ cbk_cache_slot *slot;
+ cbk_cache *cache;
+ tree_level level;
+ int isunique;
+ const reiser4_key *key;
+ int result;
+
+ assert("nikita-1317", h != NULL);
+ assert("nikita-1315", h->tree != NULL);
+ assert("nikita-1316", h->key != NULL);
+
+ tree = h->tree;
+ cache = &tree->cbk_cache;
+ if (cache->nr_slots == 0)
+ /* size of cbk cache was set to 0 by mount time option. */
+ return RETERR(-ENOENT);
+
+ assert("nikita-2474", cbk_cache_invariant(cache));
+ node = NULL; /* to keep gcc happy */
+ level = h->level;
+ key = h->key;
+ isunique = h->flags & CBK_UNIQUE;
+ result = RETERR(-ENOENT);
+
+ /*
+	 * this is a time-critical function and dragons have, hence, been
+	 * settled here.
+	 *
+	 * The loop below scans cbk cache slots trying to find a matching node
+	 * with a suitable range of delimiting keys, located at h->level.
+	 *
+	 * The scan is done under the cbk cache spin lock that protects
+	 * slot->node pointers. If a suitable node is found we want to pin it
+	 * in memory. But slot->node can point to a node with x_count 0
+	 * (unreferenced). Such a node can be recycled at any moment, or can
+	 * already be in the process of being recycled (within jput()).
+	 *
+	 * Since we found the node in the cbk cache, jput() hasn't yet called
+	 * cbk_cache_invalidate() for it.
+	 *
+	 * We acquire a reference to the node without holding the tree lock,
+	 * and later check the node's RIP bit. This avoids races with jput()
+	 * (a simplified sketch of this pattern follows this function).
+ */
+
+ rcu_read_lock();
+ read_lock(&((cbk_cache *)cache)->guard);
+
+ slot = list_entry(cache->lru.next, cbk_cache_slot, lru);
+ slot = list_entry(slot->lru.prev, cbk_cache_slot, lru);
+ BUG_ON(&slot->lru != &cache->lru);/*????*/
+ while (1) {
+
+ slot = list_entry(slot->lru.next, cbk_cache_slot, lru);
+
+ if (&cache->lru != &slot->lru)
+ node = slot->node;
+ else
+ node = NULL;
+
+ if (unlikely(node == NULL))
+ break;
+
+ /*
+ * this is (hopefully) the only place in the code where we are
+ * working with delimiting keys without holding dk lock. This
+ * is fine here, because this is only "guess" anyway---keys
+ * are rechecked under dk lock below.
+ */
+ if (znode_get_level(node) == level &&
+ /* reiser4_min_key < key < reiser4_max_key */
+ znode_contains_key_strict(node, key, isunique)) {
+ zref(node);
+ result = 0;
+ spin_lock_prefetch(&get_current_super_private()->tree_lock);
+ break;
+ }
+ }
+ read_unlock(&((cbk_cache *)cache)->guard);
+
+ assert("nikita-2475", cbk_cache_invariant(cache));
+
+ if (unlikely(result == 0 && ZF_ISSET(node, JNODE_RIP)))
+ result = -ENOENT;
+
+ rcu_read_unlock();
+
+ if (result != 0) {
+ h->result = CBK_COORD_NOTFOUND;
+ return RETERR(-ENOENT);
+ }
+
+ result =
+ longterm_lock_znode(h->active_lh, node, cbk_lock_mode(level, h),
+ ZNODE_LOCK_LOPRI);
+ zput(node);
+ if (result != 0)
+ return result;
+ result = zload(node);
+ if (result != 0)
+ return result;
+
+ /* recheck keys */
+ read_lock_dk(tree);
+ result = (znode_contains_key_strict(node, key, isunique) &&
+ !ZF_ISSET(node, JNODE_HEARD_BANSHEE));
+ read_unlock_dk(tree);
+ if (result) {
+ /* do lookup inside node */
+ llr = cbk_node_lookup(h);
+ /* if cbk_node_lookup() wandered to another node (due to eottl
+ or non-unique keys), adjust @node */
+ /*node = h->active_lh->node; */
+
+ if (llr != LOOKUP_DONE) {
+ /* restart or continue on the next level */
+ result = RETERR(-ENOENT);
+ } else if (IS_CBKERR(h->result))
+ /* io or oom */
+ result = RETERR(-ENOENT);
+ else {
+ /* good. Either item found or definitely not found. */
+ result = 0;
+
+ write_lock(&(cache->guard));
+ if (slot->node == h->active_lh->node) {
+ /* if this node is still in cbk cache---move
+ its slot to the head of the LRU list. */
+ list_move(&slot->lru, &cache->lru);
+ }
+ write_unlock(&(cache->guard));
+ }
+ } else {
+		/* race. While this thread was waiting for the lock, the node
+		   was rebalanced and the item we are looking for was shifted
+		   out of it (if it ever was here).
+
+		   Continuing scanning is almost hopeless: the node the key
+		   range was moved to is almost certainly at the beginning of
+		   the LRU list at this time, because it's hot, but restarting
+		   scanning from the very beginning is complex. Just return,
+		   so that cbk() will be performed. This is not that
+		   important, because such races should be rare. Are they?
+		 */
+ result = RETERR(-ENOENT); /* -ERAUGHT */
+ }
+ zrelse(node);
+ assert("nikita-2476", cbk_cache_invariant(cache));
+ return result;
+}
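+
+/*
+ * Illustrative sketch, not built: the scan above follows a common
+ * lockless-cache pattern---find a candidate under RCU and a spin lock,
+ * take a reference, then recheck a "being destroyed" bit before trusting
+ * the object. The names below (struct cache, cache_obj, obj_get/obj_put,
+ * OBJ_DYING) are hypothetical and do not exist in reiser4.
+ */
+#if 0
+static struct cache_obj *cache_find(struct cache *c, unsigned long key)
+{
+	struct cache_obj *pos, *found = NULL;
+
+	rcu_read_lock();
+	read_lock(&c->guard);
+	list_for_each_entry(pos, &c->lru, lru) {
+		if (pos->key == key) {
+			obj_get(pos);	/* pin; may race with teardown */
+			found = pos;
+			break;
+		}
+	}
+	read_unlock(&c->guard);
+	/*
+	 * teardown sets OBJ_DYING before the object can be freed, so an
+	 * object caught in the middle of destruction is rejected here
+	 */
+	if (found && test_bit(OBJ_DYING, &found->flags)) {
+		obj_put(found);
+		found = NULL;
+	}
+	rcu_read_unlock();
+	return found;
+}
+#endif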
+
+/* look for item with given key in the coord cache
+
+ This function, called by coord_by_key(), scans "coord cache" (&cbk_cache)
+   which is a small LRU list of recently accessed znodes. For each znode in
+   this list, it checks whether the key we are looking for fits into the key
+   range covered by that node. If so, and in addition the node lies at an
+   allowed level (this is to handle extents on a twig level), the node is
+   locked, and lookup inside it is performed.
+
+ we need a measurement of the cost of this cache search compared to the cost
+ of coord_by_key.
+
+*/
+static int cbk_cache_search(cbk_handle * h/* cbk handle */)
+{
+ int result = 0;
+ tree_level level;
+
+	/* add CBK_IN_CACHE to the handle flags. This tells
+	 * cbk_node_lookup() that the cbk_cache is being scanned, so it
+	 * does not re-add the found node to the cache. */
+ h->flags |= CBK_IN_CACHE;
+ for (level = h->stop_level; level <= h->lock_level; ++level) {
+ h->level = level;
+ result = cbk_cache_scan_slots(h);
+ if (result != 0) {
+ done_lh(h->active_lh);
+ done_lh(h->parent_lh);
+ } else {
+ assert("nikita-1319", !IS_CBKERR(h->result));
+ break;
+ }
+ }
+ h->flags &= ~CBK_IN_CACHE;
+ return result;
+}
+
+/* type of lock we want to obtain during tree traversal. On the stop level
+   we want the type of lock the user asked for; on upper levels, a read lock. */
+znode_lock_mode cbk_lock_mode(tree_level level, cbk_handle * h)
+{
+ assert("nikita-382", h != NULL);
+
+ return (level <= h->lock_level) ? h->lock_mode : ZNODE_READ_LOCK;
+}
+
+/* update outdated delimiting keys */
+static void stale_dk(reiser4_tree * tree, znode * node)
+{
+ znode *right;
+
+ read_lock_tree();
+ write_lock_dk(tree);
+ right = node->right;
+
+ if (ZF_ISSET(node, JNODE_RIGHT_CONNECTED) &&
+ right && ZF_ISSET(right, JNODE_DKSET) &&
+ !keyeq(znode_get_rd_key(node), znode_get_ld_key(right)))
+ znode_set_rd_key(node, znode_get_ld_key(right));
+
+ write_unlock_dk(tree);
+ read_unlock_tree();
+}
+
+/* check for possibly outdated delimiting keys, and update them if
+ * necessary. */
+static void update_stale_dk(reiser4_tree * tree, znode * node)
+{
+ znode *right;
+ reiser4_key rd;
+
+ read_lock_tree();
+ read_lock_dk(tree);
+ rd = *znode_get_rd_key(node);
+ right = node->right;
+ if (unlikely(ZF_ISSET(node, JNODE_RIGHT_CONNECTED) &&
+ right && ZF_ISSET(right, JNODE_DKSET) &&
+ !keyeq(&rd, znode_get_ld_key(right)))) {
+ assert("nikita-38211", ZF_ISSET(node, JNODE_DKSET));
+ read_unlock_dk(tree);
+ read_unlock_tree();
+ stale_dk(tree, node);
+ return;
+ }
+ read_unlock_dk(tree);
+ read_unlock_tree();
+}
+
+/*
+ * Handle searches for a non-unique key.
+ *
+ * Suppose that we are looking for an item with possibly non-unique key 100.
+ *
+ * Root node contains two pointers: one to a node with left delimiting key 0,
+ * and another to a node with left delimiting key 100. The item we are
+ * interested in may well be in the sub-tree rooted at the first pointer.
+ *
+ * To handle this, search_to_left() is called when search reaches the stop
+ * level. This function checks whether it is _possible_ that the item we are
+ * looking for is in the left neighbor (this can be done by comparing
+ * delimiting keys) and if so, tries to lock the left neighbor (this is a low
+ * priority lock, so it can deadlock; tree traversal is just restarted if it
+ * did) and then checks whether the left neighbor actually contains items
+ * with our key.
+ *
+ * Note that this is done on the stop level only. It is possible to try such
+ * a left-check on each level, but as duplicate keys are supposed to be rare
+ * (it is very unlikely that more than one node is completely filled with
+ * items with duplicate keys), it is cheaper to scan to the left on the stop
+ * level once.
+ *
+ */
+static level_lookup_result search_to_left(cbk_handle * h/* search handle */)
+{
+ level_lookup_result result;
+ coord_t *coord;
+ znode *node;
+ znode *neighbor;
+
+ lock_handle lh;
+
+ assert("nikita-1761", h != NULL);
+ assert("nikita-1762", h->level == h->stop_level);
+
+ init_lh(&lh);
+ coord = h->coord;
+ node = h->active_lh->node;
+ assert("nikita-1763", coord_is_leftmost_unit(coord));
+
+ h->result =
+ reiser4_get_left_neighbor(&lh, node, (int)h->lock_mode,
+ GN_CAN_USE_UPPER_LEVELS);
+ neighbor = NULL;
+ switch (h->result) {
+ case -E_DEADLOCK:
+ result = LOOKUP_REST;
+ break;
+ case 0:{
+ node_plugin *nplug;
+ coord_t crd;
+ lookup_bias bias;
+
+ neighbor = lh.node;
+ h->result = zload(neighbor);
+ if (h->result != 0) {
+ result = LOOKUP_DONE;
+ break;
+ }
+
+ nplug = neighbor->nplug;
+
+ coord_init_zero(&crd);
+ bias = h->bias;
+ h->bias = FIND_EXACT;
+ h->result =
+ nplug->lookup(neighbor, h->key, h->bias, &crd);
+ h->bias = bias;
+
+ if (h->result == NS_NOT_FOUND) {
+ case -E_NO_NEIGHBOR:
+ h->result = CBK_COORD_FOUND;
+ if (!(h->flags & CBK_IN_CACHE))
+ cbk_cache_add(node);
+ default: /* some other error */
+ result = LOOKUP_DONE;
+ } else if (h->result == NS_FOUND) {
+ read_lock_dk(znode_get_tree(neighbor));
+ h->rd_key = *znode_get_ld_key(node);
+ leftmost_key_in_node(neighbor, &h->ld_key);
+ read_unlock_dk(znode_get_tree(neighbor));
+ h->flags |= CBK_DKSET;
+
+ h->block = *znode_get_block(neighbor);
+ /* clear coord->node so that cbk_level_lookup()
+ wouldn't overwrite parent hint in neighbor.
+
+ Parent hint was set up by
+ reiser4_get_left_neighbor()
+ */
+ /* FIXME: why do we have to spinlock here? */
+ write_lock_tree();
+ h->coord->node = NULL;
+ write_unlock_tree();
+ result = LOOKUP_CONT;
+ } else {
+ result = LOOKUP_DONE;
+ }
+ if (neighbor != NULL)
+ zrelse(neighbor);
+ }
+ }
+ done_lh(&lh);
+ return result;
+}
+
+/* debugging aid: return symbolic name of search bias */
+static const char *bias_name(lookup_bias bias/* bias to get name of */)
+{
+ if (bias == FIND_EXACT)
+ return "exact";
+ else if (bias == FIND_MAX_NOT_MORE_THAN)
+ return "left-slant";
+/* else if( bias == RIGHT_SLANT_BIAS ) */
+/* return "right-bias"; */
+ else {
+ static char buf[30];
+
+ sprintf(buf, "unknown: %i", bias);
+ return buf;
+ }
+}
+
+#if REISER4_DEBUG
+/* debugging aid: print human readable information about @p */
+void print_coord_content(const char *prefix /* prefix to print */ ,
+ coord_t *p/* coord to print */)
+{
+ reiser4_key key;
+
+ if (p == NULL) {
+ printk("%s: null\n", prefix);
+ return;
+ }
+ if ((p->node != NULL) && znode_is_loaded(p->node)
+ && coord_is_existing_item(p))
+ printk("%s: data: %p, length: %i\n", prefix,
+ item_body_by_coord(p), item_length_by_coord(p));
+ if (znode_is_loaded(p->node)) {
+ item_key_by_coord(p, &key);
+ reiser4_print_key(prefix, &key);
+ }
+}
+
+/* debugging aid: print human readable information about @block */
+void reiser4_print_address(const char *prefix /* prefix to print */ ,
+ const reiser4_block_nr * block/* block number to print */)
+{
+ printk("%s: %s\n", prefix, sprint_address(block));
+}
+#endif
+
+/* return string containing human readable representation of @block */
+char *sprint_address(const reiser4_block_nr *
+ block/* block number to print */)
+{
+ static char address[30];
+
+ if (block == NULL)
+ sprintf(address, "null");
+ else if (reiser4_blocknr_is_fake(block))
+ sprintf(address, "%llx", (unsigned long long)(*block));
+ else
+ sprintf(address, "%llu", (unsigned long long)(*block));
+ return address;
+}
+
+/* release parent node during traversal */
+static void put_parent(cbk_handle * h/* search handle */)
+{
+ assert("nikita-383", h != NULL);
+ if (h->parent_lh->node != NULL)
+ longterm_unlock_znode(h->parent_lh);
+}
+
+/* helper function used by coord_by_key(): release reference to parent znode
+ stored in handle before processing its child. */
+static void hput(cbk_handle * h/* search handle */)
+{
+ assert("nikita-385", h != NULL);
+ done_lh(h->parent_lh);
+ done_lh(h->active_lh);
+}
+
+/* Helper function used by cbk(): update delimiting keys of child node (stored
+ in h->active_lh->node) using key taken from parent on the parent level. */
+static int setup_delimiting_keys(cbk_handle * h/* search handle */)
+{
+ znode *active;
+ reiser4_tree *tree;
+
+ assert("nikita-1088", h != NULL);
+
+ active = h->active_lh->node;
+
+ /* fast check without taking dk lock. This is safe, because
+ * JNODE_DKSET is never cleared once set. */
+ if (!ZF_ISSET(active, JNODE_DKSET)) {
+ tree = znode_get_tree(active);
+ write_lock_dk(tree);
+ if (!ZF_ISSET(active, JNODE_DKSET)) {
+ znode_set_ld_key(active, &h->ld_key);
+ znode_set_rd_key(active, &h->rd_key);
+ ZF_SET(active, JNODE_DKSET);
+ }
+ write_unlock_dk(tree);
+ return 1;
+ }
+ return 0;
+}
+
+/* check consistency of fields */
+static int sanity_check(cbk_handle * h/* search handle */)
+{
+ assert("nikita-384", h != NULL);
+
+ if (h->level < h->stop_level) {
+ h->error = "Buried under leaves";
+ h->result = RETERR(-EIO);
+ return LOOKUP_DONE;
+ } else if (!reiser4_subvol_blocknr_is_sane(h->tree->subvol,
+ &h->block)) {
+ h->error = "bad block number";
+ h->result = RETERR(-EIO);
+ return LOOKUP_DONE;
+ } else
+ return 0;
+}
+
+/* Make Linus happy.
+ Local variables:
+ c-indentation-style: "K&R"
+ mode-name: "LC"
+ c-basic-offset: 8
+ tab-width: 8
+ fill-column: 120
+ scroll-step: 1
+ End:
+*/
diff -urN --no-dereference linux-5.10.2.orig/fs/reiser4/status_flags.c linux-5.10.2/fs/reiser4/status_flags.c
--- linux-5.10.2.orig/fs/reiser4/status_flags.c 1970-01-01 01:00:00.000000000 +0100
+++ linux-5.10.2/fs/reiser4/status_flags.c 2020-12-23 16:07:46.134813363 +0100
@@ -0,0 +1,180 @@
+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
+ * reiser4/README */
+
+/* Functions that deal with reiser4 status block, query status and update it,
+ * if needed */
+
+#include <linux/bio.h>
+#include <linux/highmem.h>
+#include <linux/fs.h>
+#include <linux/blkdev.h>
+#include "debug.h"
+#include "dformat.h"
+#include "status_flags.h"
+#include "super.h"
+
+/* This is our end I/O handler that marks the page uptodate if I/O was
+   successful. It also unconditionally unlocks the page, so we can see that
+   the I/O is done. We do not free the bio, because we hope to reuse it. */
+static void reiser4_status_endio(struct bio *bio)
+{
+ if (!bio->bi_status)
+ SetPageUptodate(bio->bi_io_vec->bv_page);
+ else {
+ ClearPageUptodate(bio->bi_io_vec->bv_page);
+ SetPageError(bio->bi_io_vec->bv_page);
+ }
+ unlock_page(bio->bi_io_vec->bv_page);
+}
+
+/* Initialise the status block code. This is expected to be called from the
+   disk format code. The block parameter is where the status block lives. */
+int reiser4_status_init(reiser4_subvol *subv, reiser4_block_nr block)
+{
+ struct super_block *sb = reiser4_get_current_sb();
+ struct reiser4_status *statuspage;
+ struct bio *bio;
+ struct page *page;
+
+ subv->status_page = NULL;
+ subv->status_bio = NULL;
+
+ page = alloc_pages(reiser4_ctx_gfp_mask_get(), 0);
+ if (!page)
+ return -ENOMEM;
+
+ bio = bio_alloc(reiser4_ctx_gfp_mask_get(), 1);
+ if (bio != NULL) {
+ bio->bi_iter.bi_sector = block * (sb->s_blocksize >> 9);
+ bio_set_dev(bio, subv->bdev);
+ bio->bi_io_vec[0].bv_page = page;
+ bio->bi_io_vec[0].bv_len = sb->s_blocksize;
+ bio->bi_io_vec[0].bv_offset = 0;
+ bio->bi_vcnt = 1;
+ bio->bi_iter.bi_size = sb->s_blocksize;
+ bio->bi_end_io = reiser4_status_endio;
+ } else {
+ __free_pages(page, 0);
+ return -ENOMEM;
+ }
+ lock_page(page);
+ bio_set_op_attrs(bio, READ, 0);
+ submit_bio(bio);
+ wait_on_page_locked(page);
+	if (!PageUptodate(page)) {
+		warning("green-2007",
+			"I/O error while reading status page\n");
+		/* neither page nor bio were stored in @subv yet, free them */
+		__free_pages(page, 0);
+		bio_put(bio);
+		return -EIO;
+	}
+
+ statuspage = (struct reiser4_status *)kmap_atomic(page);
+ if (memcmp
+ (statuspage->magic, REISER4_STATUS_MAGIC,
+ sizeof(REISER4_STATUS_MAGIC))) {
+ /* Magic does not match. */
+ kunmap_atomic((char *)statuspage);
+ warning("green-2008", "Wrong magic in status block\n");
+ __free_pages(page, 0);
+ bio_put(bio);
+ return -EINVAL;
+ }
+ kunmap_atomic((char *)statuspage);
+
+ subv->status_page = page;
+ subv->status_bio = bio;
+ return 0;
+}
+
+/**
+ * Query the status of the FS. Returns whether the FS can be safely mounted.
+ * Also, if the "status" and "extended" parameters are given, the
+ * corresponding parts of the on-disk status are stored there.
+ */
+int reiser4_status_query(reiser4_subvol *subv, u64 *status, u64 *extended)
+{
+ struct reiser4_status *statuspage;
+ int retval;
+
+ if (!subv->status_page)
+ /* No status page? */
+ return REISER4_STATUS_MOUNT_UNKNOWN;
+
+ statuspage = (struct reiser4_status *)kmap_atomic(subv->status_page);
+
+ switch ((long)le64_to_cpu(get_unaligned(&statuspage->status))) {
+ /* FIXME: this cast is a hack for 32 bit arches to work. */
+ case REISER4_STATUS_OK:
+ retval = REISER4_STATUS_MOUNT_OK;
+ break;
+ case REISER4_STATUS_CORRUPTED:
+ retval = REISER4_STATUS_MOUNT_WARN;
+ break;
+ case REISER4_STATUS_DAMAGED:
+ case REISER4_STATUS_DESTROYED:
+ case REISER4_STATUS_IOERROR:
+ retval = REISER4_STATUS_MOUNT_RO;
+ break;
+ default:
+ retval = REISER4_STATUS_MOUNT_UNKNOWN;
+ break;
+ }
+
+ if (status)
+ *status = le64_to_cpu(get_unaligned(&statuspage->status));
+ if (extended)
+ *extended = le64_to_cpu(get_unaligned(&statuspage->extended_status));
+
+ kunmap_atomic((char *)statuspage);
+ return retval;
+}
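+
+/*
+ * Illustrative sketch, not built: a hypothetical disk-format caller could
+ * use reiser4_status_query() at mount time roughly as below; the
+ * complain() and force_readonly() helpers are made up for the example.
+ */
+#if 0
+static void check_fs_status(reiser4_subvol *subv, struct super_block *sb)
+{
+	u64 status, extended;
+
+	switch (reiser4_status_query(subv, &status, &extended)) {
+	case REISER4_STATUS_MOUNT_WARN:
+		complain(sb, "filesystem is marked corrupted");
+		break;
+	case REISER4_STATUS_MOUNT_RO:
+		force_readonly(sb);
+		break;
+	case REISER4_STATUS_MOUNT_OK:
+	case REISER4_STATUS_MOUNT_UNKNOWN:	/* no status block found */
+	default:
+		break;
+	}
+}
+#endif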
+
+/* This function should be called when something bad happens (e.g. from
+ reiser4_panic). It fills the status structure and tries to push it to disk.*/
+int reiser4_status_write(reiser4_subvol *subv,
+ __u64 status, __u64 extended_status, char *message)
+{
+ struct super_block *sb = reiser4_get_current_sb();
+ struct reiser4_status *statuspage;
+ struct bio *bio = subv->status_bio;
+
+ if (!subv->status_page)
+ /* No status page? */
+ return -1;
+
+ statuspage = (struct reiser4_status *)kmap_atomic(subv->status_page);
+
+ put_unaligned(cpu_to_le64(status), &statuspage->status);
+ put_unaligned(cpu_to_le64(extended_status), &statuspage->extended_status);
+ strncpy(statuspage->texterror, message, REISER4_TEXTERROR_LEN);
+
+ kunmap_atomic((char *)statuspage);
+ bio_reset(bio);
+ bio_set_dev(bio, subv->bdev);
+ bio->bi_io_vec[0].bv_page = subv->status_page;
+ bio->bi_io_vec[0].bv_len = sb->s_blocksize;
+ bio->bi_io_vec[0].bv_offset = 0;
+ bio->bi_vcnt = 1;
+ bio->bi_iter.bi_size = sb->s_blocksize;
+ bio->bi_end_io = reiser4_status_endio;
+ lock_page(subv->status_page); /* Safe as nobody should touch our page */
+ /*
+ * We can block now, but we have no other choice anyway
+ */
+ bio_set_op_attrs(bio, WRITE, 0);
+ submit_bio(bio);
+ /*
+	 * We do not wait for I/O completion
+ */
+ return 0;
+}
+
+/* Frees the status page and the bio structure. Should be called by the disk
+ * format at umount time */
+int reiser4_status_finish(reiser4_subvol *subv)
+{
+ __free_pages(subv->status_page, 0);
+ subv->status_page = NULL;
+ bio_put(subv->status_bio);
+ subv->status_bio = NULL;
+ return 0;
+}
diff -urN --no-dereference linux-5.10.2.orig/fs/reiser4/status_flags.h linux-5.10.2/fs/reiser4/status_flags.h
--- linux-5.10.2.orig/fs/reiser4/status_flags.h 1970-01-01 01:00:00.000000000 +0100
+++ linux-5.10.2/fs/reiser4/status_flags.h 2020-12-23 16:07:46.134813363 +0100
@@ -0,0 +1,53 @@
+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
+ * reiser4/README */
+
+/* Here we declare structures and flags that store reiser4 status on disk.
+ The status that helps us to find out if the filesystem is valid or if it
+ contains some critical, or not so critical errors */
+
+#if !defined(__REISER4_STATUS_FLAGS_H__)
+#define __REISER4_STATUS_FLAGS_H__
+
+#include "dformat.h"
+/* These are major status flags */
+#define REISER4_STATUS_OK 0
+#define REISER4_STATUS_CORRUPTED 0x1
+#define REISER4_STATUS_DAMAGED 0x2
+#define REISER4_STATUS_DESTROYED 0x4
+#define REISER4_STATUS_IOERROR 0x8
+
+/* These are extended status flags */
+
+#define REISER4_ESTATUS_OK 0
+#define REISER4_ESTATUS_MIRRORS_NOT_SYNCED 0x1
+
+/* Return values for reiser4_status_query() */
+#define REISER4_STATUS_MOUNT_OK 0
+#define REISER4_STATUS_MOUNT_WARN 1
+#define REISER4_STATUS_MOUNT_RO 2
+#define REISER4_STATUS_MOUNT_UNKNOWN -1
+
+#define REISER4_TEXTERROR_LEN 256
+
+#define REISER4_STATUS_MAGIC "ReiSeR4StATusBl"
+/* We probably need to keep its size under sector size which is 512 bytes */
+struct reiser4_status {
+ char magic[16];
+ d64 status; /* Current FS state */
+ d64 extended_status; /* Any additional info that might have sense in
+ * addition to "status". E.g. last sector where
+ * io error happened if status is
+ * "io error encountered" */
+	d64 stacktrace[10]; /* Last ten function calls made (addresses) */
+ char texterror[REISER4_TEXTERROR_LEN]; /* Any error message if
+ * appropriate, otherwise filled
+ * with zeroes */
+};
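+
+/*
+ * Illustrative check, not part of this patch: the layout above should fit
+ * in one 512-byte sector. Assuming d64 is an 8-byte on-disk type, the size
+ * is 16 + 8 + 8 + 10*8 + 256 = 368 bytes, so the assumption holds.
+ */
+#if 0
+static inline void reiser4_status_block_size_check(void)
+{
+	BUILD_BUG_ON(sizeof(struct reiser4_status) > 512);
+}
+#endif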
+
+int reiser4_status_init(reiser4_subvol *subv, reiser4_block_nr block);
+int reiser4_status_query(reiser4_subvol *subv, u64 *status, u64 *extended);
+int reiser4_status_write(reiser4_subvol *subv, u64 status,
+ u64 extended_status, char *message);
+int reiser4_status_finish(reiser4_subvol *subv);
+
+#endif
diff -urN --no-dereference linux-5.10.2.orig/fs/reiser4/super.c linux-5.10.2/fs/reiser4/super.c
--- linux-5.10.2.orig/fs/reiser4/super.c 1970-01-01 01:00:00.000000000 +0100
+++ linux-5.10.2/fs/reiser4/super.c 2020-12-23 16:07:46.134813363 +0100
@@ -0,0 +1,361 @@
+/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by
+ * reiser4/README */
+
+/* Super-block manipulations. */
+
+#include "debug.h"
+#include "dformat.h"
+#include "key.h"
+#include "plugin/security/perm.h"
+#include "plugin/space/space_allocator.h"
+#include "plugin/plugin.h"
+#include "tree.h"
+#include "vfs_ops.h"
+#include "inode.h"
+#include "super.h"
+#include "reiser4.h"
+
+#include <linux/types.h> /* for __u?? */
+#include <linux/fs.h> /* for struct super_block */
+
+static __u64 reserved_for_gid(const struct super_block *sb, gid_t gid);
+static __u64 reserved_for_uid(const struct super_block *sb, uid_t uid);
+static __u64 reserved_for_root(const struct super_block *subv);
+
+/* Return reiser4-specific part of super block */
+reiser4_super_info_data *get_super_private_nocheck(const struct super_block *super)
+{
+ return (reiser4_super_info_data *) super->s_fs_info;
+}
+
+/* Return reiser4 fstype: value that is returned in ->f_type field by statfs()
+ */
+long reiser4_statfs_type(const struct super_block *super UNUSED_ARG)
+{
+ assert("nikita-448", super != NULL);
+ assert("nikita-449", is_reiser4_super(super));
+ return (long)REISER4_SUPER_MAGIC;
+}
+
+/* functions to read/modify fields of reiser4_super_info_data */
+
+/* get number of blocks in subvolume */
+__u64 reiser4_subvol_block_count(const reiser4_subvol *subv)
+{
+ assert("vs-494", subv != NULL);
+ return subv->block_count;
+}
+
+/**
+ * Scan mslots and collect statistics from each subvolume of a logical volume
+ */
+u64 reiser4_collect_super_stat(const struct super_block *sb,
+ u64 (*subvol_get_stat)(const reiser4_subvol *))
+{
+ u64 slot;
+ u64 result = 0;
+
+ for (slot = 0;; slot++) {
+ u64 cnt;
+ lv_conf *conf;
+ reiser4_subvol *subv;
+
+ rcu_read_lock();
+ conf = super_conf(sb);
+
+ if (slot >= conf->nr_mslots) {
+ rcu_read_unlock();
+ break;
+ }
+ if (!conf_mslot_at(conf, slot)) {
+ rcu_read_unlock();
+ continue;
+ }
+ subv = conf_origin(conf, slot);
+ assert("edward-2272", subv != NULL);
+
+ cnt = subvol_get_stat(subv);
+ rcu_read_unlock();
+
+ result += cnt;
+ }
+ return result;
+}
+
+/* get number of blocks in logical volume */
+__u64 reiser4_volume_block_count(const struct super_block *super)
+{
+ return reiser4_collect_super_stat(super,
+ reiser4_subvol_block_count);
+}
+
+/*
+ * Set the number of blocks and the reserved space for a subvolume.
+ * @nr is the total number of blocks of the subvolume.
+ */
+void reiser4_subvol_set_block_count(reiser4_subvol *subv, __u64 nr)
+{
+ assert("vs-501", subv != NULL);
+
+ subv->block_count = nr;
+ /*
+	 * The proper calculation of the reserved space counter (5% of the
+	 * device block count) would need a 64-bit division, which is missing
+	 * in Linux on the i386 platform. Because we do not need a precise
+	 * value here, we replace the div64 operation with a combination of
+	 * multiplication and shift: 51 / 2^10 == 0.0498 (a worked example
+	 * follows this function).
+	 * FIXME: this is a bug. It comes up only for very small filesystems
+	 * which are probably never used. Nevertheless, it is a bug. The number
+	 * of reserved blocks must not be less than the maximal number of
+	 * blocks which can be grabbed with BA_RESERVED.
+ */
+ subv->blocks_reserved = ((nr * 51) >> 10);
+}
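+
+/*
+ * Worked example for the multiply-and-shift above (illustrative, not
+ * built): for a subvolume of 1000000 blocks the exact 5% reserve would be
+ * 50000 blocks, while (1000000 * 51) >> 10 = 51000000 / 1024 = 49804,
+ * i.e. about 4.98%, which is close enough for this purpose.
+ */
+#if 0
+static __u64 reserved_approx(__u64 nr)
+{
+	/* ~5% of @nr without a 64-bit division: 51 / 1024 == 0.0498 */
+	return (nr * 51) >> 10;
+}
+#endif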
+
+__u64 reiser4_subvol_blocks_reserved(const reiser4_subvol *subv)
+{
+ return subv->blocks_reserved;
+}
+
+__u64 reiser4_volume_blocks_reserved(const struct super_block *super)
+{
+ return reiser4_collect_super_stat(super,
+ reiser4_subvol_blocks_reserved);
+}
+
+/* amount of blocks used (allocated for data or meta-data) in subvolume */
+__u64 reiser4_subvol_used_blocks(const reiser4_subvol *subv)
+{
+ assert("nikita-452", subv != NULL);
+ return subv->blocks_used;
+}
+
+/* set number of blocks used */
+void reiser4_subvol_set_used_blocks(reiser4_subvol *subv, __u64 nr)
+{
+ assert("vs-503", subv != NULL);
+ subv->blocks_used = nr;
+}
+
+__u64 reiser4_subvol_min_blocks_used(const reiser4_subvol *subv)
+{
+ assert("edward-2332", subv != NULL);
+ return subv->min_blocks_used;
+}
+
+void reiser4_subvol_set_min_blocks_used(reiser4_subvol *subv, __u64 nr)
+{
+ assert("edward-2333", subv != NULL);
+ subv->min_blocks_used = nr;
+}
+
+/* amount of free blocks in subvolume */
+__u64 reiser4_subvol_free_blocks(const reiser4_subvol *subv)
+{
+ assert("nikita-454", subv != NULL);
+ return subv->blocks_free;
+}
+
+/* set number of free blocks */
+void reiser4_subvol_set_free_blocks(reiser4_subvol *subv, __u64 nr)
+{
+ assert("vs-505", subv != NULL);
+ subv->blocks_free = nr;
+}
+
+__u64 reiser4_subvol_data_capacity(reiser4_subvol *subv)
+{
+ assert("edward-1796", subv != NULL);
+ assert("edward-1839",
+ subv->data_capacity <= reiser4_subvol_block_count(subv));
+ return subv->data_capacity;
+}
+
+void reiser4_subvol_set_data_capacity(reiser4_subvol *subv, __u64 value)
+{
+ assert("edward-1797", subv != NULL);
+ subv->data_capacity = value;
+}
+
+/* amount of free blocks in logical volume */
+__u64 reiser4_volume_free_blocks(const struct super_block *super)
+{
+ return reiser4_collect_super_stat(super,
+ reiser4_subvol_free_blocks);
+}
+
+/* get mkfs unique identifier */
+__u32 reiser4_mkfs_id(const struct super_block *super, __u32 subv_id)
+{
+ assert("vpf-221", super != NULL);
+ return super_origin(super, subv_id)->mkfs_id;
+}
+
+/* amount of free blocks */
+__u64 reiser4_subvol_free_committed_blocks(const reiser4_subvol *subv)
+{
+ return subv->blocks_free_committed;
+}
+
+/**
+ * amount of blocks reserved for @uid and @gid in a volume
+ */
+long reiser4_volume_reserved4user(const struct super_block *sb,
+ uid_t uid, /* user id */
+ gid_t gid /* group id */)
+{
+ long reserved = 0;
+
+ assert("nikita-456", sb != NULL);
+
+ if (REISER4_SUPPORT_GID_SPACE_RESERVATION)
+ reserved += reserved_for_gid(sb, gid);
+ if (REISER4_SUPPORT_UID_SPACE_RESERVATION)
+ reserved += reserved_for_uid(sb, uid);
+ if (REISER4_SUPPORT_ROOT_SPACE_RESERVATION && (uid == 0))
+ reserved += reserved_for_root(sb);
+ return reserved;
+}
+
+/* get/set value of/to grabbed blocks counter */
+__u64 reiser4_subvol_grabbed_blocks(const reiser4_subvol *subv)
+{
+ assert("zam-512", subv != NULL);
+
+ return subv->blocks_grabbed;
+}
+
+__u64 reiser4_subvol_flush_reserved(const reiser4_subvol *subv)
+{
+ assert("vpf-285", subv != NULL);
+
+ return subv->blocks_flush_reserved;
+}
+
+/* get/set value of/to counter of fake allocated formatted blocks */
+__u64 reiser4_subvol_fake_allocated_fmt(const reiser4_subvol *subv)
+{
+ assert("zam-516", subv != NULL);
+
+ return subv->blocks_fake_allocated;
+}
+
+/* get/set value of/to counter of fake allocated unformatted blocks */
+__u64 reiser4_subvol_fake_allocated_unf(const reiser4_subvol *subv)
+{
+ assert("zam-516", subv != NULL);
+
+ return subv->blocks_fake_allocated_unformatted;
+}
+
+/* get/set value of/to counter of clustered blocks */
+__u64 reiser4_subvol_clustered_blocks(const reiser4_subvol *subv)
+{
+ assert("edward-601", subv != NULL);
+
+ return subv->blocks_clustered;
+}
+
+/* space allocator used by this subvolume */
+reiser4_space_allocator *reiser4_get_space_allocator(reiser4_subvol *subv)
+{
+ assert("edward-1800", subv != NULL);
+ return &subv->space_allocator;
+}
+
+/* return fake inode used to bind formatted nodes in the page cache */
+struct inode *reiser4_get_super_fake(const struct super_block *super)
+{
+ assert("nikita-1757", super != NULL);
+ return get_super_private(super)->fake;
+}
+
+/* return fake inode used to bind copied on capture nodes in the page cache */
+struct inode *reiser4_get_cc_fake(const struct super_block *super)
+{
+ assert("nikita-1757", super != NULL);
+ return get_super_private(super)->cc;
+}
+
+/* return fake inode used to bind bitmaps and journal heads */
+struct inode *reiser4_get_bitmap_fake(const struct super_block *super)
+{
+ assert("nikita-17571", super != NULL);
+ return get_super_private(super)->bitmap;
+}
+
+/* Check that @super is (looks like) reiser4 super block. This is mainly for
+ use in assertions. */
+int is_reiser4_super(const struct super_block *super)
+{
+ return super != NULL &&
+ get_super_private(super) != NULL &&
+ super->s_op == &(get_super_private(super)->ops.super);
+}
+
+/**
+ * amount of blocks reserved for given group in file system
+ */
+static __u64 reserved_for_gid(const struct super_block *sb, gid_t gid)
+{
+ return 0;
+}
+
+/**
+ * amount of blocks reserved for given user in file system
+ */
+static __u64 reserved_for_uid(const struct super_block *sb, uid_t uid)
+{
+ return 0;
+}
+
+/**
+ * amount of blocks reserved for super user in file system
+ */
+static __u64 reserved_for_root(const struct super_block *sb)
+{
+ return 0;
+}
+
+/**
+ * true if block number @blk makes sense for the file system at @subv.
+ */
+int reiser4_subvol_blocknr_is_sane(const reiser4_subvol *subv,
+ const reiser4_block_nr *blk)
+{
+ assert("nikita-2957", subv != NULL);
+ assert("nikita-2958", blk != NULL);
+
+ if (reiser4_blocknr_is_fake(blk))
+ return 1;
+ return *blk < reiser4_subvol_block_count(subv);
+}
+
+#if REISER4_DEBUG
+static u64 reiser4_subvol_fake_allocated(const reiser4_subvol *subv)
+{
+ return reiser4_subvol_fake_allocated_fmt(subv) +
+ reiser4_subvol_fake_allocated_unf(subv);
+}
+
+u64 reiser4_volume_fake_allocated(const struct super_block *sb)
+{
+ u64 ret;
+ spin_lock_reiser4_super(get_super_private(sb));
+ ret = reiser4_collect_super_stat(sb,
+ reiser4_subvol_fake_allocated);
+ spin_unlock_reiser4_super(get_super_private(sb));
+ return ret;
+}
+#endif
+
+
+/* Make Linus happy.
+ Local variables:
+ c-indentation-style: "K&R"
+ mode-name: "LC"
+ c-basic-offset: 8
+ tab-width: 8
+ fill-column: 120
+ End:
+*/
diff -urN --no-dereference linux-5.10.2.orig/fs/reiser4/super.h linux-5.10.2/fs/reiser4/super.h
--- linux-5.10.2.orig/fs/reiser4/super.h 1970-01-01 01:00:00.000000000 +0100
+++ linux-5.10.2/fs/reiser4/super.h 2020-12-23 16:07:46.134813363 +0100
@@ -0,0 +1,936 @@
+/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by
+ * reiser4/README */
+
+/* Super-block functions. See super.c for details. */
+
+#if !defined(__REISER4_SUPER_H__)
+#define __REISER4_SUPER_H__
+
+#include <linux/exportfs.h>
+
+#include "ioctl.h"
+#include "tree.h"
+#include "entd.h"
+#include "wander.h"
+#include "fsdata.h"
+#include "plugin/object.h"
+#include "plugin/dst/dst.h"
+#include "plugin/space/space_allocator.h"
+
+/*
+ * Flush algorithms parameters.
+ */
+struct flush_params {
+ unsigned relocate_threshold;
+ unsigned relocate_distance;
+ unsigned written_threshold;
+ unsigned scan_maxnodes;
+};
+
+/*
+ * VFS related operation vectors.
+ */
+struct object_ops {
+ struct super_operations super;
+ struct dentry_operations dentry;
+ struct export_operations export;
+};
+
+/* reiser4-specific part of super block
+
+ Locking
+
+ Fields immutable after mount:
+
+ ->oid*
+ ->space*
+ ->default_[ug]id
+ ->mkfs_id
+ ->trace_flags
+ ->debug_flags
+ ->fs_flags
+ ->df_plug
+ ->optimal_io_size
+ ->plug
+ ->flush
+ ->u (bad name)
+ ->txnmgr
+ ->ra_params
+ ->journal_header
+ ->journal_footer
+
+ Fields protected by per-super block spin lock
+
+ ->block_count
+ ->blocks_used
+ ->blocks_free
+ ->blocks_free_committed
+ ->blocks_grabbed
+ ->blocks_fake_allocated_unformatted
+ ->blocks_fake_allocated
+ ->blocks_flush_reserved
+ ->eflushed
+ ->blocknr_hint_default
+
+ After journal replaying during mount,
+
+ ->last_committed_tx
+
+ is protected by ->tmgr.commit_mutex
+
+ Invariants involving this data-type:
+
+ [sb-block-counts]
+ [sb-grabbed]
+ [sb-fake-allocated]
+*/
+
+/**
+ * Per-atom and per-subvolume commit info.
+ * This structure is accessed at atom commit time under commit_mutex.
+ * See also definition of per-logical-volume struct commit_handle.
+ */
+struct commit_handle_subvol
+{
+ struct list_head overwrite_set;
+ __u32 overwrite_set_size;
+ struct list_head tx_list; /* jnodes for wander record blocks */
+ __u32 tx_size; /* number of wander records for this subvolume */
+ struct list_head wander_map; /* The atom's wandered_block mapping.
+ * Earlier it was ->wandered_map of struct
+ * txn_atom. Edward moved it here, as
+ * wandered map is always constructed at
+ * commit time under commit_mutex, so
+ * actually there is nothing to do for this
+ * map in the struct txn_atom.
+ */
+ reiser4_block_nr nr_bitmap; /* counter of modified bitmaps */
+ u64 free_blocks; /*'committed' sb counters are saved here until
+ atom is completely flushed */
+};
+
+/*
+ * In-memory subvolume header.
+ * It is always associated with a physical or logical (built with LVM,
+ * etc means) block device.
+ */
+struct reiser4_subvol {
+ struct list_head list; /* all registered subvolumes are linked */
+ u8 uuid[16]; /* external ID */
+ char *name;
+ fmode_t mode;
+ struct block_device *bdev;
+ u64 id; /* internal ID (index in the array of slots) */
+ int mirror_id; /* index in the array of mirrors (0 indicates origin) */
+ u32 dsa_idx; /* index in Data Storage Array (DSA). That index is set
+ by ->create_buckets() operation */
+ int num_replicas; /* number of replicas, (mirrors excluding original) */
+ u64 data_capacity; /* "weight" of the brick in data storage array */
+ u64 apx_len;
+ void *apx;
+ reiser4_block_nr volmap_loc[2]; /* location of first voltab blocks for
+ current and new volume configs */
+ unsigned long flags; /* subvolume-wide flags, see subvol_flags enum */
+ disk_format_plugin *df_plug; /* disk format of this subvolume */
+ jnode *sb_jnode;
+ reiser4_block_nr loc_super; /* location of the format super-block */
+ reiser4_space_allocator space_allocator; /* space manager plugin */
+ reiser4_txmod_id txmod; /* transaction model for this subvolume */
+ struct flush_params flush; /* parameters for the flush algorithm */
+ reiser4_tree tree; /* internal tree */
+ __u32 mkfs_id; /* mkfs identifier generated at mkfs time. */
+
+ __u64 block_count; /* amount of blocks in a subvolume */
+ __u64 blocks_free; /* amount of free blocks. This is a "working" version
+ of free blocks counter. It is like "working"
+ bitmap, see block_alloc.c for description */
+ __u64 blocks_reserved; /* inviolable reserve */
+ __u64 blocks_used; /* amount of blocks used by file system data and
+ meta-data. */
+ __u64 blocks_grabbed; /* number of blocks reserved for further
+ allocation, for all threads */
+ __u64 blocks_fake_allocated_unformatted;/* number of fake allocated
+ unformatted blocks in tree */
+ __u64 blocks_fake_allocated; /* number of fake allocated formatted
+ blocks in tree */
+ __u64 blocks_flush_reserved; /* number of blocks reserved for flush
+ operations */
+ __u64 blocks_clustered; /* number of blocks reserved for cluster
+ operations */
+
+ int version; /* On-disk format version. May be upgraded at mount time */
+	jnode *journal_header; /* jnode of journal header */
+ jnode *journal_footer; /* jnode of journal footer */
+ journal_location jloc;
+ __u64 last_committed_tx; /* head block number of last committed
+ transaction */
+ __u64 blocknr_hint_default; /* we remember last written location
+ for using as a hint for new block
+ allocation */
+ struct repacker *repacker;
+ struct page *status_page; /* Image of the status block */
+ struct bio *status_bio;
+
+ __u64 min_blocks_used; /* minimum used blocks value (includes super
+ blocks, bitmap blocks and other fs reserved
+ areas), depends on fs format and fs size. */
+ /*
+ * Per-subvolume fields of commit handle.
+ * Access to them requires to acquire the commit_mutex.
+ */
+ __u64 blocks_freed; /* number of blocks freed by the actor
+ apply_dset_to_commit_bmap */
+ __u64 blocks_free_committed; /* "commit" version of free
+ block counter */
+ struct commit_handle_subvol ch;
+ struct super_block *super; /* associated super-block */
+};
+
+static inline int subvol_is_set(const reiser4_subvol *subv,
+ reiser4_subvol_flag f)
+{
+ return test_bit((int)f, &subv->flags);
+}
+
+/*
+ * In-memory superblock
+ */
+struct reiser4_super_info_data {
+ spinlock_t guard; /* protects fields blocks_free,
+ blocks_free_committed, etc */
+ oid_t next_to_use;/* next oid that will be returned by oid_allocate() */
+ oid_t oids_in_use; /* total number of used oids */
+ __u32 default_uid; /* default user id used for light-weight files
+ without their own stat-data */
+ __u32 default_gid; /* default group id used for light-weight files
+ without their own stat-data */
+ unsigned long fs_flags; /* file-system wide flags. See reiser4_fs_flag
+ enum */
+ txn_mgr tmgr; /* transaction manager */
+ entd_context entd; /* ent thread */
+ struct inode *fake; /* fake inode used to bind formatted nodes */
+ struct inode *bitmap; /* fake inode used to bind bitmaps (and journal
+ heads) */
+ struct inode *cc; /* fake inode used to bind copied on capture nodes */
+ unsigned long optimal_io_size; /* value we return in st_blksize on
+ stat(2) */
+ __u64 nr_files_committed; /* committed number of files (oid allocator
+ state variable ) */
+ __u64 vol_block_count; /* amount of blocks in a (logical) volume */
+ struct formatted_ra_params ra_params;
+ int onerror; /* What to do in case of IO error. Specified by a mount
+ option */
+ struct object_ops ops; /* operations for objects on this volume */
+ struct d_cursor_info d_info; /* structure to maintain d_cursors.
+ See plugin/file_ops_readdir.c for more
+ details */
+ struct crypto_shash *csum_tfm;
+ j_hash_table jhash_table; /* hash table to look up jnodes by inode
+ and offset. */
+ rwlock_t tree_lock; /* lock protecting:
+ - parent pointers;
+ - sibling pointers;
+ - znode hash table;
+ - coord cache.
+ NOTE: The "giant" tree lock can be replaced by
+ more spin locks, hoping they will be less
+				   contended. We can use one spin lock per
+				   znode hash bucket. With the addition of some
+				   code complexity, sibling pointers can be
+				   protected by both znode spin locks. However,
+				   although this looks more SMP scalable, we
+				   should test such a locking change on n-way
+				   (n > 4) SMP machines. Current 4-way machine
+				   tests do not show that the tree lock is
+				   contended or that it is a bottleneck
+				   (2003.07.25)
+ */
+ struct mutex delete_mutex;/* a mutex for serializing cut tree operation
+				     if out of free space: only one cut_tree
+				     thread is allowed to grab space from the
+				     reserved area (it is 5% of disk space) */
+ struct task_struct *delete_mutex_owner; /* task owning ->delete_mutex */
+#ifdef CONFIG_REISER4_BADBLOCKS
+ unsigned long altsuper; /* Alternative master superblock offset
+ (in bytes). Specified by a mount option */
+#endif
+ struct dentry *debugfs_root;
+#if REISER4_DEBUG
+ /*
+ * when debugging is on, all jnodes (including znodes, bitmaps, etc.)
+ * are kept on a list anchored at sbinfo->all_jnodes. This list is
+ * protected by sbinfo->all_guard spin lock. This lock should be taken
+ * with _irq modifier, because it is also modified from interrupt
+ * contexts (by RCU).
+ */
+ spinlock_t all_guard;
+ struct list_head all_jnodes; /* list of all jnodes */
+#endif
+	struct reiser4_volume *vol; /* associated volume header */
+ reiser4_context *ctx;
+};
+
+static inline struct reiser4_super_info_data *sbinfo_by_vol(struct reiser4_volume *vol)
+{
+ return container_of(&vol, struct reiser4_super_info_data, vol);
+}
+
+/*
+ * On-disk volume configuration
+ */
+struct reiser4_volinfo {
+ jnode **volmap_nodes;
+ int num_volmaps;
+ jnode **voltab_nodes;
+ int num_voltabs;
+};
+
+#define CUR_VOL_CONF 0
+#define NEW_VOL_CONF 1
+
+/*
+ * In-memory volume configuration
+ */
+struct lv_conf {
+ void *tab; /* distribution config */
+ u64 nr_mslots; /* number of columns in the table of activated
+ * subvolumes. Each column represents a set of
+ * mirrors (see the picture below) */
+ slot_t mslots[0]; /* pointer to a table of activated subvolumes,
+ * where:
+ * mslots[i] : array of mirrors at the i-th slot;
+ * mslots[i][j]: j-th mirror in the array above
+ * (see the picture below) */
+};
+
+/*
+ Table of activated subvolumes:
+
+ ******* <- @mslots
+ ooo o o
+ o o
+ o
+
+ * - original subvolumes
+ o - replicas
+
+ An original subvolume together with all its replicas is called a set of
+ mirrors. An original subvolume always has mirror_id = 0. Replicas have
+ mirror_id > 0.
+*/
+
+/*
+ * In-memory header of compound (logical) volume.
+ */
+struct reiser4_volume {
+ struct list_head list;
+ u8 uuid[16]; /* volume id */
+ int num_sgs_bits; /* logarithm of number of hash space segments */
+ int stripe_bits; /* logarithm of stripe size */
+ atomic_t nr_origins; /* number of original subvolumes (w/o replicas) */
+ distribution_plugin *dist_plug;
+ struct rw_semaphore volume_sem; /* protect volume configuration */
+ struct rw_semaphore brick_removal_sem;
+ volume_plugin *vol_plug;
+ reiser4_dcx dcx; /* distribution context */
+ reiser4_volinfo volinfo[2]; /* on-disk volume configurations: current
+				       and new (for volume operations). They
+				       need to co-exist for some time until we
+				       make sure that the new info has been
+				       written to disk successfully */
+ struct list_head subvols_list; /* list of registered subvolumes */
+ bucket_t *buckets; /* set of abstract buckets */
+ struct lv_conf *conf; /* current working in-memory volume
+ configuration */
+ struct lv_conf *new_conf; /* new volume configuration */
+ reiser4_subvol *proxy; /* burst buffers */
+ reiser4_subvol *victim; /* brick to be removed from the volume */
+};
+
+typedef enum {
+ VBF_MIGRATE_ALL = 0x1,
+ VBF_CLR_IMMOBILE = 0x2
+} volume_balancing_flags;
+
+extern reiser4_super_info_data *get_super_private_nocheck(const struct
+ super_block *super);
+
+/* Return reiser4-specific part of super block */
+static inline reiser4_super_info_data *get_super_private(const struct
+ super_block *super)
+{
+ assert("nikita-447", super != NULL);
+
+ return (reiser4_super_info_data *) super->s_fs_info;
+}
+
+static inline reiser4_volume *super_volume(const struct super_block *super)
+{
+ return get_super_private(super)->vol;
+}
+
+static inline volume_plugin *super_vol_plug(const struct super_block *super)
+{
+ return super_volume(super)->vol_plug;
+}
+
+static inline lv_conf *sbinfo_conf(reiser4_super_info_data *info)
+{
+ assert("edward-1719", info != NULL);
+ assert("edward-1720", info->vol != NULL);
+
+ return info->vol->conf;
+}
+
+static inline slot_t *conf_mslots(lv_conf *conf)
+{
+ return conf->mslots;
+}
+
+static inline u32 conf_nr_mslots(lv_conf *conf)
+{
+ return conf->nr_mslots;
+}
+
+static inline mirror_t *conf_mslot_at(lv_conf *conf, u32 id)
+{
+ return conf_mslots(conf)[id];
+}
+
+static inline reiser4_subvol *conf_mirror(lv_conf *conf,
+ u32 slot_idx, u32 mirr_id)
+{
+ assert("edward-2473", conf_mslot_at(conf, slot_idx) != NULL);
+
+ return ((mirror_t *)conf_mslot_at(conf, slot_idx))[mirr_id];
+}
+
+static inline reiser4_subvol *conf_origin(lv_conf *conf, u32 subv_id)
+{
+ return conf_mirror(conf, subv_id, 0);
+}
+
+static inline lv_conf *super_conf(const struct super_block *sb)
+{
+ return sbinfo_conf(get_super_private(sb));
+}
+
+static inline u32 vol_nr_origins(reiser4_volume *vol)
+{
+ return atomic_read(&vol->nr_origins);
+}
+
+static inline u32 sbinfo_nr_origins(reiser4_super_info_data *info)
+{
+ return vol_nr_origins(info->vol);
+}
+
+/**
+ * Return a pointer to a subvolume.
+ * The caller must guarantee that the subvolume remains valid while
+ * working with it.
+ */
+static inline reiser4_subvol *super_mirror(const struct super_block *super,
+ u32 slot_idx, u32 mirror_id)
+{
+ lv_conf *conf;
+ reiser4_subvol *ret;
+ reiser4_volume *vol = super_volume(super);
+
+ rcu_read_lock();
+ conf = rcu_dereference(vol->conf);
+ ret = conf_mirror(conf, slot_idx, mirror_id);
+ rcu_read_unlock();
+
+ return ret;
+}
+
+static inline reiser4_subvol *super_origin(const struct super_block *super,
+ u32 id)
+{
+ return super_mirror(super, id, 0);
+}
+
+static inline u32 super_nr_origins(const struct super_block *super)
+{
+ return sbinfo_nr_origins(get_super_private(super));
+}
+
+/* get ent context for the @super */
+static inline entd_context *get_entd_context(struct super_block *super)
+{
+ return &get_super_private(super)->entd;
+}
+
+/**
+ * Get the super block used during current system call.
+ * Reference to this super block is stored in reiser4_context
+ */
+static inline struct super_block *reiser4_get_current_sb(void)
+{
+ return get_current_context()->super;
+}
+
+/**
+ * Reiser4-specific part of "current" super-block: main super block used
+ * during current system call. Reference to this super block is stored in
+ * reiser4_context
+ */
+static inline reiser4_super_info_data *get_current_super_private(void)
+{
+ return get_super_private(reiser4_get_current_sb());
+}
+
+static inline reiser4_volume *current_volume(void)
+{
+ assert("edward-2158", get_current_super_private() != NULL);
+
+ return get_current_super_private()->vol;
+}
+
+static inline volume_plugin *current_vol_plug(void)
+{
+ return current_volume()->vol_plug;
+}
+
+static inline lv_conf *current_lv_conf(void)
+{
+ return sbinfo_conf(get_current_super_private());
+}
+
+static inline bucket_t *current_buckets(void)
+{
+ return current_volume()->buckets;
+}
+
+static inline struct bucket_ops *current_bucket_ops(void)
+{
+ return &current_volume()->vol_plug->bucket_ops;
+}
+
+static inline struct formatted_ra_params *get_current_super_ra_params(void)
+{
+ return &(get_current_super_private()->ra_params);
+}
+
+static inline struct distribution_plugin *current_dist_plug(void)
+{
+ return get_current_super_private()->vol->dist_plug;
+}
+
+static inline struct reiser4_subvol *current_mirror(u32 slot_idx,
+ u32 mirror_id)
+{
+ return super_mirror(reiser4_get_current_sb(), slot_idx, mirror_id);
+}
+
+static inline struct reiser4_subvol *current_origin(u32 slot_idx)
+{
+ return current_mirror(slot_idx, 0);
+}
+
+static inline u32 current_nr_origins(void)
+{
+ return sbinfo_nr_origins(get_current_super_private());
+}
+
+static inline u32 current_num_replicas(u32 orig_id)
+{
+ assert("edward-1723", current_origin(orig_id) != NULL);
+
+ return current_origin(orig_id)->num_replicas;
+}
+
+static inline u32 subvol_num_mirrors(reiser4_subvol *subv)
+{
+ assert("edward-1724", subv != NULL);
+ return 1 + subv->num_replicas;
+}
+
+static inline u32 current_num_mirrors(u32 orig_id)
+{
+ return 1 + current_num_replicas(orig_id);
+}
+
+#define current_stripe_bits (current_volume()->stripe_bits)
+#define current_stripe_size (1 << current_stripe_bits)
+
+#define for_each_mslot(_conf, _subv_id) \
+ for (_subv_id = 0; \
+ _subv_id < _conf->nr_mslots; \
+ _subv_id ++)
+
+#define for_each_data_mslot(_conf, _subv_id) \
+ for (_subv_id = 1; \
+ _subv_id < _conf->nr_mslots; \
+ _subv_id ++)
+
+#define for_each_mirror(_orig_id, _mirr_id) \
+ for (_mirr_id = 0; \
+ _mirr_id < current_num_mirrors(_orig_id); \
+ _mirr_id ++)
+
+#define for_each_replica(_orig_id, _mirr_id) \
+ for (_mirr_id = 1; \
+ _mirr_id < current_num_mirrors(_orig_id); \
+ _mirr_id ++)
+
+#define __for_each_mirror(_orig, _mirr_id) \
+ for (_mirr_id = 0; \
+ _mirr_id < subvol_num_mirrors(_orig); \
+ _mirr_id ++)
+
+#define __for_each_replica(_orig, _mirr_id) \
+ for (_mirr_id = 1; \
+ _mirr_id < subvol_num_mirrors(_orig); \
+ _mirr_id ++)
+
+#define DEFAULT_WRITE_GRANULARITY 32 /* always a power of 2 */
+
+static inline int is_replica(struct reiser4_subvol *subv)
+{
+ assert("edward-1725", subv != NULL);
+
+ return subv->mirror_id;
+}
+
+static inline int is_origin(struct reiser4_subvol *subv)
+{
+ assert("edward-1726", subv != NULL);
+
+ return !is_replica(subv);
+}
+
+static inline int has_replicas(struct reiser4_subvol *subv)
+{
+ assert("edward-1727", subv != NULL);
+
+ return subv->num_replicas;
+}
+
+/*
+ * true if the file system that @node lives on is read-only
+ */
+static inline int rofs_jnode(jnode *node)
+{
+ return sb_rdonly(jnode_get_super(node));
+}
+
+extern void build_object_ops(struct super_block *super, struct object_ops *ops);
+
+#define REISER4_SUPER_MAGIC 0x52345362 /* (*(__u32 *)"R4Sb"); */
+
+static inline void spin_lock_reiser4_super(reiser4_super_info_data *sbinfo)
+{
+ spin_lock(&(sbinfo->guard));
+}
+
+static inline void spin_unlock_reiser4_super(reiser4_super_info_data *sbinfo)
+{
+ assert_spin_locked(&(sbinfo->guard));
+ spin_unlock(&(sbinfo->guard));
+}
+
+static inline void __init_ch_sub(struct commit_handle_subvol *ch_sub)
+{
+ memset(ch_sub, 0, sizeof(*ch_sub));
+ INIT_LIST_HEAD(&ch_sub->overwrite_set);
+ INIT_LIST_HEAD(&ch_sub->tx_list);
+ INIT_LIST_HEAD(&ch_sub->wander_map);
+}
+
+static inline void __read_lock_tree(reiser4_super_info_data *sbinfo)
+{
+ /* check that tree is not locked */
+ assert("", (LOCK_CNT_NIL(rw_locked_tree) &&
+ LOCK_CNT_NIL(read_locked_tree) &&
+ LOCK_CNT_NIL(write_locked_tree)));
+ /* check that spinlocks of lower priorities are not held */
+ assert("", (LOCK_CNT_NIL(spin_locked_txnh) &&
+ LOCK_CNT_NIL(rw_locked_dk) &&
+ LOCK_CNT_NIL(spin_locked_stack)));
+
+ read_lock(&(sbinfo->tree_lock));
+
+ LOCK_CNT_INC(read_locked_tree);
+ LOCK_CNT_INC(rw_locked_tree);
+ LOCK_CNT_INC(spin_locked);
+}
+
+static inline void __read_unlock_tree(reiser4_super_info_data *sbinfo)
+{
+ assert("nikita-1375", LOCK_CNT_GTZ(read_locked_tree));
+ assert("nikita-1376", LOCK_CNT_GTZ(rw_locked_tree));
+ assert("nikita-1376", LOCK_CNT_GTZ(spin_locked));
+
+ LOCK_CNT_DEC(read_locked_tree);
+ LOCK_CNT_DEC(rw_locked_tree);
+ LOCK_CNT_DEC(spin_locked);
+
+ read_unlock(&(sbinfo->tree_lock));
+}
+
+static inline void __write_lock_tree(reiser4_super_info_data *sbinfo)
+{
+ /* check that tree is not locked */
+ assert("", (LOCK_CNT_NIL(rw_locked_tree) &&
+ LOCK_CNT_NIL(read_locked_tree) &&
+ LOCK_CNT_NIL(write_locked_tree)));
+ /* check that spinlocks of lower priorities are not held */
+ assert("", (LOCK_CNT_NIL(spin_locked_txnh) &&
+ LOCK_CNT_NIL(rw_locked_dk) &&
+ LOCK_CNT_NIL(spin_locked_stack)));
+
+ write_lock(&(sbinfo->tree_lock));
+
+ LOCK_CNT_INC(write_locked_tree);
+ LOCK_CNT_INC(rw_locked_tree);
+ LOCK_CNT_INC(spin_locked);
+}
+
+static inline void __write_unlock_tree(reiser4_super_info_data *sbinfo)
+{
+ assert("nikita-1375", LOCK_CNT_GTZ(write_locked_tree));
+ assert("nikita-1376", LOCK_CNT_GTZ(rw_locked_tree));
+ assert("nikita-1376", LOCK_CNT_GTZ(spin_locked));
+
+ LOCK_CNT_DEC(write_locked_tree);
+ LOCK_CNT_DEC(rw_locked_tree);
+ LOCK_CNT_DEC(spin_locked);
+
+ write_unlock(&(sbinfo->tree_lock));
+}
+
+static inline void read_lock_tree(void)
+{
+ __read_lock_tree(get_current_super_private());
+}
+
+static inline void read_unlock_tree(void)
+{
+ __read_unlock_tree(get_current_super_private());
+}
+static inline void write_lock_tree(void)
+{
+ __write_lock_tree(get_current_super_private());
+}
+
+static inline void write_unlock_tree(void)
+{
+ __write_unlock_tree(get_current_super_private());
+}
+
+/* set/clear/test per-volume flags */
+
+static inline int reiser4_is_set(const struct super_block *super,
+ reiser4_fs_flag f)
+{
+ return test_bit((int)f, &get_super_private(super)->fs_flags);
+}
+
+static inline int reiser4_volume_is_unbalanced(const struct super_block *sb)
+{
+ assert("edward-1945", sb != NULL);
+ return reiser4_is_set(sb, REISER4_UNBALANCED_VOL);
+}
+
+static inline void reiser4_volume_set_unbalanced(struct super_block *sb)
+{
+ assert("edward-1946", sb != NULL);
+ set_bit(REISER4_UNBALANCED_VOL, &get_super_private(sb)->fs_flags);
+}
+
+static inline void reiser4_volume_clear_unbalanced(struct super_block *sb)
+{
+ assert("edward-1948", sb != NULL);
+ clear_bit(REISER4_UNBALANCED_VOL, &get_super_private(sb)->fs_flags);
+}
+
+static inline int reiser4_volume_has_incomplete_removal(const struct super_block *sb)
+{
+ assert("edward-2247", sb != NULL);
+ return reiser4_is_set(sb, REISER4_INCOMPLETE_BRICK_REMOVAL);
+}
+
+static inline void reiser4_volume_set_incomplete_removal(struct super_block *sb)
+{
+ assert("edward-2248", sb != NULL);
+ set_bit(REISER4_INCOMPLETE_BRICK_REMOVAL, &get_super_private(sb)->fs_flags);
+}
+
+static inline void reiser4_volume_clear_incomplete_removal(struct super_block *sb)
+{
+ assert("edward-2249", sb != NULL);
+ clear_bit(REISER4_INCOMPLETE_BRICK_REMOVAL, &get_super_private(sb)->fs_flags);
+}
+
+static inline void reiser4_volume_set_activated(struct super_block *sb)
+{
+ assert("edward-2084", sb != NULL);
+ set_bit(REISER4_ACTIVATED_VOL, &get_super_private(sb)->fs_flags);
+}
+
+static inline int reiser4_volume_is_activated(struct super_block *sb)
+{
+ assert("edward-2085", sb != NULL);
+ return reiser4_is_set(sb, REISER4_ACTIVATED_VOL);
+}
+
+static inline void reiser4_volume_set_proxy_enabled(struct super_block *sb)
+{
+ assert("edward-2439", sb != NULL);
+ set_bit(REISER4_PROXY_ENABLED, &get_super_private(sb)->fs_flags);
+}
+
+static inline void reiser4_volume_clear_proxy_enabled(struct super_block *sb)
+{
+ assert("edward-2440", sb != NULL);
+ clear_bit(REISER4_PROXY_ENABLED, &get_super_private(sb)->fs_flags);
+}
+
+static inline void reiser4_volume_set_proxy_io(struct super_block *sb)
+{
+ assert("edward-2450", sb != NULL);
+ set_bit(REISER4_PROXY_IO, &get_super_private(sb)->fs_flags);
+}
+
+static inline void reiser4_volume_clear_proxy_io(struct super_block *sb)
+{
+ assert("edward-2451", sb != NULL);
+ clear_bit(REISER4_PROXY_IO, &get_super_private(sb)->fs_flags);
+}
+
+/* operations on subvolume */
+extern u64 get_meta_subvol_id(void);
+extern reiser4_subvol *get_meta_subvol(void);
+static inline reiser4_tree *meta_subvol_tree(void)
+{
+ return &get_meta_subvol()->tree;
+}
+
+extern reiser4_subvol *super_meta_subvol(struct super_block *super);
+
+#define find_data_subvol(coord) \
+current_origin(current_vol_plug()->find_brick(coord))
+
+static inline reiser4_subvol *calc_data_subvol(const struct inode *inode,
+ loff_t offset)
+{
+ reiser4_subvol *ret;
+ lv_conf *conf;
+ reiser4_volume *vol = current_volume();
+
+ rcu_read_lock();
+ conf = rcu_dereference(vol->conf);
+ ret = conf_origin(conf, vol->vol_plug->calc_brick(conf, inode, offset));
+ rcu_read_unlock();
+ return ret;
+}
+
+struct file_system_type *get_reiser4_fs_type(void);
+extern long reiser4_statfs_type(const struct super_block *super);
+
+extern __u64 reiser4_subvol_flush_reserved(const reiser4_subvol *);
+extern __u64 reiser4_subvol_block_count(const reiser4_subvol *);
+extern void reiser4_subvol_set_block_count(reiser4_subvol *subv, __u64 nr);
+extern __u64 reiser4_subvol_blocks_reserved(const reiser4_subvol *subv);
+
+extern __u64 reiser4_subvol_used_blocks(const reiser4_subvol *);
+extern void reiser4_subvol_set_used_blocks(reiser4_subvol *, __u64 nr);
+
+extern __u64 reiser4_subvol_min_blocks_used(const reiser4_subvol *);
+extern void reiser4_subvol_set_min_blocks_used(reiser4_subvol *, __u64 nr);
+
+extern __u64 reiser4_subvol_free_blocks(const reiser4_subvol *);
+extern void reiser4_subvol_set_free_blocks(reiser4_subvol *, __u64 nr);
+
+extern __u64 reiser4_subvol_data_capacity(reiser4_subvol *);
+extern void reiser4_subvol_set_data_capacity(reiser4_subvol *, __u64 len);
+
+extern __u64 reiser4_subvol_free_committed_blocks(const reiser4_subvol *);
+extern __u64 reiser4_subvol_grabbed_blocks(const reiser4_subvol *);
+extern __u64 reiser4_subvol_fake_allocated_fmt(const reiser4_subvol *);
+extern __u64 reiser4_subvol_fake_allocated_unf(const reiser4_subvol *);
+extern __u64 reiser4_subvol_clustered_blocks(const reiser4_subvol *);
+extern long reiser4_subvol_reserved4user(const reiser4_subvol *,
+ uid_t uid, gid_t gid);
+extern int reiser4_subvol_blocknr_is_sane(const reiser4_subvol *subv,
+ const reiser4_block_nr *blk);
+/* operations on volume */
+extern long reiser4_ioctl_volume(struct file *file,
+ unsigned int cmd, unsigned long arg,
+ int (*volume_op)(struct file *file,
+ struct reiser4_vol_op_args *args));
+extern int reiser4_volume_op_file(struct file *, struct reiser4_vol_op_args *);
+extern int reiser4_volume_op_dir(struct file *, struct reiser4_vol_op_args *);
+extern int reiser4_volume_header(struct reiser4_vol_op_args *);
+extern int reiser4_brick_header(struct reiser4_vol_op_args *);
+extern __u64 reiser4_volume_block_count(const struct super_block *);
+extern __u64 reiser4_volume_blocks_reserved(const struct super_block *super);
+extern __u64 reiser4_volume_free_blocks(const struct super_block *super);
+extern __u64 reiser4_volume_fake_allocated(const struct super_block *sb);
+extern long reiser4_volume_reserved4user(const struct super_block *,
+ uid_t uid, gid_t gid);
+extern __u32 reiser4_mkfs_id(const struct super_block *super, __u32 subv_id);
+extern reiser4_space_allocator * reiser4_get_space_allocator(reiser4_subvol *);
+extern reiser4_oid_allocator *
+reiser4_get_oid_allocator(const struct super_block *super);
+extern struct inode *reiser4_get_super_fake(const struct super_block *super);
+extern struct inode *reiser4_get_cc_fake(const struct super_block *super);
+extern struct inode *reiser4_get_bitmap_fake(const struct super_block *super);
+extern int is_reiser4_super(const struct super_block *super);
+extern int reiser4_done_super(struct super_block *s);
+extern int reiser4_scan_device(const char *path, fmode_t flags, void *holder,
+ reiser4_subvol **result, reiser4_volume **host);
+
+/* step of fill super */
+extern int reiser4_offline_op(struct reiser4_vol_op_args *);
+extern int reiser4_init_fs_info(struct super_block *);
+extern void reiser4_done_fs_info(struct super_block *);
+extern int reiser4_init_super_data(struct super_block *, char *opt_string);
+int reiser4_activate_subvol(struct super_block *super, reiser4_subvol *subv);
+void reiser4_deactivate_subvol(struct super_block *super, reiser4_subvol *subv);
+extern int reiser4_activate_volume(struct super_block *, u8 *vol_uuid);
+extern void reiser4_deactivate_volume(struct super_block *);
+extern void reiser4_unregister_subvol(struct reiser4_subvol *subv);
+extern int reiser4_unregister_brick(struct reiser4_vol_op_args *args);
+extern void reiser4_unregister_volumes(void);
+extern struct reiser4_volume *reiser4_search_volume(u8 *vol_uuid);
+extern int reiser4_read_master(struct super_block *, int silent, u8 *vol_uuid);
+extern int reiser4_init_root_inode(struct super_block *);
+extern reiser4_plugin *get_default_plugin(pset_member memb);
+
+#define INVALID_OID ((oid_t)0)
+/* Maximal possible object id. */
+#define ABSOLUTE_MAX_OID ((oid_t)~0)
+
+#define OIDS_RESERVED (1 << 16)
+int oid_init_allocator(struct super_block *, oid_t nr_files, oid_t next);
+oid_t oid_allocate(struct super_block *);
+int oid_release(struct super_block *, oid_t);
+oid_t oid_next(const struct super_block *);
+void oid_count_allocated(void);
+void oid_count_released(void);
+long oids_used(const struct super_block *);
+
+#if REISER4_DEBUG
+void print_fs_info(const char *prefix, const struct super_block *);
+#endif
+
+extern void destroy_reiser4_cache(struct kmem_cache **);
+
+extern struct super_operations reiser4_super_operations;
+extern struct export_operations reiser4_export_operations;
+extern struct dentry_operations reiser4_dentry_operations;
+
+/* __REISER4_SUPER_H__ */
+#endif
+
+/*
+ * Local variables:
+ * c-indentation-style: "K&R"
+ * mode-name: "LC"
+ * c-basic-offset: 8
+ * tab-width: 8
+ * fill-column: 120
+ * End:
+ */
diff -urN --no-dereference linux-5.10.2.orig/fs/reiser4/super_ops.c linux-5.10.2/fs/reiser4/super_ops.c
--- linux-5.10.2.orig/fs/reiser4/super_ops.c 1970-01-01 01:00:00.000000000 +0100
+++ linux-5.10.2/fs/reiser4/super_ops.c 2020-12-23 16:08:55.164816614 +0100
@@ -0,0 +1,921 @@
+/* Copyright 2005 by Hans Reiser, licensing governed by
+ * reiser4/README */
+
+#include "inode.h"
+#include "page_cache.h"
+#include "ktxnmgrd.h"
+#include "flush.h"
+#include "safe_link.h"
+#include "checksum.h"
+#include "plugin/volume/volume.h"
+
+#include <linux/vfs.h>
+#include <linux/writeback.h>
+#include <linux/mount.h>
+#include <linux/seq_file.h>
+#include <linux/debugfs.h>
+#include <linux/backing-dev.h>
+#include <linux/module.h>
+#include <linux/miscdevice.h>
+
+/* slab cache for inodes */
+static struct kmem_cache *inode_cache;
+
+static struct dentry *reiser4_debugfs_root = NULL;
+
+/**
+ * init_once - constructor for reiser4 inodes
+ * @cache: cache @obj belongs to
+ * @obj: inode to be initialized
+ *
+ * Initialization function to be called when a new page is allocated by the
+ * reiser4 inode cache. It is set on inode cache creation.
+ */
+static void init_once(void *obj)
+{
+ struct reiser4_inode_object *info;
+
+ info = obj;
+
+ /* initialize vfs inode */
+ inode_init_once(&info->vfs_inode);
+
+ /*
+	 * initialize reiser4 specific part of inode.
+ * NOTE-NIKITA add here initializations for locks, list heads,
+ * etc. that will be added to our private inode part.
+ */
+ INIT_LIST_HEAD(get_readdir_list(&info->vfs_inode));
+ init_rwsem(&info->p.conv_sem);
+ /* init semaphore which is used during inode loading */
+ loading_init_once(&info->p);
+ INIT_RADIX_TREE(jnode_tree_by_reiser4_inode(&info->p),
+ GFP_ATOMIC);
+#if REISER4_DEBUG
+ info->p.nr_jnodes = 0;
+#endif
+}
+
+/**
+ * init_inodes - create inode cache
+ *
+ * Initializes the slab cache of inodes. It is part of reiser4 module
+ * initialization.
+ */
+static int init_inodes(void)
+{
+ inode_cache = kmem_cache_create("reiser4_inode",
+ sizeof(struct reiser4_inode_object),
+ 0,
+ SLAB_HWCACHE_ALIGN |
+ SLAB_RECLAIM_ACCOUNT, init_once);
+ if (inode_cache == NULL)
+ return RETERR(-ENOMEM);
+ return 0;
+}
+
+/**
+ * done_inodes - delete inode cache
+ *
+ * This is called on reiser4 module unloading or system shutdown.
+ */
+static void done_inodes(void)
+{
+ destroy_reiser4_cache(&inode_cache);
+}
+
+/**
+ * reiser4_alloc_inode - alloc_inode of super operations
+ * @super: super block new inode is allocated for
+ *
+ * Allocates new inode, initializes reiser4 specific part of it.
+ */
+static struct inode *reiser4_alloc_inode(struct super_block *super)
+{
+ struct reiser4_inode_object *obj;
+
+ assert("nikita-1696", super != NULL);
+ obj = kmem_cache_alloc(inode_cache, reiser4_ctx_gfp_mask_get());
+ if (obj != NULL) {
+ reiser4_inode *info;
+
+ info = &obj->p;
+
+ info->pset = plugin_set_get_empty();
+ info->hset = plugin_set_get_empty();
+ info->extmask = 0;
+ info->locality_id = 0ull;
+ info->plugin_mask = 0;
+ info->heir_mask = 0;
+#if !REISER4_INO_IS_OID
+ info->oid_hi = 0;
+#endif
+ reiser4_seal_init(&info->sd_seal, NULL, NULL);
+ coord_init_invalid(&info->sd_coord, NULL);
+ info->flags = 0;
+ spin_lock_init(&info->guard);
+ /* this deals with info's loading semaphore */
+ loading_alloc(info);
+ info->vroot = UBER_TREE_ADDR;
+ return &obj->vfs_inode;
+ } else
+ return NULL;
+}
+
+/**
+ * reiser4_destroy_inode - destroy_inode of super operations
+ * @inode: inode being destroyed
+ *
+ * Puts reiser4 specific portion of inode, frees memory occupied by inode.
+ */
+static void reiser4_destroy_inode(struct inode *inode)
+{
+ reiser4_inode *info;
+
+ info = reiser4_inode_data(inode);
+
+ assert("vs-1220", inode_has_no_jnodes(info));
+
+ if (!is_bad_inode(inode) && is_inode_loaded(inode)) {
+ file_plugin *fplug = inode_file_plugin(inode);
+ if (fplug->destroy_inode != NULL)
+ fplug->destroy_inode(inode);
+ }
+ reiser4_dispose_cursors(inode);
+ if (info->pset)
+ plugin_set_put(info->pset);
+ if (info->hset)
+ plugin_set_put(info->hset);
+
+ /*
+	 * cannot add a similar assertion about ->i_list, as prune_icache
+	 * returns inodes into the slab with dangling ->list.{next,prev}.
+	 * This is safe, because they are re-initialized in new_inode().
+ */
+ assert("nikita-2895", hlist_empty(&inode->i_dentry));
+ assert("nikita-2896", hlist_unhashed(&inode->i_hash));
+ assert("nikita-2898", list_empty_careful(get_readdir_list(inode)));
+
+ /* this deals with info's loading semaphore */
+ loading_destroy(info);
+
+ kmem_cache_free(inode_cache,
+ container_of(info, struct reiser4_inode_object, p));
+}
+
+/**
+ * reiser4_dirty_inode - dirty_inode of super operations
+ * @inode: inode being dirtied
+ *
+ * Updates stat data.
+ */
+static void reiser4_dirty_inode(struct inode *inode, int flags)
+{
+ int result;
+ reiser4_context *ctx;
+
+ if (!is_in_reiser4_context())
+ return;
+ ctx = get_current_context();
+
+ if (ctx->ro) {
+ warning("edward-2200",
+ "failed to make inode %llu dirty (read-only FS)",
+ (unsigned long long)get_inode_oid(inode));
+ return;
+ }
+ assert("edward-1606", !IS_RDONLY(inode));
+ assert("edward-1607",
+ (inode_file_plugin(inode)->estimate.update(inode) <=
+ ctx_subvol_grabbed(ctx, get_meta_subvol()->id)));
+
+ if (ctx->locked_page)
+ unlock_page(ctx->locked_page);
+
+ result = reiser4_update_sd(inode);
+
+ if (ctx->locked_page)
+ lock_page(ctx->locked_page);
+ if (result)
+ warning("edward-1605", "failed to dirty inode for %llu: %d",
+ get_inode_oid(inode), result);
+}
+
+/**
+ * ->evict_inode() of super operations
+ * @inode: inode to delete
+ *
+ * Calls file plugin's delete_object method to delete object items from
+ * filesystem tree and calls clear_inode().
+ */
+static void reiser4_evict_inode(struct inode *inode)
+{
+ reiser4_context *ctx;
+ file_plugin *fplug;
+
+ ctx = reiser4_init_context(inode->i_sb);
+ if (IS_ERR(ctx)) {
+ warning("vs-15", "failed to init context");
+ return;
+ }
+
+ if (inode->i_nlink == 0 && is_inode_loaded(inode)) {
+ fplug = inode_file_plugin(inode);
+ if (fplug != NULL && fplug->delete_object != NULL)
+ fplug->delete_object(inode);
+ }
+
+ truncate_inode_pages_final(&inode->i_data);
+ inode->i_blocks = 0;
+ clear_inode(inode);
+ reiser4_exit_context(ctx);
+}
+
+/**
+ * reiser4_put_super - put_super of super operations
+ * @super: super block to free
+ *
+ * Stops daemons and releases resources; in short, it umounts.
+ */
+static void reiser4_put_super(struct super_block *super)
+{
+ reiser4_super_info_data *sbinfo;
+ reiser4_context *ctx;
+
+ sbinfo = get_super_private(super);
+ assert("vs-1699", sbinfo);
+
+ debugfs_remove(sbinfo->tmgr.debugfs_atom_count);
+ debugfs_remove(sbinfo->tmgr.debugfs_id_count);
+ debugfs_remove(sbinfo->debugfs_root);
+
+ ctx = reiser4_init_context(super);
+ if (IS_ERR(ctx)) {
+ warning("vs-17", "failed to init context");
+ return;
+ }
+ /*
+ * release disk format related resources
+ */
+ reiser4_deactivate_volume(super);
+ reiser4_jnodes_done();
+ reiser4_done_formatted_fake(super);
+ reiser4_done_csum_tfm(sbinfo->csum_tfm);
+
+ /* stop daemons: ktxnmgr and entd */
+ reiser4_done_entd(super);
+ reiser4_done_ktxnmgrd(super);
+ reiser4_done_txnmgr(&sbinfo->tmgr);
+
+ assert("edward-1890", list_empty(&get_super_private(super)->all_jnodes));
+ assert("edward-1891", get_current_context()->trans->atom == NULL);
+
+ reiser4_exit_context(ctx);
+ reiser4_done_fs_info(super);
+}
+
+/**
+ * reiser4_statfs - statfs of super operations
+ * @dentry: dentry of the file system being queried
+ * @statfs: buffer to fill with statistics
+ *
+ * Returns information about the filesystem.
+ */
+static int reiser4_statfs(struct dentry *dentry, struct kstatfs *statfs)
+{
+ sector_t total;
+ sector_t reserved;
+ sector_t free;
+ sector_t forroot;
+ sector_t deleted;
+ reiser4_context *ctx;
+ struct super_block *super = dentry->d_sb;
+
+ assert("nikita-408", super != NULL);
+ assert("nikita-409", statfs != NULL);
+
+ ctx = reiser4_init_context(super);
+ if (IS_ERR(ctx))
+ return PTR_ERR(ctx);
+
+ statfs->f_type = reiser4_statfs_type(super);
+ statfs->f_bsize = super->s_blocksize;
+
+ /*
+ * 5% of total block space is reserved. This is needed for flush and
+ * for truncates (so that we are able to perform truncate/unlink even
+ * on the otherwise completely full file system). If this reservation
+ * is hidden from statfs(2), users will mistakenly guess that they
+ * have enough free space to complete some operation, which is
+ * frustrating.
+ *
+ * Another possible solution is to subtract ->blocks_reserved from
+ * ->f_bfree, but changing available space seems less intrusive than
+	 * letting the user see 5% of disk space being used directly after
+ * mkfs.
+ */
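+	/*
+	 * For illustration only (the numbers are made up): with total = 1000,
+	 * reserved = 50 (5%), deleted = 10, free blocks on disk = 200 and
+	 * forroot = 20, the values reported below are:
+	 *
+	 *	f_blocks = 1000 - 50       = 950
+	 *	f_bfree  = (200 + 10) - 50 = 160
+	 *	f_bavail = 160 - 20        = 140
+	 */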
+ total = reiser4_volume_block_count(super);
+ reserved = reiser4_volume_blocks_reserved(super);
+ deleted = txnmgr_count_deleted_blocks();
+ free = reiser4_volume_free_blocks(super) + deleted;
+ forroot = reiser4_volume_reserved4user(super, 0, 0);
+
+ /*
+ * These counters may be in inconsistent state because we take the
+ * values without keeping any global spinlock. Here we do a sanity
+ * check that free block counter does not exceed the number of all
+ * blocks.
+ */
+ if (free > total)
+ free = total;
+ statfs->f_blocks = total - reserved;
+ /* make sure statfs->f_bfree is never larger than statfs->f_blocks */
+ if (free > reserved)
+ free -= reserved;
+ else
+ free = 0;
+ statfs->f_bfree = free;
+
+ if (free > forroot)
+ free -= forroot;
+ else
+ free = 0;
+ statfs->f_bavail = free;
+
+ statfs->f_files = 0;
+ statfs->f_ffree = 0;
+ /*
+ * maximal acceptable name length depends on directory plugin
+ */
+ assert("nikita-3351", super->s_root->d_inode != NULL);
+ statfs->f_namelen = reiser4_max_filename_len(super->s_root->d_inode);
+ reiser4_exit_context(ctx);
+ return 0;
+}
+
+/**
+ * reiser4_writeback_inodes - writeback_inodes of super operations
+ * @super: super block to write back dirty inodes of
+ * @wb: bdi writeback structure
+ * @wbc: writeback control
+ *
+ * This method is called by background and non-background writeback.
+ * Reiser4's implementation uses generic_writeback_sb_inodes to call
+ * reiser4_writepages_dispatch for each of the dirty inodes.
+ * reiser4_writepages_dispatch handles pages dirtied via shared
+ * mapping - dirty pages get into atoms. Writeout is then called to flush
+ * some atoms.
+ */
+static long reiser4_writeback_inodes(struct super_block *super,
+ struct bdi_writeback *wb,
+ struct writeback_control *wbc,
+ struct wb_writeback_work *work,
+ bool flush_all)
+{
+ long result;
+ reiser4_context *ctx;
+
+ if (wbc->for_kupdate)
+ /* reiser4 has its own means of periodical write-out */
+ goto skip;
+
+ spin_unlock(&wb->list_lock);
+ ctx = reiser4_init_context(super);
+ if (IS_ERR(ctx)) {
+ warning("vs-13", "failed to init context");
+ spin_lock(&wb->list_lock);
+ goto skip;
+ }
+ ctx->flush_bd_task = 1;
+ /*
+	 * call reiser4_writepages for each of the dirty inodes to turn
+	 * dirty pages into transactions if they are not already.
+ */
+ spin_lock(&wb->list_lock);
+ result = generic_writeback_sb_inodes(super, wb, wbc, work, flush_all);
+ spin_unlock(&wb->list_lock);
+
+ if (result <= 0)
+ goto exit;
+ wbc->nr_to_write = result;
+
+ /* flush goes here */
+ reiser4_writeout(super, wbc);
+ exit:
+ /* avoid recursive calls to ->writeback_inodes */
+ context_set_commit_async(ctx);
+ reiser4_exit_context(ctx);
+ spin_lock(&wb->list_lock);
+
+ return result;
+ skip:
+ writeback_skip_sb_inodes(super, wb);
+ return 0;
+}
+
+/**
+ * ->sync_fs() of super operations
+ */
+static int reiser4_sync_fs(struct super_block *super, int wait)
+{
+ reiser4_context *ctx;
+ struct bdi_writeback *wb;
+ struct wb_writeback_work work = {
+ .sb = super,
+ .sync_mode = WB_SYNC_ALL,
+ .range_cyclic = 0,
+ .nr_pages = LONG_MAX,
+ .reason = WB_REASON_SYNC,
+ .for_sync = 1,
+ };
+ struct writeback_control wbc = {
+ .sync_mode = work.sync_mode,
+ .range_cyclic = work.range_cyclic,
+ .range_start = 0,
+ .range_end = LLONG_MAX,
+ };
+ ctx = reiser4_init_context(super);
+ if (IS_ERR(ctx)) {
+ warning("edward-1567", "failed to init context");
+ return PTR_ERR(ctx);
+ }
+ /*
+ * We don't capture superblock here.
+	 * The superblock is captured only by operations which change
+	 * its fields other than free_blocks, nr_files, next_oid.
+ * After system crash the mentioned fields are recovered from
+ * journal records, see reiser4_journal_recover_sb_data().
+ * Also superblock is captured at final commit when releasing
+ * disk format.
+ */
+ wb = &inode_to_bdi(reiser4_get_super_fake(super))->wb;
+ spin_lock(&wb->list_lock);
+ generic_writeback_sb_inodes(super, wb, &wbc, &work, true);
+ spin_unlock(&wb->list_lock);
+ wbc.nr_to_write = LONG_MAX;
+ /*
+ * (flush goes here)
+ * commit all transactions
+ */
+ reiser4_writeout(super, &wbc);
+
+ reiser4_exit_context(ctx);
+ return 0;
+}
+
+static int reiser4_remount(struct super_block *s, int *mount_flags, char *arg)
+{
+ sync_filesystem(s);
+ return 0;
+}
+
+/**
+ * reiser4_show_options - show_options of super operations
+ * @m: seq_file to write the information to
+ * @dentry: dentry of the mounted file system
+ *
+ * Makes reiser4 mount options visible in /proc/mounts.
+ */
+static int reiser4_show_options(struct seq_file *m, struct dentry *dentry)
+{
+ struct super_block *super;
+ reiser4_super_info_data *sbinfo;
+
+ super = dentry->d_sb;
+ sbinfo = get_super_private(super);
+
+ seq_printf(m, ",atom_max_size=0x%x", sbinfo->tmgr.atom_max_size);
+ seq_printf(m, ",atom_max_age=0x%x", sbinfo->tmgr.atom_max_age);
+ seq_printf(m, ",atom_min_size=0x%x", sbinfo->tmgr.atom_min_size);
+ seq_printf(m, ",atom_max_flushers=0x%x",
+ sbinfo->tmgr.atom_max_flushers);
+ seq_printf(m, ",cbk_cache_slots=0x%x",
+ super_meta_subvol(super)->tree.cbk_cache.nr_slots);
+ return 0;
+}
+
+struct super_operations reiser4_super_operations = {
+ .alloc_inode = reiser4_alloc_inode,
+ .destroy_inode = reiser4_destroy_inode,
+ .dirty_inode = reiser4_dirty_inode,
+ .evict_inode = reiser4_evict_inode,
+ .put_super = reiser4_put_super,
+ .sync_fs = reiser4_sync_fs,
+ .statfs = reiser4_statfs,
+ .remount_fs = reiser4_remount,
+ .writeback_inodes = reiser4_writeback_inodes,
+ .show_options = reiser4_show_options
+};
+
+/**
+ * fill_super - initialize super block on mount.
+ * All subvolumes of the volume should already be registered in the system.
+ *
+ * @super: super block to fill
+ * @data: reiser4-specific mount options
+ * @silent: if set, do not print error messages
+ */
+static int fill_super(struct super_block *super, void *data, int silent)
+{
+ u32 subv_id;
+ reiser4_context ctx;
+ int result;
+ reiser4_super_info_data *sbinfo;
+ lv_conf *conf;
+ u8 vol_uuid[16];
+
+ assert("zam-989", super != NULL);
+
+ super->s_op = NULL;
+ /*
+ * context initialization will be completed after init_volume(),
+ * as we don't know number of subvolumes yet.
+ */
+ init_stack_context(&ctx, super);
+ /*
+ * allocate reiser4 private super info
+ */
+ if ((result = reiser4_init_fs_info(super)) != 0)
+ goto failed_init_sinfo;
+
+ sbinfo = get_super_private(super);
+
+ if ((result = reiser4_init_csum_tfm(&sbinfo->csum_tfm)) != 0)
+ goto failed_init_csum_tfm;
+
+ /* initialize various reiser4 parameters, parse mount options */
+ if ((result = reiser4_init_super_data(super, data)) != 0)
+ goto failed_init_super_data;
+
+ /* set filesystem blocksize */
+ if ((result = reiser4_read_master(super, silent, vol_uuid)) != 0)
+ goto failed_read_master;
+
+ /* initialize transaction manager */
+ reiser4_init_txnmgr(&sbinfo->tmgr);
+
+	/* initialize ktxnmgrd context and start kernel thread ktxnmgrd */
+ if ((result = reiser4_init_ktxnmgrd(super)) != 0)
+ goto failed_init_ktxnmgrd;
+
+ /* initialize entd context and start kernel thread entd */
+ if ((result = reiser4_init_entd(super)) != 0)
+ goto failed_init_entd;
+
+ /* initialize address spaces for formatted nodes and bitmaps */
+ if ((result = reiser4_init_formatted_fake(super)) != 0)
+ goto failed_init_formatted_fake;
+
+ /* initialize jnode hash table */
+ if ((result = reiser4_jnodes_init()) != 0)
+ goto failed_jnodes_init;
+
+ /* initialize disk formats of all subvolumes */
+ if ((result = reiser4_activate_volume(super, vol_uuid)) != 0)
+ goto failed_activate_volume;
+
+ sbinfo->default_uid = 0;
+ sbinfo->default_gid = 0;
+ sbinfo->nr_files_committed = oids_used(super);
+
+ /* calculate total number of blocks in the logical volume */
+ conf = sbinfo_conf(sbinfo);
+ for_each_mslot(conf, subv_id) {
+ if (!conf_mslot_at(conf, subv_id))
+ continue;
+ sbinfo->vol_block_count +=
+ conf_origin(conf, subv_id)->block_count;
+ }
+ /* get inode of root directory */
+ if ((result = reiser4_init_root_inode(super)) != 0)
+ goto failed_init_root_inode;
+
+ /* finish unfinished truncates */
+	if ((result = process_safelinks(super, get_meta_subvol())) != 0)
+ goto failed_process_safelinks;
+
+ reiser4_exit_context(&ctx);
+
+ sbinfo->debugfs_root = debugfs_create_dir(super->s_id,
+ reiser4_debugfs_root);
+ if (sbinfo->debugfs_root) {
+ debugfs_create_u32("atom_count", S_IFREG|S_IRUSR,
+ sbinfo->debugfs_root,
+ &sbinfo->tmgr.atom_count);
+ debugfs_create_u32("id_count", S_IFREG|S_IRUSR,
+ sbinfo->debugfs_root,
+ &sbinfo->tmgr.id_count);
+ }
+ return 0;
+ failed_process_safelinks:
+ dput(super->s_root);
+ failed_init_root_inode:
+ reiser4_deactivate_volume(super);
+ failed_activate_volume:
+ reiser4_jnodes_done();
+ failed_jnodes_init:
+ reiser4_done_formatted_fake(super);
+ failed_init_formatted_fake:
+ reiser4_done_entd(super);
+ failed_init_entd:
+ reiser4_done_ktxnmgrd(super);
+ failed_init_ktxnmgrd:
+ reiser4_done_txnmgr(&sbinfo->tmgr);
+ failed_read_master:
+ failed_init_super_data:
+ failed_init_csum_tfm:
+ reiser4_done_fs_info(super);
+ failed_init_sinfo:
+ reiser4_exit_context(&ctx);
+ return result;
+}
+
+/**
+ * reiser4_mount - mount of file_system_type operations
+ * @fs_type:
+ * @flags: mount flags
+ * @dev_name: block device file name
+ * @data: specific mount options
+ *
+ * Reiser4 mount entry.
+ */
+static struct dentry *reiser4_mount(struct file_system_type *fs_type, int flags,
+ const char *dev_name, void *data)
+{
+ int ret;
+ reiser4_volume *host = NULL;
+ reiser4_subvol *subv = NULL;
+ /*
+	 * the volume could have been created by an old version of
+	 * reiser4progs, so try to register it here.
+ */
+ ret = reiser4_scan_device(dev_name, flags, fs_type, &subv, &host);
+ if (ret)
+ return ERR_PTR(ret);
+
+ assert("edward-1966", host != NULL);
+ assert("edward-1967", subv != NULL);
+
+ if (!is_meta_brick_id(subv->id)) {
+ /*
+ * Scan all registered bricks to find meta-data brick
+ */
+ subv = find_meta_brick_by_id(host);
+ if (subv == NULL) {
+ warning("edward-1968",
+ "%s: meta-data brick is not registered.",
+ dev_name);
+ return ERR_PTR(-EINVAL);
+ }
+ dev_name = subv->name;
+ }
+ return mount_bdev(fs_type, flags, dev_name, data, fill_super);
+}
+
+/* structure describing the reiser4 filesystem implementation */
+static struct file_system_type reiser4_fs_type = {
+ .owner = THIS_MODULE,
+ .name = "reiser4",
+ .fs_flags = FS_REQUIRES_DEV,
+ .mount = reiser4_mount,
+ .kill_sb = kill_block_super,
+ .next = NULL
+};
+
+void destroy_reiser4_cache(struct kmem_cache **cachep)
+{
+ BUG_ON(*cachep == NULL);
+ kmem_cache_destroy(*cachep);
+ *cachep = NULL;
+}
+
+struct file_system_type *get_reiser4_fs_type(void)
+{
+ return &reiser4_fs_type;
+}
+
+/**
+ * Used by volume.reiser4 to scan devices when no FS is mounted
+ */
+static long reiser4_control_ioctl(struct file *file, unsigned int cmd,
+ unsigned long arg)
+{
+ int ret;
+ struct reiser4_vol_op_args *op_args;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ switch (cmd) {
+ case REISER4_IOC_SCAN_DEV:
+ op_args = memdup_user((void __user *)arg, sizeof(*op_args));
+ if (IS_ERR(op_args))
+ return PTR_ERR(op_args);
+
+ ret = reiser4_offline_op(op_args);
+ if (ret) {
+ ON_DEBUG(warning("edward-2315",
+ "off-line volume operation failed (%d)", ret));
+ kfree(op_args);
+ break;
+ }
+ if (copy_to_user((struct reiser4_vol_op_args __user *)arg,
+ op_args, sizeof(*op_args)))
+ ret = RETERR(-EFAULT);
+ kfree(op_args);
+ break;
+ default:
+ ret = -ENOTTY;
+ break;
+ }
+ return ret;
+}
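+
+/*
+ * Hypothetical userspace sketch (normally volume.reiser4 issues this):
+ * run an off-line volume operation through the control device. The
+ * operation-specific fields of reiser4_vol_op_args are left out here.
+ *
+ *	struct reiser4_vol_op_args args;
+ *	int fd = open("/dev/reiser4-control", O_RDWR);
+ *
+ *	memset(&args, 0, sizeof(args));
+ *	// ... fill in the operation-specific fields of @args ...
+ *	if (ioctl(fd, REISER4_IOC_SCAN_DEV, &args) != 0)
+ *		perror("REISER4_IOC_SCAN_DEV");
+ *	close(fd);
+ */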
+
+static const struct file_operations reiser4_ctl_fops = {
+ .unlocked_ioctl = reiser4_control_ioctl,
+ .compat_ioctl = reiser4_control_ioctl,
+ .owner = THIS_MODULE,
+ .llseek = noop_llseek,
+};
+
+static struct miscdevice reiser4_misc = {
+ .minor = REISER4_MINOR,
+ .name = "reiser4-control",
+ .fops = &reiser4_ctl_fops
+};
+
+MODULE_ALIAS_MISCDEV(REISER4_MINOR);
+MODULE_ALIAS("devname:reiser4-control");
+
+static int reiser4_interface_init(void)
+{
+ return misc_register(&reiser4_misc);
+}
+
+static void reiser4_interface_exit(void)
+{
+ misc_deregister(&reiser4_misc);
+}
+
+/**
+ * init_reiser4 - reiser4 initialization entry point
+ *
+ * Initializes reiser4 slabs, registers reiser4 filesystem type. It is called
+ * on kernel initialization or during reiser4 module load.
+ */
+static int __init init_reiser4(void)
+{
+ int result;
+
+ printk(KERN_INFO
+ "Loading Reiser4 (Software Framework Release: %d.%d.%d). "
+ "See reiser4.wiki.kernel.org for a description of Reiser4.\n",
+ get_release_number_principal(),
+ get_release_number_major(),
+ get_release_number_minor());
+
+ /* initialize slab cache of inodes */
+ if ((result = init_inodes()) != 0)
+ goto failed_inode_cache;
+
+ /* initialize cache of znodes */
+ if ((result = init_znodes()) != 0)
+ goto failed_init_znodes;
+
+ /* initialize all plugins */
+ if ((result = init_plugins()) != 0)
+ goto failed_init_plugins;
+
+ /* initialize cache of plugin_set-s and plugin_set's hash table */
+ if ((result = init_plugin_set()) != 0)
+ goto failed_init_plugin_set;
+
+ /* initialize caches of txn_atom-s and txn_handle-s */
+ if ((result = init_txnmgr_static()) != 0)
+ goto failed_init_txnmgr_static;
+
+ /* initialize cache of jnodes */
+ if ((result = init_jnodes()) != 0)
+ goto failed_init_jnodes;
+
+ /* initialize cache of flush queues */
+ if ((result = reiser4_init_fqs()) != 0)
+ goto failed_init_fqs;
+
+ /* initialize cache of structures attached to dentry->d_fsdata */
+ if ((result = reiser4_init_dentry_fsdata()) != 0)
+ goto failed_init_dentry_fsdata;
+
+ /* initialize cache of structures attached to file->private_data */
+ if ((result = reiser4_init_file_fsdata()) != 0)
+ goto failed_init_file_fsdata;
+ /*
+ * initialize cache of d_cursors. See plugin/file_ops_readdir.c for
+ * more details
+ */
+ if ((result = reiser4_init_d_cursor()) != 0)
+ goto failed_init_d_cursor;
+
+ /* initialize cache of blocknr set entries */
+ if ((result = blocknr_set_init_static()) != 0)
+ goto failed_init_blocknr_set;
+
+ /* initialize cache of blocknr list entries */
+ if ((result = blocknr_list_init_static()) != 0)
+ goto failed_init_blocknr_list;
+
+ /* initialize cache of ctx_brick_info */
+ if ((result = ctx_brick_info_init_static()) != 0)
+ goto failed_init_ctx_brick_info;
+
+ /* initialize cache of ctx_stack_info */
+ if ((result = flush_init_static()) != 0)
+ goto failed_init_flush;
+
+ /* initialize interface */
+ if ((result = reiser4_interface_init()) != 0)
+ goto failed_init_interface;
+
+ if ((result = register_filesystem(&reiser4_fs_type)) == 0) {
+ reiser4_debugfs_root = debugfs_create_dir("reiser4", NULL);
+ return 0;
+ }
+ reiser4_interface_exit();
+ failed_init_interface:
+ done_flush_static();
+ failed_init_flush:
+ ctx_brick_info_done_static();
+ failed_init_ctx_brick_info:
+ blocknr_list_done_static();
+ failed_init_blocknr_list:
+ blocknr_set_done_static();
+ failed_init_blocknr_set:
+ reiser4_done_d_cursor();
+ failed_init_d_cursor:
+ reiser4_done_file_fsdata();
+ failed_init_file_fsdata:
+ reiser4_done_dentry_fsdata();
+ failed_init_dentry_fsdata:
+ reiser4_done_fqs();
+ failed_init_fqs:
+ done_jnodes();
+ failed_init_jnodes:
+ done_txnmgr_static();
+ failed_init_txnmgr_static:
+ done_plugin_set();
+ failed_init_plugin_set:
+ failed_init_plugins:
+ done_znodes();
+ failed_init_znodes:
+ done_inodes();
+ failed_inode_cache:
+ return result;
+}
+
+/**
+ * done_reiser4 - reiser4 exit entry point
+ *
+ * Unregisters reiser4 filesystem type and deletes caches. It is called on shutdown
+ * or at module unload.
+ */
+static void __exit done_reiser4(void)
+{
+ int result;
+
+ debugfs_remove(reiser4_debugfs_root);
+ result = unregister_filesystem(&reiser4_fs_type);
+ BUG_ON(result != 0);
+ reiser4_interface_exit();
+ ctx_brick_info_done_static();
+ blocknr_list_done_static();
+ blocknr_set_done_static();
+ reiser4_done_d_cursor();
+ reiser4_done_file_fsdata();
+ reiser4_done_dentry_fsdata();
+ reiser4_done_fqs();
+ done_jnodes();
+ done_txnmgr_static();
+ done_plugin_set();
+ done_znodes();
+ destroy_reiser4_cache(&inode_cache);
+ reiser4_unregister_volumes();
+}
+
+module_init(init_reiser4);
+module_exit(done_reiser4);
+
+MODULE_ALIAS_FS("reiser4");
+
+MODULE_DESCRIPTION("Reiser4 filesystem");
+MODULE_AUTHOR("Hans Reiser <Reiser@Namesys.COM>");
+
+MODULE_LICENSE("GPL");
+
+/*
+ * Local variables:
+ * c-indentation-style: "K&R"
+ * mode-name: "LC"
+ * c-basic-offset: 8
+ * tab-width: 8
+ * fill-column: 79
+ * End:
+ */
diff -urN --no-dereference linux-5.10.2.orig/fs/reiser4/tap.c linux-5.10.2/fs/reiser4/tap.c
--- linux-5.10.2.orig/fs/reiser4/tap.c 1970-01-01 01:00:00.000000000 +0100
+++ linux-5.10.2/fs/reiser4/tap.c 2020-12-23 16:07:46.135813378 +0100
@@ -0,0 +1,376 @@
+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
+ * reiser4/README */
+
+/*
+ Tree Access Pointer (tap).
+
+   A tap is a data structure combining a coord and a lock handle (mostly).
+   It is useful when one has to scan tree nodes (for example, in readdir or
+   flush), because tap functions allow moving a tap in either direction,
+   transparently crossing unit/item/node borders.
+
+   A tap doesn't provide automatic synchronization of its fields, as it is
+   supposed to be a per-thread object.
+*/
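+
+/*
+ * Typical tap lifecycle, as a rough sketch (commentary only; error handling
+ * is omitted, and @coord / @lh are assumed to be set up by a prior tree
+ * lookup):
+ *
+ *	tap_t tap;
+ *
+ *	reiser4_tap_init(&tap, coord, lh, ZNODE_READ_LOCK);
+ *	if (reiser4_tap_load(&tap) == 0) {
+ *		// walk rightward unit by unit, crossing item/node borders
+ *		while (go_next_unit(&tap) == 0) {
+ *			// ... inspect tap.coord ...
+ *		}
+ *		reiser4_tap_relse(&tap);
+ *	}
+ *	reiser4_tap_done(&tap);
+ */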
+
+#include "forward.h"
+#include "debug.h"
+#include "coord.h"
+#include "tree.h"
+#include "context.h"
+#include "tap.h"
+#include "znode.h"
+#include "tree_walk.h"
+
+#if REISER4_DEBUG
+static int tap_invariant(const tap_t *tap);
+static void tap_check(const tap_t *tap);
+#else
+#define tap_check(tap) noop
+#endif
+
+/** load node tap is pointing to, if not loaded already */
+int reiser4_tap_load(tap_t *tap)
+{
+ tap_check(tap);
+ if (tap->loaded == 0) {
+ int result;
+
+ result = zload_ra(tap->coord->node, &tap->ra_info);
+ if (result != 0)
+ return result;
+ coord_clear_iplug(tap->coord);
+ }
+ ++tap->loaded;
+ tap_check(tap);
+ return 0;
+}
+
+/** release node tap is pointing to. Dual to tap_load() */
+void reiser4_tap_relse(tap_t *tap)
+{
+ tap_check(tap);
+ if (tap->loaded > 0) {
+ --tap->loaded;
+ if (tap->loaded == 0)
+ zrelse(tap->coord->node);
+ }
+ tap_check(tap);
+}
+
+/**
+ * init tap to consist of @coord and @lh. Locks on nodes will be acquired with
+ * @mode
+ */
+void reiser4_tap_init(tap_t *tap, coord_t *coord, lock_handle * lh,
+ znode_lock_mode mode)
+{
+ tap->coord = coord;
+ tap->lh = lh;
+ tap->mode = mode;
+ tap->loaded = 0;
+ INIT_LIST_HEAD(&tap->linkage);
+ reiser4_init_ra_info(&tap->ra_info);
+}
+
+/** add @tap to the per-thread list of all taps */
+void reiser4_tap_monitor(tap_t *tap)
+{
+ assert("nikita-2623", tap != NULL);
+ tap_check(tap);
+ list_add(&tap->linkage, reiser4_taps_list());
+ tap_check(tap);
+}
+
+/* duplicate @src into @dst. Copy lock handle. @dst is not initially
+ * loaded. */
+void reiser4_tap_copy(tap_t *dst, tap_t *src)
+{
+ assert("nikita-3193", src != NULL);
+ assert("nikita-3194", dst != NULL);
+
+ *dst->coord = *src->coord;
+ if (src->lh->node)
+ copy_lh(dst->lh, src->lh);
+ dst->mode = src->mode;
+ dst->loaded = 0;
+ INIT_LIST_HEAD(&dst->linkage);
+ dst->ra_info = src->ra_info;
+}
+
+/** finish with @tap */
+void reiser4_tap_done(tap_t *tap)
+{
+ assert("nikita-2565", tap != NULL);
+ tap_check(tap);
+ if (tap->loaded > 0)
+ zrelse(tap->coord->node);
+ done_lh(tap->lh);
+ tap->loaded = 0;
+ list_del_init(&tap->linkage);
+ tap->coord->node = NULL;
+}
+
+/**
+ * move @tap to the new node, locked with @target. Load @target, if @tap was
+ * already loaded.
+ */
+int reiser4_tap_move(tap_t *tap, lock_handle * target)
+{
+ int result = 0;
+
+ assert("nikita-2567", tap != NULL);
+ assert("nikita-2568", target != NULL);
+ assert("nikita-2570", target->node != NULL);
+ assert("nikita-2569", tap->coord->node == tap->lh->node);
+
+ tap_check(tap);
+ if (tap->loaded > 0)
+ result = zload_ra(target->node, &tap->ra_info);
+
+ if (result == 0) {
+ if (tap->loaded > 0)
+ zrelse(tap->coord->node);
+ done_lh(tap->lh);
+ copy_lh(tap->lh, target);
+ tap->coord->node = target->node;
+ coord_clear_iplug(tap->coord);
+ }
+ tap_check(tap);
+ return result;
+}
+
+/**
+ * move @tap to @target. Acquire lock on @target, if @tap was already
+ * loaded.
+ */
+static int tap_to(tap_t *tap, znode * target)
+{
+ int result;
+
+ assert("nikita-2624", tap != NULL);
+ assert("nikita-2625", target != NULL);
+
+ tap_check(tap);
+ result = 0;
+ if (tap->coord->node != target) {
+ lock_handle here;
+
+ init_lh(&here);
+ result = longterm_lock_znode(&here, target,
+ tap->mode, ZNODE_LOCK_HIPRI);
+ if (result == 0) {
+ result = reiser4_tap_move(tap, &here);
+ done_lh(&here);
+ }
+ }
+ tap_check(tap);
+ return result;
+}
+
+/**
+ * move @tap to given @target, loading and locking @target->node if
+ * necessary
+ */
+int tap_to_coord(tap_t *tap, coord_t *target)
+{
+ int result;
+
+ tap_check(tap);
+ result = tap_to(tap, target->node);
+ if (result == 0)
+ coord_dup(tap->coord, target);
+ tap_check(tap);
+ return result;
+}
+
+/** return list of all taps */
+struct list_head *reiser4_taps_list(void)
+{
+ return &get_current_context()->taps;
+}
+
+/** helper function for go_{next,prev}_{item,unit,node}() */
+int go_dir_el(tap_t *tap, sideof dir, int units_p)
+{
+ coord_t dup;
+ coord_t *coord;
+ int result;
+
+ int (*coord_dir) (coord_t *);
+ int (*get_dir_neighbor) (lock_handle *, znode *, int, int);
+ void (*coord_init) (coord_t *, const znode *);
+ ON_DEBUG(int (*coord_check) (const coord_t *));
+
+ assert("nikita-2556", tap != NULL);
+ assert("nikita-2557", tap->coord != NULL);
+ assert("nikita-2558", tap->lh != NULL);
+ assert("nikita-2559", tap->coord->node != NULL);
+
+ tap_check(tap);
+ if (dir == LEFT_SIDE) {
+ coord_dir = units_p ? coord_prev_unit : coord_prev_item;
+ get_dir_neighbor = reiser4_get_left_neighbor;
+ coord_init = coord_init_last_unit;
+ } else {
+ coord_dir = units_p ? coord_next_unit : coord_next_item;
+ get_dir_neighbor = reiser4_get_right_neighbor;
+ coord_init = coord_init_first_unit;
+ }
+ ON_DEBUG(coord_check =
+ units_p ? coord_is_existing_unit : coord_is_existing_item);
+ assert("nikita-2560", coord_check(tap->coord));
+
+ coord = tap->coord;
+ coord_dup(&dup, coord);
+ if (coord_dir(&dup) != 0) {
+ do {
+			/* move to the neighboring node in direction @dir */
+ lock_handle dup;
+
+ init_lh(&dup);
+ result =
+ get_dir_neighbor(&dup, coord->node, (int)tap->mode,
+ GN_CAN_USE_UPPER_LEVELS);
+ if (result == 0) {
+ result = reiser4_tap_move(tap, &dup);
+ if (result == 0)
+ coord_init(tap->coord, dup.node);
+ done_lh(&dup);
+ }
+ /* skip empty nodes */
+ } while ((result == 0) && node_is_empty(coord->node));
+ } else {
+ result = 0;
+ coord_dup(coord, &dup);
+ }
+ assert("nikita-2564", ergo(!result, coord_check(tap->coord)));
+ tap_check(tap);
+ return result;
+}
+
+/**
+ * move @tap to the next unit, transparently crossing item and node
+ * boundaries
+ */
+int go_next_unit(tap_t *tap)
+{
+ return go_dir_el(tap, RIGHT_SIDE, 1);
+}
+
+/**
+ * move @tap to the previous unit, transparently crossing item and node
+ * boundaries
+ */
+int go_prev_unit(tap_t *tap)
+{
+ return go_dir_el(tap, LEFT_SIDE, 1);
+}
+
+/**
+ * @shift times apply @actor to the @tap. This is used to move @tap by
+ * @shift units (or items, or nodes) in either direction.
+ */
+static int rewind_to(tap_t *tap, go_actor_t actor, int shift)
+{
+ int result;
+
+ assert("nikita-2555", shift >= 0);
+ assert("nikita-2562", tap->coord->node == tap->lh->node);
+
+ tap_check(tap);
+ result = reiser4_tap_load(tap);
+ if (result != 0)
+ return result;
+
+ for (; shift > 0; --shift) {
+ result = actor(tap);
+ assert("nikita-2563", tap->coord->node == tap->lh->node);
+ if (result != 0)
+ break;
+ }
+ reiser4_tap_relse(tap);
+ tap_check(tap);
+ return result;
+}
+
+/** move @tap @shift units rightward */
+int rewind_right(tap_t *tap, int shift)
+{
+ return rewind_to(tap, go_next_unit, shift);
+}
+
+/** move @tap @shift units leftward */
+int rewind_left(tap_t *tap, int shift)
+{
+ return rewind_to(tap, go_prev_unit, shift);
+}
+
+#if REISER4_DEBUG
+/** debugging function: print @tap content in human readable form */
+static void print_tap(const char *prefix, const tap_t *tap)
+{
+ if (tap == NULL) {
+ printk("%s: null tap\n", prefix);
+ return;
+ }
+ printk("%s: loaded: %i, in-list: %i, node: %p, mode: %s\n", prefix,
+ tap->loaded, (&tap->linkage == tap->linkage.next &&
+ &tap->linkage == tap->linkage.prev),
+ tap->lh->node,
+ lock_mode_name(tap->mode));
+ print_coord("\tcoord", tap->coord, 0);
+}
+
+/** check [tap-sane] invariant */
+static int tap_invariant(const tap_t *tap)
+{
+ /* [tap-sane] invariant */
+
+ if (tap == NULL)
+ return 1;
+ /* tap->mode is one of
+ *
+ * {ZNODE_NO_LOCK, ZNODE_READ_LOCK, ZNODE_WRITE_LOCK}, and
+ */
+ if (tap->mode != ZNODE_NO_LOCK &&
+ tap->mode != ZNODE_READ_LOCK && tap->mode != ZNODE_WRITE_LOCK)
+ return 2;
+ /* tap->coord != NULL, and */
+ if (tap->coord == NULL)
+ return 3;
+ /* tap->lh != NULL, and */
+ if (tap->lh == NULL)
+ return 4;
+ /* tap->loaded > 0 => znode_is_loaded(tap->coord->node), and */
+ if (!ergo(tap->loaded, znode_is_loaded(tap->coord->node)))
+ return 5;
+ /* tap->coord->node == tap->lh->node if tap->lh->node is not 0 */
+ if (tap->lh->node != NULL && tap->coord->node != tap->lh->node)
+ return 6;
+ return 0;
+}
+
+/** debugging function: check internal @tap consistency */
+static void tap_check(const tap_t *tap)
+{
+ int result;
+
+ result = tap_invariant(tap);
+ if (result != 0) {
+ print_tap("broken", tap);
+ reiser4_panic("nikita-2831", "tap broken: %i\n", result);
+ }
+}
+#endif
+
+/* Make Linus happy.
+ Local variables:
+ c-indentation-style: "K&R"
+ mode-name: "LC"
+ c-basic-offset: 8
+ tab-width: 8
+ fill-column: 120
+ scroll-step: 1
+ End:
+*/
diff -urN --no-dereference linux-5.10.2.orig/fs/reiser4/tap.h linux-5.10.2/fs/reiser4/tap.h
--- linux-5.10.2.orig/fs/reiser4/tap.h 1970-01-01 01:00:00.000000000 +0100
+++ linux-5.10.2/fs/reiser4/tap.h 2020-12-23 16:07:46.135813378 +0100
@@ -0,0 +1,70 @@
+/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
+
+/* Tree Access Pointers. See tap.c for more details. */
+
+#if !defined(__REISER4_TAP_H__)
+#define __REISER4_TAP_H__
+
+#include "forward.h"
+#include "readahead.h"
+
+/**
+ tree_access_pointer aka tap. Data structure combining coord_t and lock
+ handle.
+ Invariants involving this data-type, see doc/lock-ordering for details:
+
+ [tap-sane]
+ */
+struct tree_access_pointer {
+ /* coord tap is at */
+ coord_t *coord;
+ /* lock handle on ->coord->node */
+ lock_handle *lh;
+ /* mode of lock acquired by this tap */
+ znode_lock_mode mode;
+ /* incremented by reiser4_tap_load().
+ Decremented by reiser4_tap_relse(). */
+ int loaded;
+ /* list of taps */
+ struct list_head linkage;
+ /* read-ahead hint */
+ ra_info_t ra_info;
+};
+
+typedef int (*go_actor_t) (tap_t *tap);
+
+extern int reiser4_tap_load(tap_t *tap);
+extern void reiser4_tap_relse(tap_t *tap);
+extern void reiser4_tap_init(tap_t *tap, coord_t *coord, lock_handle * lh,
+ znode_lock_mode mode);
+extern void reiser4_tap_monitor(tap_t *tap);
+extern void reiser4_tap_copy(tap_t *dst, tap_t *src);
+extern void reiser4_tap_done(tap_t *tap);
+extern int reiser4_tap_move(tap_t *tap, lock_handle * target);
+extern int tap_to_coord(tap_t *tap, coord_t *target);
+
+extern int go_dir_el(tap_t *tap, sideof dir, int units_p);
+extern int go_next_unit(tap_t *tap);
+extern int go_prev_unit(tap_t *tap);
+extern int rewind_right(tap_t *tap, int shift);
+extern int rewind_left(tap_t *tap, int shift);
+
+extern struct list_head *reiser4_taps_list(void);
+
+#define for_all_taps(tap) \
+ for (tap = list_entry(reiser4_taps_list()->next, tap_t, linkage); \
+ reiser4_taps_list() != &tap->linkage; \
+ tap = list_entry(tap->linkage.next, tap_t, linkage))
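+
+/*
+ * Illustrative use of for_all_taps (commentary only): walk all taps
+ * monitored by the current thread, e.g. from debugging code.
+ *
+ *	tap_t *tap;
+ *
+ *	for_all_taps(tap) {
+ *		// ... inspect tap->coord, tap->lh, tap->loaded ...
+ *	}
+ */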
+
+/* __REISER4_TAP_H__ */
+#endif
+/* Make Linus happy.
+ Local variables:
+ c-indentation-style: "K&R"
+ mode-name: "LC"
+ c-basic-offset: 8
+ tab-width: 8
+ fill-column: 120
+ scroll-step: 1
+ End:
+*/
diff -urN --no-dereference linux-5.10.2.orig/fs/reiser4/tree.c linux-5.10.2/fs/reiser4/tree.c
--- linux-5.10.2.orig/fs/reiser4/tree.c 1970-01-01 01:00:00.000000000 +0100
+++ linux-5.10.2/fs/reiser4/tree.c 2020-12-23 16:07:46.135813378 +0100
@@ -0,0 +1,1964 @@
+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
+ * reiser4/README */
+
+/*
+ * KEYS IN A TREE.
+ *
+ * The tree consists of nodes located on the disk. Node in the tree is either
+ * formatted or unformatted. Formatted node is one that has structure
+ * understood by the tree balancing and traversal code. Formatted nodes are
+ * further classified into leaf and internal nodes. The latter distinction is
+ * (almost) of only historical importance: general structure of leaves and
+ * internal nodes is the same in Reiser4. Unformatted nodes contain raw data
+ * that are part of bodies of ordinary files and attributes.
+ *
+ * Each node in the tree spans some interval in the key space. Key ranges for
+ * all nodes in the tree are disjoint. Actually, this only holds in some weak
+ * sense, because of the non-unique keys: intersection of key ranges for
+ * different nodes is either empty, or consists of exactly one key.
+ *
+ * Formatted node consists of a sequence of items. Each item spans some
+ * interval in key space. Key ranges for all items in a tree are disjoint,
+ * modulo non-unique keys again. Items within nodes are ordered in the key
+ * order of the smallest key in an item.
+ *
+ * Particular type of item can be further split into units. Unit is piece of
+ * item that can be cut from item and moved into another item of the same
+ * type. Units are used by balancing code to repack data during balancing.
+ *
+ * Unit can be further split into smaller entities (for example, extent unit
+ * represents several pages, and it is natural for extent code to operate on
+ * particular pages and even bytes within one unit), but this is of no
+ * relevance to the generic balancing and lookup code.
+ *
+ * Although an item is said to "span" a range or interval of keys, it is not
+ * necessary that item contains piece of data addressable by each and every
+ * key in this range. For example, compound directory item, consisting of
+ * units corresponding to directory entries and keyed by hashes of file names,
+ * looks more like having a "discrete spectrum": only some disjoint keys inside
+ * range occupied by this item really address data.
+ *
+ * Nonetheless, each item always has a well-defined least (minimal) key that
+ * is recorded in item header, stored in the node this item is in. Also, item
+ * plugin can optionally define method ->max_key_inside() returning maximal
+ * key that can _possibly_ be located within this item. This method is used
+ * (mainly) to determine when given piece of data should be merged into
+ * existing item, instead of creating a new one. Because of this, even though
+ * ->max_key_inside() can be larger than any key actually located in the item,
+ * intervals
+ *
+ * [ reiser4_min_key( item ), ->max_key_inside( item ) ]
+ *
+ * are still disjoint for all items within the _same_ node.
+ *
+ * In memory node is represented by znode. It plays several roles:
+ *
+ * . something locks are taken on
+ *
+ * . something tracked by transaction manager (this is going to change)
+ *
+ * . something used to access node data
+ *
+ * . something used to maintain tree structure in memory: sibling and
+ * parental linkage.
+ *
+ * . something used to organize nodes into "slums"
+ *
+ * For more on znodes see znode.[ch]
+ *
+ * DELIMITING KEYS
+ *
+ * To simplify balancing, allow some flexibility in locking and speed up
+ * important coord cache optimization, we keep delimiting keys of nodes in
+ * memory. Depending on disk format (implemented by appropriate node plugin)
+ * node on disk can record both left and right delimiting key, only one of
+ * them, or none. Still, our balancing and tree traversal code keep both
+ * delimiting keys for an in-memory node stored in its znode. When a node is
+ * first brought into memory during tree traversal, its left delimiting key is
+ * taken from its parent, and its right delimiting key is either the next key
+ * in its parent, or the right delimiting key of the parent if the node is the
+ * rightmost child of the parent.
+ *
+ * Physical consistency of delimiting key is protected by special dk
+ * read-write lock. That is, delimiting keys can only be inspected or
+ * modified under this lock. But dk lock is only sufficient for fast
+ * "pessimistic" check, because to simplify code and to decrease lock
+ * contention, balancing (carry) only updates delimiting keys right before
+ * unlocking all locked nodes on the given tree level. For example,
+ * coord-by-key cache scans LRU list of recently accessed znodes. For each
+ * node it first does a fast check under the dk lock. If the key looked for is
+ * not between delimiting keys for this node, next node is inspected and so
+ * on. If key is inside of the key range, long term lock is taken on node
+ * and key range is rechecked.
+ *
+ * COORDINATES
+ *
+ * To find something in the tree, you supply a key, and the key is resolved
+ * by coord_by_key() into a coord (coordinate) that is valid as long as the
+ * node the coord points to remains locked. As mentioned above trees
+ * consist of nodes that consist of items that consist of units. A unit is
+ * the smallest and indivisible piece of tree as far as balancing and tree
+ * search are concerned. Each node, item, and unit can be addressed by
+ * giving its level in the tree and the key occupied by this entity. A node
+ * knows what the key ranges are of the items within it, and how to find its
+ * items and invoke their item handlers, but it does not know how to access
+ * individual units within its items except through the item handlers.
+ * coord is a structure containing a pointer to the node, the ordinal number
+ * of the item within this node (a sort of item offset), and the ordinal
+ * number of the unit within this item.
+ *
+ * TREE LOOKUP
+ *
+ * There are two types of access to the tree: lookup and modification.
+ *
+ * Lookup is a search for the key in the tree. Search can look for either
+ * exactly the key given to it, or for the largest key that is not greater
+ * than the key given to it. This distinction is determined by "bias"
+ * parameter of search routine (coord_by_key()). coord_by_key() either
+ * returns error (key is not in the tree, or some kind of external error
+ * occurred), or successfully resolves key into coord.
+ *
+ * This resolution is done by traversing tree top-to-bottom from root level
+ * to the desired level. On levels above twig level (level one above the
+ * leaf level) nodes consist exclusively of internal items. Internal item is
+ * nothing more than pointer to the tree node on the child level. On twig
+ * level nodes consist of internal items intermixed with extent
+ * items. Internal items form normal search tree structure used by traversal
+ * to descend through the tree.
+ *
+ * TREE LOOKUP OPTIMIZATIONS
+ *
+ * Tree lookup described above is expensive even if all nodes traversed are
+ * already in memory: a binary search has to be performed within each node,
+ * and binary searches are CPU-consuming and tend to destroy CPU
+ * caches.
+ *
+ * Several optimizations are used to work around this:
+ *
+ * . cbk_cache (look-aside cache for tree traversals, see search.c for
+ * details)
+ *
+ * . seals (see seal.[ch])
+ *
+ * . vroot (see search.c)
+ *
+ * General search-by-key is layered thusly:
+ *
+ * [check seal, if any] --ok--> done
+ * |
+ * failed
+ * |
+ * V
+ * [vroot defined] --no--> node = tree_root
+ * | |
+ * yes |
+ * | |
+ * V |
+ * node = vroot |
+ * | |
+ * | |
+ * | |
+ * V V
+ * [check cbk_cache for key] --ok--> done
+ * |
+ * failed
+ * |
+ * V
+ * [start tree traversal from node]
+ *
+ */
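+
+/*
+ * The following is an illustrative sketch only (it is not part of the
+ * reiser4 sources): the layering of search-by-key pictured above, written
+ * out as C-like pseudocode. All example_*() helpers are made-up names
+ * standing in for the real seal, vroot, cbk_cache and traversal code; the
+ * block is kept under "#if 0" so that it is never compiled.
+ */
+#if 0
+static int example_search_by_key(reiser4_tree *tree, const reiser4_key *key,
+				 coord_t *coord)
+{
+	znode *node;
+
+	/* 1. a valid seal resolves the key without touching the tree */
+	if (example_check_seal(key, coord) == 0)
+		return 0;
+
+	/* 2. start from the vroot if one is defined, else from the root */
+	node = example_get_vroot(tree, key);
+	if (node == NULL)
+		node = example_get_root(tree);
+
+	/* 3. the cbk_cache may already know the node covering @key */
+	if (example_cbk_cache_lookup(tree, key, coord) == 0)
+		return 0;
+
+	/* 4. fall back to a full top-to-bottom traversal starting at @node */
+	return example_tree_traverse(node, key, coord);
+}
+#endif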
+
+#include "forward.h"
+#include "debug.h"
+#include "dformat.h"
+#include "key.h"
+#include "coord.h"
+#include "plugin/item/static_stat.h"
+#include "plugin/item/item.h"
+#include "plugin/node/node.h"
+#include "plugin/plugin.h"
+#include "txnmgr.h"
+#include "jnode.h"
+#include "znode.h"
+#include "block_alloc.h"
+#include "tree_walk.h"
+#include "carry.h"
+#include "carry_ops.h"
+#include "tap.h"
+#include "tree.h"
+#include "vfs_ops.h"
+#include "page_cache.h"
+#include "super.h"
+#include "reiser4.h"
+#include "inode.h"
+
+#include <linux/fs.h> /* for struct super_block */
+#include <linux/spinlock.h>
+
+/* Disk address (block number) that is never used for any real tree node. It
+   is used as the block number of the "uber" znode.
+
+ Invalid block addresses are 0 by tradition.
+
+*/
+const reiser4_block_nr UBER_TREE_ADDR = 0ull;
+
+#define CUT_TREE_MIN_ITERATIONS 64
+
+static int find_child_by_addr(znode * parent, znode * child, coord_t *result);
+
+/* return node plugin of coord->node */
+node_plugin *node_plugin_by_coord(const coord_t *coord)
+{
+ assert("vs-1", coord != NULL);
+ assert("vs-2", coord->node != NULL);
+
+ return coord->node->nplug;
+}
+
+/* insert item into tree. Fields of @coord are updated so that they can be
+ * used by a subsequent insert operation. */
+insert_result insert_by_key(reiser4_tree * tree /* tree to insert new item
+ * into */ ,
+ const reiser4_key * key /* key of new item */ ,
+ reiser4_item_data * data /* parameters for item
+ * creation */ ,
+ coord_t *coord /* resulting insertion coord */ ,
+ lock_handle * lh /* resulting lock
+ * handle */ ,
+ tree_level stop_level /* level where to insert */ ,
+ __u32 flags/* insertion flags */)
+{
+ int result;
+
+ assert("nikita-358", tree != NULL);
+ assert("nikita-360", coord != NULL);
+
+ result = coord_by_key(tree, key, coord, lh, ZNODE_WRITE_LOCK,
+ FIND_EXACT, stop_level, stop_level,
+ flags | CBK_FOR_INSERT, NULL/*ra_info */);
+ switch (result) {
+ default:
+ break;
+ case CBK_COORD_FOUND:
+ result = IBK_ALREADY_EXISTS;
+ break;
+ case CBK_COORD_NOTFOUND:
+ assert("nikita-2017", coord->node != NULL);
+ result = insert_by_coord(coord, data, key, lh, 0/*flags */);
+ break;
+ }
+ return result;
+}
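+
+/*
+ * Illustrative sketch only (not part of the reiser4 sources): a minimal,
+ * hypothetical caller of insert_by_key(). The name example_insert_item is
+ * made up, and a real caller also has to build @key and @data appropriately;
+ * the block is kept under "#if 0" so that it is never compiled.
+ */
+#if 0
+static int example_insert_item(reiser4_tree *tree, const reiser4_key *key,
+			       reiser4_item_data *data)
+{
+	coord_t coord;
+	lock_handle lh;
+	int result;
+
+	init_lh(&lh);
+	/* resolve @key and insert the new item at the leaf level */
+	result = insert_by_key(tree, key, data, &coord, &lh, LEAF_LEVEL, 0);
+	/* on success @coord and @lh describe the insertion point */
+	done_lh(&lh);
+	return result;
+}
+#endif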
+
+/* insert item by calling carry. Helper function called if short-cut
+ insertion failed */
+static insert_result insert_with_carry_by_coord(coord_t *coord,
+ /* coord where to insert */
+ lock_handle * lh,
+ /* lock handle of insertion node */
+ reiser4_item_data * data,
+ /* parameters of new item */
+ const reiser4_key * key,
+ /* key of new item */
+ carry_opcode cop,
+ /* carry operation to perform */
+ cop_insert_flag flags
+ /* carry flags */ )
+{
+ int result;
+ carry_pool *pool;
+ carry_level *lowest_level;
+ carry_insert_data *cdata;
+ carry_op *op;
+
+ assert("umka-314", coord != NULL);
+
+ /* allocate carry_pool and 3 carry_level-s */
+ pool =
+ init_carry_pool(sizeof(*pool) + 3 * sizeof(*lowest_level) +
+ sizeof(*cdata));
+ if (IS_ERR(pool))
+ return PTR_ERR(pool);
+ lowest_level = (carry_level *) (pool + 1);
+ init_carry_level(lowest_level, pool);
+
+ op = reiser4_post_carry(lowest_level, cop, coord->node, 0);
+ if (IS_ERR(op) || (op == NULL)) {
+ done_carry_pool(pool);
+ return RETERR(op ? PTR_ERR(op) : -EIO);
+ }
+ cdata = (carry_insert_data *) (lowest_level + 3);
+ cdata->coord = coord;
+ cdata->data = data;
+ cdata->key = key;
+ op->u.insert.d = cdata;
+ if (flags == 0)
+ flags = znode_get_tree(coord->node)->carry.insert_flags;
+ op->u.insert.flags = flags;
+ op->u.insert.type = COPT_ITEM_DATA;
+ op->u.insert.child = NULL;
+ if (lh != NULL) {
+ assert("nikita-3245", lh->node == coord->node);
+ lowest_level->track_type = CARRY_TRACK_CHANGE;
+ lowest_level->tracked = lh;
+ }
+
+ result = reiser4_carry(lowest_level, NULL);
+ done_carry_pool(pool);
+
+ return result;
+}
+
+/* form carry queue to perform paste of @data with @key at @coord, and launch
+ its execution by calling carry().
+
+   Instruct carry to update @lh if, after balancing, the insertion coord moves
+   into a different block.
+
+*/
+static int paste_with_carry(coord_t *coord, /* coord of paste */
+ lock_handle * lh, /* lock handle of node
+ * where item is
+ * pasted */
+ reiser4_item_data * data, /* parameters of new
+ * item */
+ const reiser4_key * key, /* key of new item */
+ unsigned flags/* paste flags */)
+{
+ int result;
+ carry_pool *pool;
+ carry_level *lowest_level;
+ carry_insert_data *cdata;
+ carry_op *op;
+
+ assert("umka-315", coord != NULL);
+ assert("umka-316", key != NULL);
+
+ pool =
+ init_carry_pool(sizeof(*pool) + 3 * sizeof(*lowest_level) +
+ sizeof(*cdata));
+ if (IS_ERR(pool))
+ return PTR_ERR(pool);
+ lowest_level = (carry_level *) (pool + 1);
+ init_carry_level(lowest_level, pool);
+
+ op = reiser4_post_carry(lowest_level, COP_PASTE, coord->node, 0);
+ if (IS_ERR(op) || (op == NULL)) {
+ done_carry_pool(pool);
+ return RETERR(op ? PTR_ERR(op) : -EIO);
+ }
+ cdata = (carry_insert_data *) (lowest_level + 3);
+ cdata->coord = coord;
+ cdata->data = data;
+ cdata->key = key;
+ op->u.paste.d = cdata;
+ if (flags == 0)
+ flags = znode_get_tree(coord->node)->carry.paste_flags;
+ op->u.paste.flags = flags;
+ op->u.paste.type = COPT_ITEM_DATA;
+ if (lh != NULL) {
+ lowest_level->track_type = CARRY_TRACK_CHANGE;
+ lowest_level->tracked = lh;
+ }
+
+ result = reiser4_carry(lowest_level, NULL);
+ done_carry_pool(pool);
+
+ return result;
+}
+
+/* insert item at the given coord.
+
+ First try to skip carry by directly calling ->create_item() method of node
+ plugin. If this is impossible (there is not enough free space in the node,
+   or the item is created at the leftmost position in the node), call
+   insert_with_carry_by_coord() that will do a full carry().
+
+*/
+insert_result insert_by_coord(coord_t *coord /* coord where to
+ * insert. coord->node has
+ * to be write locked by
+ * caller */ ,
+ reiser4_item_data * data /* data to be
+ * inserted */ ,
+ const reiser4_key * key /* key of new item */ ,
+ lock_handle * lh /* lock handle of write
+ * lock on node */ ,
+ __u32 flags/* insertion flags */)
+{
+ unsigned item_size;
+ int result;
+ znode *node;
+
+ assert("vs-247", coord != NULL);
+ assert("vs-248", data != NULL);
+ assert("vs-249", data->length >= 0);
+ assert("nikita-1191", znode_is_write_locked(coord->node));
+
+ node = coord->node;
+ coord_clear_iplug(coord);
+ result = zload(node);
+ if (result != 0)
+ return result;
+
+ item_size = space_needed(node, NULL, data, 1);
+ if (item_size > znode_free_space(node) &&
+ (flags & COPI_DONT_SHIFT_LEFT) && (flags & COPI_DONT_SHIFT_RIGHT)
+ && (flags & COPI_DONT_ALLOCATE)) {
+ /* we are forced to use free space of coord->node and new item
+ does not fit into it.
+
+ Currently we get here only when we allocate and copy units
+ of extent item from a node to its left neighbor during
+ "squalloc"-ing. If @node (this is left neighbor) does not
+ have enough free space - we do not want to attempt any
+ shifting and allocations because we are in squeezing and
+ everything to the left of @node is tightly packed.
+ */
+ result = -E_NODE_FULL;
+ } else if ((item_size <= znode_free_space(node)) &&
+ !coord_is_before_leftmost(coord) &&
+ (node_plugin_by_node(node)->fast_insert != NULL)
+ && node_plugin_by_node(node)->fast_insert(coord)) {
+ /* shortcut insertion without carry() overhead.
+
+ Only possible if:
+
+ - there is enough free space
+
+ - insertion is not into the leftmost position in a node
+ (otherwise it would require updating of delimiting key in a
+ parent)
+
+ - node plugin agrees with this
+
+ */
+ result =
+ node_plugin_by_node(node)->create_item(coord, key, data,
+ NULL);
+ znode_make_dirty(node);
+ } else {
+ /* otherwise do full-fledged carry(). */
+ result =
+ insert_with_carry_by_coord(coord, lh, data, key, COP_INSERT,
+ flags);
+ }
+ zrelse(node);
+ return result;
+}
+
+/* @coord is set to leaf level and @data is to be inserted to twig level */
+insert_result
+insert_extent_by_coord(coord_t *coord, /* coord where to insert.
+ * coord->node has to be write
+ * locked by caller */
+ reiser4_item_data *data,/* data to be inserted */
+ const reiser4_key *key, /* key of new item */
+ lock_handle *lh /* lock handle of write lock
+ on node */)
+{
+ assert("vs-405", coord != NULL);
+ assert("vs-406", data != NULL);
+ assert("vs-407", data->length > 0);
+ assert("vs-408", znode_is_write_locked(coord->node));
+ assert("vs-409", znode_get_level(coord->node) == LEAF_LEVEL);
+
+ return insert_with_carry_by_coord(coord, lh, data, key, COP_EXTENT,
+ 0 /*flags */ );
+}
+
+/* Insert into the item at the given coord.
+
+ First try to skip carry by directly calling ->paste() method of item
+ plugin. If this is impossible (there is not enough free space in the node,
+ or we are pasting into leftmost position in the node), call
+ paste_with_carry() that will do full carry().
+
+*/
+/* paste_into_item */
+int insert_into_item(coord_t * coord /* coord of pasting */ ,
+ lock_handle * lh /* lock handle on node involved */ ,
+ const reiser4_key * key /* key of unit being pasted */ ,
+ reiser4_item_data * data /* parameters for new unit */ ,
+ unsigned flags /* insert/paste flags */ )
+{
+ int result;
+ int size_change;
+ node_plugin *nplug;
+ item_plugin *iplug;
+
+ assert("umka-317", coord != NULL);
+ assert("umka-318", key != NULL);
+
+ iplug = item_plugin_by_coord(coord);
+ nplug = node_plugin_by_coord(coord);
+
+ assert("nikita-1480", iplug == data->iplug);
+
+ size_change = space_needed(coord->node, coord, data, 0);
+ if (size_change > (int)znode_free_space(coord->node) &&
+ (flags & COPI_DONT_SHIFT_LEFT) && (flags & COPI_DONT_SHIFT_RIGHT)
+ && (flags & COPI_DONT_ALLOCATE)) {
+ /* we are forced to use free space of coord->node and new data
+ does not fit into it. */
+ return -E_NODE_FULL;
+ }
+
+ /* shortcut paste without carry() overhead.
+
+ Only possible if:
+
+ - there is enough free space
+
+ - paste is not into the leftmost unit in a node (otherwise
+ it would require updating of delimiting key in a parent)
+
+ - node plugin agrees with this
+
+ - item plugin agrees with us
+ */
+ if (size_change <= (int)znode_free_space(coord->node) &&
+ (coord->item_pos != 0 ||
+ coord->unit_pos != 0 || coord->between == AFTER_UNIT) &&
+ coord->unit_pos != 0 && nplug->fast_paste != NULL &&
+ nplug->fast_paste(coord) &&
+ iplug->b.fast_paste != NULL && iplug->b.fast_paste(coord)) {
+ if (size_change > 0)
+ nplug->change_item_size(coord, size_change);
+ /* NOTE-NIKITA: huh? where @key is used? */
+ result = iplug->b.paste(coord, data, NULL);
+ if (size_change < 0)
+ nplug->change_item_size(coord, size_change);
+ znode_make_dirty(coord->node);
+ } else
+ /* otherwise do full-fledged carry(). */
+ result = paste_with_carry(coord, lh, data, key, flags);
+ return result;
+}
+
+/* this either appends or truncates item @coord */
+int reiser4_resize_item(coord_t * coord /* coord of item being resized */ ,
+ reiser4_item_data * data /* parameters of resize */ ,
+ reiser4_key * key /* key of new unit */ ,
+ lock_handle * lh /* lock handle of node
+ * being modified */ ,
+ cop_insert_flag flags /* carry flags */ )
+{
+ int result;
+ znode *node;
+
+ assert("nikita-362", coord != NULL);
+ assert("nikita-363", data != NULL);
+ assert("vs-245", data->length != 0);
+
+ node = coord->node;
+ coord_clear_iplug(coord);
+ result = zload(node);
+ if (result != 0)
+ return result;
+
+ if (data->length < 0)
+ result = node_plugin_by_coord(coord)->shrink_item(coord,
+ -data->length);
+ else
+ result = insert_into_item(coord, lh, key, data, flags);
+
+ zrelse(node);
+ return result;
+}
+
+/* insert flow @f */
+int reiser4_insert_flow(coord_t * coord, lock_handle * lh, flow_t * f)
+{
+ int result;
+ carry_pool *pool;
+ carry_level *lowest_level;
+ reiser4_item_data *data;
+ carry_op *op;
+
+ pool =
+ init_carry_pool(sizeof(*pool) + 3 * sizeof(*lowest_level) +
+ sizeof(*data));
+ if (IS_ERR(pool))
+ return PTR_ERR(pool);
+ lowest_level = (carry_level *) (pool + 1);
+ init_carry_level(lowest_level, pool);
+
+ op = reiser4_post_carry(lowest_level, COP_INSERT_FLOW, coord->node,
+ 0 /* operate directly on coord -> node */ );
+ if (IS_ERR(op) || (op == NULL)) {
+ done_carry_pool(pool);
+ return RETERR(op ? PTR_ERR(op) : -EIO);
+ }
+
+ /* these are permanent during insert_flow */
+ data = (reiser4_item_data *) (lowest_level + 3);
+ data->user = 1;
+ data->iplug = item_plugin_by_id(FORMATTING_ID);
+ data->arg = NULL;
+ /* data.length and data.data will be set before calling paste or
+ insert */
+ data->length = 0;
+ data->data = NULL;
+
+ op->u.insert_flow.flags = 0;
+ op->u.insert_flow.insert_point = coord;
+ op->u.insert_flow.flow = f;
+ op->u.insert_flow.data = data;
+ op->u.insert_flow.new_nodes = 0;
+
+ lowest_level->track_type = CARRY_TRACK_CHANGE;
+ lowest_level->tracked = lh;
+
+ result = reiser4_carry(lowest_level, NULL);
+ done_carry_pool(pool);
+
+ return result;
+}
+
+/* Given a coord in parent node, obtain a znode for the corresponding child */
+znode *child_znode(const coord_t * parent_coord /* coord of pointer to
+ * child */ ,
+ znode * parent /* parent of child */ ,
+ int incore_p /* if !0 only return child if already in
+ * memory */ ,
+ int setup_dkeys_p /* if !0 update delimiting keys of
+ * child */ )
+{
+ znode *child;
+
+ assert("nikita-1374", parent_coord != NULL);
+ assert("nikita-1482", parent != NULL);
+#if REISER4_DEBUG
+ if (setup_dkeys_p)
+ assert_rw_not_locked(&(znode_get_tree(parent)->dk_lock));
+#endif
+ assert("nikita-2947", znode_is_any_locked(parent));
+
+ if (znode_get_level(parent) <= LEAF_LEVEL) {
+ /* trying to get child of leaf node */
+ warning("nikita-1217", "Child of maize?");
+ return ERR_PTR(RETERR(-EIO));
+ }
+ if (item_is_internal(parent_coord)) {
+ reiser4_block_nr addr;
+ item_plugin *iplug;
+ struct reiser4_subvol *subv;
+
+ iplug = item_plugin_by_coord(parent_coord);
+ assert("vs-512", iplug->s.internal.down_link);
+ iplug->s.internal.down_link(parent_coord, NULL, &addr);
+
+ subv = znode_get_subvol(parent);
+ if (incore_p)
+ child = zlook(&subv->tree, &addr);
+ else
+ child = zget(subv, &addr, parent,
+ znode_get_level(parent) - 1,
+ reiser4_ctx_gfp_mask_get());
+ if ((child != NULL) && !IS_ERR(child) && setup_dkeys_p)
+ set_child_delimiting_keys(parent, parent_coord, child);
+ } else {
+ warning("nikita-1483", "Internal item expected");
+ child = ERR_PTR(RETERR(-EIO));
+ }
+ return child;
+}
+
+/* remove znode from transaction */
+static void uncapture_znode(znode *node)
+{
+ struct page *page;
+
+ assert("zam-1001", ZF_ISSET(node, JNODE_HEARD_BANSHEE));
+
+ if (!reiser4_blocknr_is_fake(znode_get_block(node))) {
+ int ret;
+ /*
+ * An already allocated block goes right to the atom's
+ * delete set
+ */
+ ret = reiser4_dealloc_block(znode_get_block(node), 0,
+ BA_DEFER | BA_FORMATTED,
+ znode_get_subvol(node));
+ if (ret)
+ warning("zam-942",
+ "can\'t add a block (%llu) number to atom's delete set\n",
+ (unsigned long long)(*znode_get_block(node)));
+
+ spin_lock_znode(node);
+ /*
+ * Here we return flush reserved block which was reserved
+ * at the moment when this allocated node was marked dirty
+ * and still not used by flush in node relocation procedure
+ */
+ if (ZF_ISSET(node, JNODE_FLUSH_RESERVED)) {
+ txn_atom *atom;
+ atom_brick_info *abi;
+ ctx_brick_info *cbi;
+
+ atom = jnode_get_atom(ZJNODE(node));
+ assert("zam-939", atom != NULL);
+
+ spin_unlock_znode(node);
+
+ cbi = find_context_brick_info(get_current_context(),
+ znode_get_subvol(node)->id);
+ assert("edward-2005", cbi != NULL);
+
+ abi = find_atom_brick_info(&atom->bricks_info,
+ znode_get_subvol(node)->id);
+ assert("edward-2006", abi != NULL);
+
+ flush_reserved2grabbed(abi, cbi,
+ (__u64) 1,
+ znode_get_subvol(node));
+ spin_unlock_atom(atom);
+ } else
+ spin_unlock_znode(node);
+ } else {
+ /*
+ * znode has assigned block which is counted as "fake
+ * allocated". Return it back to "free blocks"
+ */
+ fake_allocated2free((__u64) 1,
+ BA_FORMATTED, znode_get_subvol(node));
+ }
+ /*
+ * uncapture page from transaction. There is a possibility of a race
+ * with ->releasepage(): reiser4_releasepage() detaches page from this
+ * jnode and we have nothing to uncapture. To avoid this, get
+ * reference of node->pg under jnode spin lock. reiser4_uncapture_page()
+ * will deal with released page itself.
+ */
+ spin_lock_znode(node);
+ page = znode_page(node);
+ if (likely(page != NULL)) {
+ /*
+ * reiser4_uncapture_page() can only be called when we are sure
+ * that znode is pinned in memory, which we are, because
+ * forget_znode() is only called from longterm_unlock_znode().
+ */
+ get_page(page);
+ spin_unlock_znode(node);
+ lock_page(page);
+ reiser4_uncapture_page(page);
+ unlock_page(page);
+ put_page(page);
+ } else {
+ txn_atom *atom;
+
+ /* handle "flush queued" znodes */
+ while (1) {
+ atom = jnode_get_atom(ZJNODE(node));
+ assert("zam-943", atom != NULL);
+
+ if (!ZF_ISSET(node, JNODE_FLUSH_QUEUED)
+ || !atom->nr_running_queues)
+ break;
+
+ spin_unlock_znode(node);
+ reiser4_atom_wait_event(atom);
+ spin_lock_znode(node);
+ }
+
+ reiser4_uncapture_block(ZJNODE(node));
+ spin_unlock_atom(atom);
+ zput(node);
+ }
+}
+
+/* This is called from longterm_unlock_znode() when last lock is released from
+ the node that has been removed from the tree. At this point node is removed
+ from sibling list and its lock is invalidated. */
+void forget_znode(lock_handle * handle)
+{
+ znode *node;
+ reiser4_tree *tree;
+
+ assert("umka-319", handle != NULL);
+
+ node = handle->node;
+ tree = znode_get_tree(node);
+
+ assert("vs-164", znode_is_write_locked(node));
+ assert("nikita-1280", ZF_ISSET(node, JNODE_HEARD_BANSHEE));
+ assert_rw_locked(&(node->lock.guard));
+
+ /* We assume that this node was detached from its parent before
+	 * unlocking, so there is no way to reach this node from its parent through
+	 * a down link. The node should have no children and, therefore, can't be
+ * reached from them by their parent pointers. The only way to obtain a
+ * reference to the node is to use sibling pointers from its left and
+ * right neighbors. In the next several lines we remove the node from
+ * the sibling list. */
+
+ write_lock_tree();
+ sibling_list_remove(node);
+ znode_remove(node);
+ write_unlock_tree();
+
+ /* Here we set JNODE_DYING and cancel all pending lock requests. It
+ * forces all lock requestor threads to repeat iterations of getting
+ * lock on a child, neighbor or parent node. But, those threads can't
+ * come to this node again, because this node is no longer a child,
+ * neighbor or parent of any other node. This order of znode
+	 * invalidation does not allow other threads to waste cpu time in a busy
+ * loop, trying to lock dying object. The exception is in the flush
+ * code when we take node directly from atom's capture list.*/
+ reiser4_invalidate_lock(handle);
+ uncapture_znode(node);
+}
+
+/* Check that internal item at @pointer really contains pointer to @child. */
+int check_tree_pointer(const coord_t * pointer /* would-be pointer to
+ * @child */ ,
+ const znode * child /* child znode */ )
+{
+ assert("nikita-1016", pointer != NULL);
+ assert("nikita-1017", child != NULL);
+ assert("nikita-1018", pointer->node != NULL);
+
+ assert("nikita-1325", znode_is_any_locked(pointer->node));
+
+ assert("nikita-2985",
+ znode_get_level(pointer->node) == znode_get_level(child) + 1);
+
+ coord_clear_iplug((coord_t *) pointer);
+
+ if (coord_is_existing_unit(pointer)) {
+ item_plugin *iplug;
+ reiser4_block_nr addr;
+
+ if (item_is_internal(pointer)) {
+ iplug = item_plugin_by_coord(pointer);
+ assert("vs-513", iplug->s.internal.down_link);
+ iplug->s.internal.down_link(pointer, NULL, &addr);
+ /* check that cached value is correct */
+ if (disk_addr_eq(&addr, znode_get_block(child))) {
+ return NS_FOUND;
+ }
+ }
+ }
+ /* warning ("jmacd-1002", "tree pointer incorrect"); */
+ return NS_NOT_FOUND;
+}
+
+/* find coord of pointer to new @child in @parent.
+
+   Find the &coord_t in the @parent where the pointer to a given @child will
+   be.
+
+*/
+int find_new_child_ptr(znode * parent /* parent znode, passed locked */ ,
+ znode *
+ child UNUSED_ARG /* child znode, passed locked */ ,
+ znode * left /* left brother of new node */ ,
+ coord_t * result /* where result is stored in */ )
+{
+ int ret;
+
+ assert("nikita-1486", parent != NULL);
+ assert("nikita-1487", child != NULL);
+ assert("nikita-1488", result != NULL);
+
+ ret = find_child_ptr(parent, left, result);
+ if (ret != NS_FOUND) {
+ warning("nikita-1489", "Cannot find brother position: %i", ret);
+ return RETERR(-EIO);
+ } else {
+ result->between = AFTER_UNIT;
+ return RETERR(NS_NOT_FOUND);
+ }
+}
+
+/* find coord of pointer to @child in @parent.
+
+   Find the &coord_t in the @parent where the pointer to a given @child is.
+
+*/
+int find_child_ptr(znode * parent /* parent znode, passed locked */ ,
+ znode * child /* child znode, passed locked */ ,
+ coord_t * result /* where result is stored in */ )
+{
+ int lookup_res;
+ node_plugin *nplug;
+ /* left delimiting key of a child */
+ reiser4_key ld;
+ reiser4_tree *tree;
+
+ assert("nikita-934", parent != NULL);
+ assert("nikita-935", child != NULL);
+ assert("nikita-936", result != NULL);
+ assert("zam-356", znode_is_loaded(parent));
+
+ coord_init_zero(result);
+ result->node = parent;
+
+ nplug = parent->nplug;
+ assert("nikita-939", nplug != NULL);
+
+ tree = znode_get_tree(parent);
+ /* NOTE-NIKITA taking read-lock on tree here assumes that @result is
+ * not aliased to ->in_parent of some znode. Otherwise,
+ * parent_coord_to_coord() below would modify data protected by tree
+ * lock. */
+ read_lock_tree();
+ /* fast path. Try to use cached value. Lock tree to keep
+ node->pos_in_parent and pos->*_blocknr consistent. */
+ if (child->in_parent.item_pos + 1 != 0) {
+ parent_coord_to_coord(&child->in_parent, result);
+ if (check_tree_pointer(result, child) == NS_FOUND) {
+ read_unlock_tree();
+ return NS_FOUND;
+ }
+
+ child->in_parent.item_pos = (unsigned short)~0;
+ }
+ read_unlock_tree();
+
+	/* if the above failed, find some key from @child. We are looking for the
+ least key in a child. */
+ read_lock_dk(tree);
+ ld = *znode_get_ld_key(child);
+ read_unlock_dk(tree);
+ /*
+ * now, lookup parent with key just found. Note, that left delimiting
+ * key doesn't identify node uniquely, because (in extremely rare
+ * case) two nodes can have equal left delimiting keys, if one of them
+ * is completely filled with directory entries that all happened to be
+ * hash collision. But, we check block number in check_tree_pointer()
+ * and, so, are safe.
+ */
+ lookup_res = nplug->lookup(parent, &ld, FIND_EXACT, result);
+ /* update cached pos_in_node */
+ if (lookup_res == NS_FOUND) {
+ write_lock_tree();
+ coord_to_parent_coord(result, &child->in_parent);
+ write_unlock_tree();
+ lookup_res = check_tree_pointer(result, child);
+ }
+ if (lookup_res == NS_NOT_FOUND)
+ lookup_res = find_child_by_addr(parent, child, result);
+ return lookup_res;
+}
+
+/* find coord of pointer to @child in @parent by scanning
+
+ Find the &coord_t in the @parent where pointer to a given @child
+ is in by scanning all internal items in @parent and comparing block
+ numbers in them with that of @child.
+
+*/
+static int find_child_by_addr(znode * parent /* parent znode, passed locked */ ,
+ znode * child /* child znode, passed locked */ ,
+ coord_t * result /* where result is stored in */ )
+{
+ int ret;
+
+ assert("nikita-1320", parent != NULL);
+ assert("nikita-1321", child != NULL);
+ assert("nikita-1322", result != NULL);
+
+ ret = NS_NOT_FOUND;
+
+ for_all_units(result, parent) {
+ if (check_tree_pointer(result, child) == NS_FOUND) {
+ write_lock_tree();
+ coord_to_parent_coord(result, &child->in_parent);
+ write_unlock_tree();
+ ret = NS_FOUND;
+ break;
+ }
+ }
+ return ret;
+}
+
+/* true, if @addr is "unallocated block number", which is just address, with
+ highest bit set. */
+int is_disk_addr_unallocated(const reiser4_block_nr * addr /* address to
+ * check */ )
+{
+ assert("nikita-1766", addr != NULL);
+
+ return (*addr & REISER4_BLOCKNR_STATUS_BIT_MASK) ==
+ REISER4_UNALLOCATED_STATUS_VALUE;
+}
+
+/* helper function for prepare_twig_kill(): @left and @right are formatted
+ * neighbors of extent item being completely removed. Load and lock neighbors
+ * and store lock handles into @kdata for later use by kill_hook_extent() */
+static int
+prepare_children(znode * left, znode * right, carry_kill_data * kdata)
+{
+ int result;
+ int left_loaded;
+ int right_loaded;
+
+ result = 0;
+ left_loaded = right_loaded = 0;
+
+ if (left != NULL) {
+ result = zload(left);
+ if (result == 0) {
+ left_loaded = 1;
+ result = longterm_lock_znode(kdata->left, left,
+ ZNODE_READ_LOCK,
+ ZNODE_LOCK_LOPRI);
+ }
+ }
+ if (result == 0 && right != NULL) {
+ result = zload(right);
+ if (result == 0) {
+ right_loaded = 1;
+ result = longterm_lock_znode(kdata->right, right,
+ ZNODE_READ_LOCK,
+ ZNODE_LOCK_HIPRI |
+ ZNODE_LOCK_NONBLOCK);
+ }
+ }
+ if (result != 0) {
+ done_lh(kdata->left);
+ done_lh(kdata->right);
+ if (left_loaded != 0)
+ zrelse(left);
+ if (right_loaded != 0)
+ zrelse(right);
+ }
+ return result;
+}
+
+static void done_children(carry_kill_data * kdata)
+{
+ if (kdata->left != NULL && kdata->left->node != NULL) {
+ zrelse(kdata->left->node);
+ done_lh(kdata->left);
+ }
+ if (kdata->right != NULL && kdata->right->node != NULL) {
+ zrelse(kdata->right->node);
+ done_lh(kdata->right);
+ }
+}
+
+/**
+ * returns true if removing bytes of given range of key [from_key, to_key]
+ * causes removing of whole item @from
+ */
+static int item_removed_completely(coord_t *from,
+ const reiser4_key *from_key,
+ const reiser4_key *to_key)
+{
+ reiser4_key key_in_item;
+
+ assert("umka-325", from != NULL);
+ assert("edward-2093", item_is_extent(from));
+
+ /* check first unit */
+ item_key_by_coord(from, &key_in_item);
+ if (keygt(from_key, &key_in_item))
+ /* first byte is not removed */
+ return 0;
+
+ /* check last key */
+ max_item_key_by_coord(from, &key_in_item);
+ if (keylt(to_key, &key_in_item))
+ /* last byte is not removed */
+ return 0;
+ return 1;
+}
+
+/* part of cut_node. It is called when cut_node is called to remove or cut part
+ of extent item. When head of that item is removed - we have to update right
+   delimiting key of the left neighbor of the extent. When the item is removed
+   completely - we have to set a sibling link between the left and right
+   neighbors of the removed extent. This may return -E_DEADLOCK because of
+   trying to get the left neighbor locked. So, the caller should repeat the
+   attempt.
+*/
+/* Audited by: umka (2002.06.16) */
+static int
+prepare_twig_kill(carry_kill_data * kdata, znode * locked_left_neighbor)
+{
+ int result;
+ reiser4_key key;
+ lock_handle left_lh;
+ lock_handle right_lh;
+ coord_t left_coord;
+ coord_t *from;
+ znode *left_child;
+ znode *right_child;
+ reiser4_tree *tree;
+ int left_zloaded_here, right_zloaded_here;
+
+ from = kdata->params.from;
+ assert("umka-326", from != NULL);
+ assert("umka-327", kdata->params.to != NULL);
+
+ assert("vs-591", item_is_extent(from));
+ assert("vs-592", ergo(item_id_by_coord(from) == EXTENT40_POINTER_ID,
+ from->item_pos == kdata->params.to->item_pos));
+
+ if ((kdata->params.from_key
+ && keygt(kdata->params.from_key, item_key_by_coord(from, &key)))
+ || from->unit_pos != 0) {
+ /* head of item @from is not removed, there is nothing to
+ worry about */
+ return 0;
+ }
+
+ result = 0;
+ left_zloaded_here = 0;
+ right_zloaded_here = 0;
+
+ left_child = right_child = NULL;
+
+ coord_dup(&left_coord, from);
+ init_lh(&left_lh);
+ init_lh(&right_lh);
+ if (coord_prev_unit(&left_coord)) {
+ /* @from is leftmost item in its node */
+ if (!locked_left_neighbor) {
+ result =
+ reiser4_get_left_neighbor(&left_lh, from->node,
+ ZNODE_READ_LOCK,
+ GN_CAN_USE_UPPER_LEVELS);
+ switch (result) {
+ case 0:
+ break;
+ case -E_NO_NEIGHBOR:
+ /* there is no formatted node to the left of
+ from->node */
+ warning("vs-605",
+ "extent item has smallest key in "
+ "the tree and it is about to be removed");
+ return 0;
+ case -E_DEADLOCK:
+ /* need to restart */
+ default:
+ return result;
+ }
+
+ /* we have acquired left neighbor of from->node */
+ result = zload(left_lh.node);
+ if (result)
+ goto done;
+
+ locked_left_neighbor = left_lh.node;
+ } else {
+ /* squalloc_right_twig_cut should have supplied locked
+ * left neighbor */
+ assert("vs-834",
+ znode_is_write_locked(locked_left_neighbor));
+ result = zload(locked_left_neighbor);
+ if (result)
+ return result;
+ }
+
+ left_zloaded_here = 1;
+ coord_init_last_unit(&left_coord, locked_left_neighbor);
+ }
+
+ if (!item_is_internal(&left_coord)) {
+ /* what else but extent can be on twig level */
+ assert("vs-606", item_is_extent(&left_coord));
+
+ /* there is no left formatted child */
+ if (left_zloaded_here)
+ zrelse(locked_left_neighbor);
+ done_lh(&left_lh);
+ return 0;
+ }
+
+ tree = znode_get_tree(left_coord.node);
+ left_child = child_znode(&left_coord, left_coord.node, 1, 0);
+
+ if (IS_ERR(left_child)) {
+ result = PTR_ERR(left_child);
+ goto done;
+ }
+ /*
+ * left child is acquired, calculate new right delimiting
+ * key for it and get right child if it is necessary
+ */
+ if (item_removed_completely(from,
+ kdata->params.from_key,
+ kdata->params.to_key)) {
+ /*
+ * try to get right child of removed item
+ */
+ coord_t right_coord;
+
+ assert("vs-607",
+ kdata->params.to->unit_pos ==
+ coord_last_unit_pos(kdata->params.to));
+ coord_dup(&right_coord, kdata->params.to);
+ if (coord_next_unit(&right_coord)) {
+ /* @to is rightmost unit in the node */
+ result =
+ reiser4_get_right_neighbor(&right_lh, from->node,
+ ZNODE_READ_LOCK,
+ GN_CAN_USE_UPPER_LEVELS);
+ switch (result) {
+ case 0:
+ result = zload(right_lh.node);
+ if (result)
+ goto done;
+
+ right_zloaded_here = 1;
+ coord_init_first_unit(&right_coord,
+ right_lh.node);
+ item_key_by_coord(&right_coord, &key);
+ break;
+
+ case -E_NO_NEIGHBOR:
+ /* there is no formatted node to the right of
+ from->node */
+ read_lock_dk(tree);
+ key = *znode_get_rd_key(from->node);
+ read_unlock_dk(tree);
+ right_coord.node = NULL;
+ result = 0;
+ break;
+ default:
+ /* real error */
+ goto done;
+ }
+ } else {
+ /* there is an item to the right of @from - take its key */
+ item_key_by_coord(&right_coord, &key);
+ }
+
+ /* try to get right child of @from */
+ if (right_coord.node && /* there is right neighbor of @from */
+ item_is_internal(&right_coord)) { /* it is internal item */
+ right_child = child_znode(&right_coord,
+ right_coord.node, 1, 0);
+
+ if (IS_ERR(right_child)) {
+ result = PTR_ERR(right_child);
+ goto done;
+ }
+
+ }
+ /* whole extent is removed between znodes left_child and right_child. Prepare them for linking and
+ update of right delimiting key of left_child */
+ result = prepare_children(left_child, right_child, kdata);
+ } else {
+		/* head of item @to is removed. left_child has to get right delimiting key update. Prepare it for that */
+ result = prepare_children(left_child, NULL, kdata);
+ }
+
+ done:
+ if (right_child)
+ zput(right_child);
+ if (right_zloaded_here)
+ zrelse(right_lh.node);
+ done_lh(&right_lh);
+
+ if (left_child)
+ zput(left_child);
+ if (left_zloaded_here)
+ zrelse(locked_left_neighbor);
+ done_lh(&left_lh);
+ return result;
+}
+
+/* this is used to remove part of node content between coordinates @from and @to. Units to which @from and @to are set
+ are to be cut completely */
+/* for try_to_merge_with_left, delete_copied, reiser4_delete_node */
+int cut_node_content(coord_t * from, coord_t * to, const reiser4_key * from_key, /* first key to be removed */
+ const reiser4_key * to_key, /* last key to be removed */
+ reiser4_key *
+ smallest_removed /* smallest key actually removed */ )
+{
+ int result;
+ carry_pool *pool;
+ carry_level *lowest_level;
+ carry_cut_data *cut_data;
+ carry_op *op;
+
+ assert("vs-1715", coord_compare(from, to) != COORD_CMP_ON_RIGHT);
+
+ pool =
+ init_carry_pool(sizeof(*pool) + 3 * sizeof(*lowest_level) +
+ sizeof(*cut_data));
+ if (IS_ERR(pool))
+ return PTR_ERR(pool);
+ lowest_level = (carry_level *) (pool + 1);
+ init_carry_level(lowest_level, pool);
+
+ op = reiser4_post_carry(lowest_level, COP_CUT, from->node, 0);
+ assert("vs-1509", op != 0);
+ if (IS_ERR(op)) {
+ done_carry_pool(pool);
+ return PTR_ERR(op);
+ }
+
+ cut_data = (carry_cut_data *) (lowest_level + 3);
+ cut_data->params.from = from;
+ cut_data->params.to = to;
+ cut_data->params.from_key = from_key;
+ cut_data->params.to_key = to_key;
+ cut_data->params.smallest_removed = smallest_removed;
+
+ op->u.cut_or_kill.is_cut = 1;
+ op->u.cut_or_kill.u.cut = cut_data;
+
+ result = reiser4_carry(lowest_level, NULL);
+ done_carry_pool(pool);
+
+ return result;
+}
+
+/* cut part of the node
+
+ Cut part or whole content of node.
+
+ cut data between @from and @to of @from->node and call carry() to make
+ corresponding changes in the tree. @from->node may become empty. If so -
+ pointer to it will be removed. Neighboring nodes are not changed. Smallest
+ removed key is stored in @smallest_removed
+
+*/
+int kill_node_content(coord_t * from, /* coord of the first unit/item that will be eliminated */
+ coord_t * to, /* coord of the last unit/item that will be eliminated */
+ const reiser4_key * from_key, /* first key to be removed */
+ const reiser4_key * to_key, /* last key to be removed */
+ reiser4_key * smallest_removed, /* smallest key actually removed */
+ znode * locked_left_neighbor, /* this is set when kill_node_content is called with left neighbor
+ * locked (in squalloc_right_twig_cut, namely) */
+ struct inode *inode, /* inode of file whose item (or its part) is to be killed. This is necessary to
+ invalidate pages together with item pointing to them */
+ int truncate)
+{ /* this call is made for file truncate */
+ int result;
+ carry_pool *pool;
+ carry_level *lowest_level;
+ carry_kill_data *kdata;
+ lock_handle *left_child;
+ lock_handle *right_child;
+ carry_op *op;
+
+ assert("umka-328", from != NULL);
+ assert("vs-316", !node_is_empty(from->node));
+ assert("nikita-1812", coord_is_existing_unit(from)
+ && coord_is_existing_unit(to));
+
+ /* allocate carry_pool, 3 carry_level-s, carry_kill_data and structures for kill_hook_extent */
+ pool = init_carry_pool(sizeof(*pool) + 3 * sizeof(*lowest_level) +
+ sizeof(carry_kill_data) +
+ 2 * sizeof(lock_handle) +
+ 5 * sizeof(reiser4_key) + 2 * sizeof(coord_t));
+ if (IS_ERR(pool))
+ return PTR_ERR(pool);
+
+ lowest_level = (carry_level *) (pool + 1);
+ init_carry_level(lowest_level, pool);
+
+ kdata = (carry_kill_data *) (lowest_level + 3);
+ left_child = (lock_handle *) (kdata + 1);
+ right_child = left_child + 1;
+
+ init_lh(left_child);
+ init_lh(right_child);
+
+ kdata->params.from = from;
+ kdata->params.to = to;
+ kdata->params.from_key = from_key;
+ kdata->params.to_key = to_key;
+ kdata->params.smallest_removed = smallest_removed;
+ kdata->params.truncate = truncate;
+ kdata->flags = 0;
+ kdata->inode = inode;
+ kdata->left = left_child;
+ kdata->right = right_child;
+ /* memory for 5 reiser4_key and 2 coord_t will be used in kill_hook_extent */
+ kdata->buf = (char *)(right_child + 1);
+
+ if (znode_get_level(from->node) == TWIG_LEVEL && item_is_extent(from)) {
+ /* left child of extent item may have to get updated right
+ delimiting key and to get linked with right child of extent
+ @from if it will be removed completely */
+ result = prepare_twig_kill(kdata, locked_left_neighbor);
+ if (result) {
+ done_children(kdata);
+ done_carry_pool(pool);
+ return result;
+ }
+ }
+
+ op = reiser4_post_carry(lowest_level, COP_CUT, from->node, 0);
+ if (IS_ERR(op) || (op == NULL)) {
+ done_children(kdata);
+ done_carry_pool(pool);
+ return RETERR(op ? PTR_ERR(op) : -EIO);
+ }
+
+ op->u.cut_or_kill.is_cut = 0;
+ op->u.cut_or_kill.u.kill = kdata;
+
+ result = reiser4_carry(lowest_level, NULL);
+
+ done_children(kdata);
+ done_carry_pool(pool);
+ return result;
+}
+
+void
+fake_kill_hook_tail(struct inode *inode, loff_t start, loff_t end, int truncate)
+{
+ if (reiser4_inode_get_flag(inode, REISER4_HAS_MMAP)) {
+ pgoff_t start_pg, end_pg;
+
+ start_pg = start >> PAGE_SHIFT;
+ end_pg = (end - 1) >> PAGE_SHIFT;
+
+ if ((start & (PAGE_SIZE - 1)) == 0) {
+ /*
+ * kill up to the page boundary.
+ */
+ assert("vs-123456", start_pg == end_pg);
+ reiser4_invalidate_pages(inode->i_mapping, start_pg, 1,
+ truncate);
+ } else if (start_pg != end_pg) {
+ /*
+ * page boundary is within killed portion of node.
+ */
+ assert("vs-654321", end_pg - start_pg == 1);
+ reiser4_invalidate_pages(inode->i_mapping, end_pg,
+ end_pg - start_pg, 1);
+ }
+ }
+ inode_sub_bytes(inode, end - start);
+}
+
+/**
+ * Delete whole @node from the reiser4 tree without loading it.
+ *
+ * @left: locked left neighbor,
+ * @node: node to be deleted,
+ * @smallest_removed: leftmost key of deleted node,
+ * @object: inode pointer, if we truncate a file body.
+ * @truncate: true if called for file truncate.
+ *
+ * @return: 0 if success, error code otherwise.
+ *
+ * NOTE: if @object!=NULL we assume that @smallest_removed != NULL and it
+ * contains the right value of the smallest removed key from the previous
+ * cut_worker() iteration. This is needed for proper accounting of
+ * "i_blocks" and "i_bytes" fields of the @object.
+ */
+int reiser4_delete_node(znode *node, reiser4_key *smallest_removed,
+ struct inode *object, int truncate)
+{
+ lock_handle parent_lock;
+ coord_t cut_from;
+ coord_t cut_to;
+ reiser4_tree *tree;
+ int ret;
+
+ assert("zam-937", node != NULL);
+ assert("zam-933", znode_is_write_locked(node));
+ assert("zam-999", smallest_removed != NULL);
+
+ init_lh(&parent_lock);
+
+ ret = reiser4_get_parent(&parent_lock, node, ZNODE_WRITE_LOCK);
+ if (ret)
+ return ret;
+
+ assert("zam-934", !znode_above_root(parent_lock.node));
+
+ ret = zload(parent_lock.node);
+ if (ret)
+ goto failed_nozrelse;
+
+ ret = find_child_ptr(parent_lock.node, node, &cut_from);
+ if (ret)
+ goto failed;
+
+ /* decrement child counter and set parent pointer to NULL before
+	   deleting the item from the parent node because of checks in
+ internal_kill_item_hook (we can delete the last item from the parent
+ node, the parent node is going to be deleted and its c_count should
+ be zero). */
+
+ tree = znode_get_tree(node);
+ write_lock_tree();
+ init_parent_coord(&node->in_parent, NULL);
+ --parent_lock.node->c_count;
+ write_unlock_tree();
+
+ assert("zam-989", item_is_internal(&cut_from));
+
+ /* @node should be deleted after unlocking. */
+ ZF_SET(node, JNODE_HEARD_BANSHEE);
+
+ /* remove a pointer from the parent node to the node being deleted. */
+ coord_dup(&cut_to, &cut_from);
+ /* FIXME: shouldn't this be kill_node_content */
+ ret = cut_node_content(&cut_from, &cut_to, NULL, NULL, NULL);
+ if (ret)
+ /* FIXME(Zam): Should we re-connect the node to its parent if
+ * cut_node fails? */
+ goto failed;
+
+ {
+ __u64 start_offset = 0, end_offset = 0;
+
+ read_lock_tree();
+ write_lock_dk(tree);
+ if (object) {
+			/* We use @smallest_removed and the left delimiting key of
+ * the current node for @object->i_blocks, i_bytes
+ * calculation. We assume that the items after the
+ * *@smallest_removed key have been deleted from the
+ * file body. */
+ start_offset = get_key_offset(znode_get_ld_key(node));
+ end_offset = get_key_offset(smallest_removed);
+ }
+
+ assert("zam-1021", znode_is_connected(node));
+ if (node->left)
+ znode_set_rd_key(node->left, znode_get_rd_key(node));
+
+ *smallest_removed = *znode_get_ld_key(node);
+
+ write_unlock_dk(tree);
+ read_unlock_tree();
+
+ if (object) {
+ /* we used to perform actions which are to be performed on items on their removal from tree in
+ special item method - kill_hook. Here for optimization reasons we avoid reading node
+ containing item we remove and can not call item's kill hook. Instead we call function which
+ does exactly the same things as tail kill hook in assumption that node we avoid reading
+ contains only one item and that item is a tail one. */
+ fake_kill_hook_tail(object, start_offset, end_offset,
+ truncate);
+ }
+ }
+ failed:
+ zrelse(parent_lock.node);
+ failed_nozrelse:
+ done_lh(&parent_lock);
+
+ return ret;
+}
+
+static int can_delete(const reiser4_key *key, znode *node)
+{
+ int result;
+ reiser4_tree *tree = znode_get_tree(node);
+
+ read_lock_dk(tree);
+ result = keyle(key, znode_get_ld_key(node));
+ read_unlock_dk(tree);
+ return result;
+}
+
+/**
+ * This subroutine is not optimal, but the implementation seems to
+ * be easier.
+ *
+ * @tap: the point deletion process begins from,
+ * @from_key: the beginning of the deleted key range,
+ * @to_key: the end of the deleted key range,
+ * @smallest_removed: the smallest removed key,
+ * @truncate: true if called for file truncate.
+ * @progress: return true if a progress in file items deletions was made,
+ * @smallest_removed value is actual in that case.
+ *
+ * @return: 0 if success, error code otherwise, -E_REPEAT means that long
+ * reiser4_cut_tree operation was interrupted for allowing atom commit.
+ */
+int
+cut_tree_worker_common(tap_t * tap, const reiser4_key * from_key,
+ const reiser4_key * to_key,
+ reiser4_key * smallest_removed, struct inode *object,
+ int truncate, int *progress)
+{
+ lock_handle next_node_lock;
+ coord_t left_coord;
+ int result;
+
+ assert("zam-931", tap->coord->node != NULL);
+ assert("zam-932", znode_is_write_locked(tap->coord->node));
+
+ *progress = 0;
+ init_lh(&next_node_lock);
+
+ while (1) {
+ znode *node; /* node from which items are cut */
+ node_plugin *nplug; /* node plugin for @node */
+
+ node = tap->coord->node;
+
+ /* Move next_node_lock to the next node on the left. */
+ result =
+ reiser4_get_left_neighbor(&next_node_lock, node,
+ ZNODE_WRITE_LOCK,
+ GN_CAN_USE_UPPER_LEVELS);
+ if (result != 0 && result != -E_NO_NEIGHBOR)
+ break;
+ /* Check can we delete the node as a whole. */
+ if (*progress && znode_get_level(node) == LEAF_LEVEL &&
+ can_delete(from_key, node)) {
+ result = reiser4_delete_node(node, smallest_removed,
+ object, truncate);
+ } else {
+ result = reiser4_tap_load(tap);
+ if (result)
+ return result;
+
+ /* Prepare the second (right) point for cut_node() */
+ if (*progress)
+ coord_init_last_unit(tap->coord, node);
+
+ else if (item_plugin_by_coord(tap->coord)->b.lookup ==
+ NULL)
+ /* set rightmost unit for the items without lookup method */
+ tap->coord->unit_pos =
+ coord_last_unit_pos(tap->coord);
+
+ nplug = node->nplug;
+
+ assert("vs-686", nplug);
+ assert("vs-687", nplug->lookup);
+
+ /* left_coord is leftmost unit cut from @node */
+ result = nplug->lookup(node, from_key,
+ FIND_MAX_NOT_MORE_THAN,
+ &left_coord);
+
+ if (IS_CBKERR(result))
+ break;
+
+ /* adjust coordinates so that they are set to existing units */
+ if (coord_set_to_right(&left_coord)
+ || coord_set_to_left(tap->coord)) {
+ result = 0;
+ break;
+ }
+
+ if (coord_compare(&left_coord, tap->coord) ==
+ COORD_CMP_ON_RIGHT) {
+ /* keys from @from_key to @to_key are not in the tree */
+ result = 0;
+ break;
+ }
+
+ if (left_coord.item_pos != tap->coord->item_pos) {
+				/* do not allow cutting more than one item. This was added to solve the problem of
+				   truncating partially converted files. If a file is partially converted, there may exist
+				   a twig node containing both internal items pointing to leaf nodes with formatting items
+				   and an extent item. We do not want to kill internal items at the twig node here, because
+				   cut_tree_worker assumes killing them from the leaf level */
+ coord_dup(&left_coord, tap->coord);
+ assert("vs-1652",
+ coord_is_existing_unit(&left_coord));
+ left_coord.unit_pos = 0;
+ }
+
+ /* cut data from one node */
+ /* *smallest_removed = *reiser4_min_key(); */
+ result =
+ kill_node_content(&left_coord, tap->coord, from_key,
+ to_key, smallest_removed,
+ next_node_lock.node, object,
+ truncate);
+ reiser4_tap_relse(tap);
+ }
+ if (result)
+ break;
+
+ ++(*progress);
+
+ /* Check whether all items with keys >= from_key were removed
+ * from the tree. */
+ if (keyle(smallest_removed, from_key))
+ /* result = 0; */
+ break;
+
+ if (next_node_lock.node == NULL)
+ break;
+
+ result = reiser4_tap_move(tap, &next_node_lock);
+ done_lh(&next_node_lock);
+ if (result)
+ break;
+
+ /* Break long reiser4_cut_tree operation (deletion of a large
+ file) if atom requires commit. */
+ if (*progress > CUT_TREE_MIN_ITERATIONS
+ && current_atom_should_commit()) {
+ result = -E_REPEAT;
+ break;
+ }
+ }
+ done_lh(&next_node_lock);
+ /* assert("vs-301", !keyeq(&smallest_removed, reiser4_min_key())); */
+ return result;
+}
+
+/* there is a fundamental problem with optimizing deletes: VFS does it
+ one file at a time. Another problem is that if an item can be
+ anything, then deleting items must be done one at a time. It just
+   seems clean to write this to specify a from and a to key, and cut
+ everything between them though. */
+
+/* use this function with care if deleting more than what is part of a single file. */
+/* do not use this when cutting a single item, it is suboptimal for that */
+
+/* You are encouraged to write plugin specific versions of this. It
+ cannot be optimal for all plugins because it works item at a time,
+ and some plugins could sometimes work node at a time. Regular files
+ however are not optimizable to work node at a time because of
+ extents needing to free the blocks they point to.
+
+ Optimizations compared to v3 code:
+
+ It does not balance (that task is left to memory pressure code).
+
+ Nodes are deleted only if empty.
+
+ Uses extents.
+
+ Performs read-ahead of formatted nodes whose contents are part of
+ the deletion.
+*/
+
+/**
+ * Delete everything from the reiser4 tree between two keys: @from_key and
+ * @to_key.
+ *
+ * @from_key: the beginning of the deleted key range,
+ * @to_key: the end of the deleted key range,
+ * @smallest_removed: the smallest removed key,
+ * @object: owner of cutting items.
+ * @truncate: true if called for file truncate.
+ * @progress: return true if a progress in file items deletions was made,
+ * @smallest_removed value is actual in that case.
+ *
+ * @return: 0 if success, error code otherwise, -E_REPEAT means that long cut_tree
+ * operation was interrupted for allowing atom commit.
+ */
+
+int reiser4_cut_tree_object(reiser4_tree * tree, const reiser4_key * from_key,
+ const reiser4_key * to_key,
+ reiser4_key * smallest_removed_p,
+ struct inode *object, int truncate, int *progress)
+{
+ lock_handle lock;
+ int result;
+ tap_t tap;
+ coord_t right_coord;
+ reiser4_key smallest_removed;
+ int (*cut_tree_worker) (tap_t *, const reiser4_key *,
+ const reiser4_key *, reiser4_key *,
+ struct inode *, int, int *);
+ STORE_COUNTERS;
+
+ assert("umka-329", tree != NULL);
+ assert("umka-330", from_key != NULL);
+ assert("umka-331", to_key != NULL);
+ assert("zam-936", keyle(from_key, to_key));
+
+ if (smallest_removed_p == NULL)
+ smallest_removed_p = &smallest_removed;
+
+ init_lh(&lock);
+
+ do {
+ /* Find rightmost item to cut away from the tree. */
+ result = reiser4_object_lookup(tree,
+ object, to_key, &right_coord,
+ &lock, ZNODE_WRITE_LOCK,
+ FIND_MAX_NOT_MORE_THAN,
+ TWIG_LEVEL, LEAF_LEVEL,
+ CBK_UNIQUE, NULL /*ra_info */);
+ if (result != CBK_COORD_FOUND)
+ break;
+ if (object == NULL
+ || inode_file_plugin(object)->cut_tree_worker == NULL)
+ cut_tree_worker = cut_tree_worker_common;
+ else
+ cut_tree_worker =
+ inode_file_plugin(object)->cut_tree_worker;
+ reiser4_tap_init(&tap, &right_coord, &lock, ZNODE_WRITE_LOCK);
+ result =
+ cut_tree_worker(&tap, from_key, to_key, smallest_removed_p,
+ object, truncate, progress);
+ reiser4_tap_done(&tap);
+
+ reiser4_preempt_point();
+
+ } while (0);
+
+ done_lh(&lock);
+
+ if (result) {
+ switch (result) {
+ case -E_DEADLOCK:
+ result = -E_REPEAT;
+ case -E_NO_NEIGHBOR:
+ case -E_REPEAT:
+ case -ENOMEM:
+ case -ENOENT:
+ break;
+ default:
+ warning("nikita-2861", "failure: %i", result);
+ }
+ }
+
+ CHECK_COUNTERS;
+ return result;
+}
+
+/* repeat reiser4_cut_tree_object until everything is deleted.
+ * unlike cut_file_items, it does not end current transaction if -E_REPEAT
+ * is returned by cut_tree_object. */
+int reiser4_cut_tree(reiser4_tree * tree, const reiser4_key * from,
+ const reiser4_key * to, struct inode *inode, int truncate)
+{
+ int result;
+ int progress;
+
+ do {
+ result = reiser4_cut_tree_object(tree, from, to, NULL,
+ inode, truncate, &progress);
+ if (result == -E_NO_NEIGHBOR)
+ result = 0;
+ } while (result == -E_REPEAT);
+
+ return result;
+}
+
+/**
+ * Update item key and respectively delimiting keys on the upper
+ * levels (if needed).
+ *
+ * @target: item, whose key needs to be updated
+ * @key: new value of the key
+ */
+int update_item_key(coord_t *target, const reiser4_key *key)
+{
+ znode *node;
+ carry_pool *pool;
+ carry_level *todo;
+ carry_plugin_info info;
+
+ pool = init_carry_pool(sizeof(*pool) + 3 * sizeof(*todo));
+ if (IS_ERR(pool))
+ return PTR_ERR(pool);
+ todo = (carry_level *) (pool + 1);
+ init_carry_level(todo, pool);
+
+ info.doing = NULL;
+ info.todo = todo;
+
+ node = target->node;
+ node_plugin_by_node(node)->update_item_key(target, key, &info);
+
+ if (target->item_pos == 0) {
+ int ret;
+ reiser4_tree *tree;
+
+ /*
+ * ->update_item_key() could post COP_UPDATE
+ */
+ ret = reiser4_carry(todo, NULL /* previous level */);
+ if (ret) {
+ done_carry_pool(pool);
+ return ret;
+ }
+ tree = znode_get_tree(node);
+ read_lock_tree();
+ write_lock_dk(tree);
+
+ znode_set_ld_key(node, key);
+ if (znode_is_left_connected(node) && node->left)
+ znode_set_rd_key(node->left, key);
+
+ write_unlock_dk(tree);
+ read_unlock_tree();
+ }
+ znode_make_dirty(node);
+
+ done_carry_pool(pool);
+ return 0;
+}
+
+int reiser4_subvol_init_tree(struct reiser4_subvol *subv,
+ const reiser4_block_nr *root_block,
+ tree_level height, node_plugin *nplug)
+{
+ int result;
+ reiser4_tree *tree = &subv->tree;
+
+ assert("nikita-307", root_block != NULL);
+ assert("nikita-308", height > 0);
+ assert("nikita-309", nplug != NULL);
+ assert("edward-171", get_current_context() != NULL);
+ /*
+ * We'll perform costly memory allocations for znode hash table, etc.
+ * So, set proper allocation flags
+ */
+ get_current_context()->gfp_mask |= (__GFP_NOWARN);
+
+ tree->subvol = subv;
+ /*
+ * Set default tree options (came from init_super)
+ */
+ tree->cbk_cache.nr_slots = CBK_CACHE_SLOTS;
+ tree->carry.new_node_flags = REISER4_NEW_NODE_FLAGS;
+ tree->carry.new_extent_flags = REISER4_NEW_EXTENT_FLAGS;
+ tree->carry.paste_flags = REISER4_PASTE_FLAGS;
+ tree->carry.insert_flags = REISER4_INSERT_FLAGS;
+
+ spin_lock_init(&(tree->epoch_lock));
+
+ tree->root_block = *root_block;
+ tree->height = height;
+ tree->estimate_one_insert = calc_estimate_one_insert(height);
+ tree->nplug = nplug;
+
+ tree->znode_epoch = 1ull;
+
+ cbk_cache_init(&tree->cbk_cache);
+
+ result = znodes_tree_init(tree);
+ if (result == 0) {
+ tree->uber = zget(subv, &UBER_TREE_ADDR, NULL, 0,
+ reiser4_ctx_gfp_mask_get());
+ if (IS_ERR(tree->uber)) {
+ result = PTR_ERR(tree->uber);
+ tree->uber = NULL;
+ }
+ }
+ return result;
+}
+
+/* release resources associated with @tree */
+void reiser4_done_tree(reiser4_tree * tree /* tree to release */ )
+{
+ if (tree == NULL)
+ return;
+
+ if (tree->uber != NULL) {
+ zput(tree->uber);
+ tree->uber = NULL;
+ }
+ znodes_tree_done(tree);
+ cbk_cache_done(&tree->cbk_cache);
+}
+
+/* Make Linus happy.
+ Local variables:
+ c-indentation-style: "K&R"
+ mode-name: "LC"
+ c-basic-offset: 8
+ tab-width: 8
+ fill-column: 120
+ scroll-step: 1
+ End:
+*/
diff -urN --no-dereference linux-5.10.2.orig/fs/reiser4/tree.h linux-5.10.2/fs/reiser4/tree.h
--- linux-5.10.2.orig/fs/reiser4/tree.h 1970-01-01 01:00:00.000000000 +0100
+++ linux-5.10.2/fs/reiser4/tree.h 2020-12-23 16:07:46.135813378 +0100
@@ -0,0 +1,497 @@
+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
+ * reiser4/README */
+
+/* Tree operations. See fs/reiser4/tree.c for comments */
+
+#if !defined( __REISER4_TREE_H__ )
+#define __REISER4_TREE_H__
+
+#include "forward.h"
+#include "debug.h"
+#include "dformat.h"
+#include "plugin/node/node.h"
+#include "plugin/plugin.h"
+#include "znode.h"
+#include "tap.h"
+
+#include <linux/types.h> /* for __u?? */
+#include <linux/fs.h> /* for struct super_block */
+#include <linux/spinlock.h>
+#include <linux/sched.h> /* for struct task_struct */
+
+/* fictive block number never actually used */
+extern const reiser4_block_nr UBER_TREE_ADDR;
+
+/* &cbk_cache_slot - entry in a coord cache.
+
+ This is entry in a coord_by_key (cbk) cache, represented by
+ &cbk_cache.
+
+*/
+typedef struct cbk_cache_slot {
+ /* cached node */
+ znode *node;
+ /* linkage to the next cbk cache slot in a LRU order */
+ struct list_head lru;
+} cbk_cache_slot;
+
+/* &cbk_cache - coord cache. This is part of reiser4_tree.
+
+ cbk_cache is supposed to speed up tree lookups by caching results of recent
+ successful lookups (we don't cache negative results as dentry cache
+ does). The cache consists of a relatively small number of entries kept in
+ LRU order. Each entry (&cbk_cache_slot) contains a pointer to a znode,
+ from which we can obtain the range of keys covered by that znode. Before
+ embarking on a real tree traversal we scan the cbk_cache slot by slot and
+ for each slot check whether the key we are looking for falls between the
+ minimal and maximal keys of the node pointed to by this slot. If no match
+ is found, a real tree traversal is performed and, if it succeeds, the
+ appropriate entry is inserted into the cache, possibly pulling the least
+ recently used entry out of it.
+
+ The tree spin lock is used to protect the coord cache. If contention for
+ this lock proves to be too high, finer-grained locking can be added.
+
+ Invariants involving parts of this data-type:
+
+ [cbk-cache-invariant]
+*/
+typedef struct cbk_cache {
+ /* serializator */
+ rwlock_t guard;
+ int nr_slots;
+ /* head of LRU list of cache slots */
+ struct list_head lru;
+ /* actual array of slots */
+ cbk_cache_slot *slot;
+} cbk_cache;
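+
+/*
+ * Illustrative sketch only (an assumption of this note, not part of the
+ * reiser4 sources): conceptually, a cbk_cache lookup walks the LRU list and
+ * returns the first cached znode whose delimiting keys bracket the key being
+ * searched for.  The real scan lives in tree.c and additionally takes
+ * cache->guard and the delimiting-key lock; the helper names used below
+ * (keyle(), keylt(), znode_get_ld_key(), znode_get_rd_key()) are assumed to
+ * be the ones declared in key.h/znode.h.
+ *
+ *	cbk_cache_slot *slot;
+ *
+ *	list_for_each_entry(slot, &cache->lru, lru) {
+ *		znode *node = slot->node;
+ *
+ *		if (node == NULL)
+ *			continue;
+ *		if (keyle(znode_get_ld_key(node), key) &&
+ *		    keylt(key, znode_get_rd_key(node)))
+ *			return node;	<- hit: finish the lookup in this node
+ *	}
+ *	return NULL;			<- miss: do a full top-down traversal
+ */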
+
+/* level_lookup_result - possible outcome of looking up key at some level.
+ This is used by coord_by_key when traversing tree downward. */
+typedef enum {
+ /* continue to the next level */
+ LOOKUP_CONT,
+ /* done. Either required item was found, or we can prove it
+ doesn't exist, or some error occurred. */
+ LOOKUP_DONE,
+ /* restart traversal from the root. Infamous "repetition". */
+ LOOKUP_REST
+} level_lookup_result;
+
+/* This is the representation of the internal reiser4 tree where all
+ file-system data and meta-data are stored. This structure is passed to all
+ tree manipulation functions. It is different from the super block because
+ we don't want to limit ourselves to a strictly one-to-one mapping between
+ super blocks and trees, and because they are logically different: there
+ are things in a super block that have no relation to the tree (bitmaps,
+ journalling area, mount options, etc.) and there are things in a tree that
+ bear no relation to the super block, like the tree of znodes.
+
+ At this time, there is only one tree
+ per filesystem, and this struct is part of the super block. We only
+ call the super block the super block for historical reasons (most
+ other filesystems call the per filesystem metadata the super block).
+*/
+
+struct reiser4_tree {
+ /* block_nr == 0 is fake znode. Write lock it, while changing
+ tree height. */
+ /* disk address of root node of a tree */
+ reiser4_block_nr root_block;
+
+ /* level of the root node. If this is 1, tree consists of root
+ node only */
+ tree_level height;
+
+ /*
+ * this is cached here to avoid calling plugins through a function
+ * dereference all the time.
+ */
+ __u64 estimate_one_insert;
+
+ /* cache of recent tree lookup results */
+ cbk_cache cbk_cache;
+
+ /* hash table to look up znodes by block number. */
+ z_hash_table zhash_table;
+ z_hash_table zfake_table;
+ /* lock protecting delimiting keys */
+ rwlock_t dk_lock;
+
+ /* spin lock protecting znode_epoch */
+ spinlock_t epoch_lock;
+ /* version stamp used to mark znode updates. See seal.[ch] for more
+ * information. */
+ __u64 znode_epoch;
+
+ znode *uber;
+ node_plugin *nplug;
+ reiser4_subvol *subvol;
+ struct {
+ /* carry flags used for insertion of new nodes */
+ __u32 new_node_flags;
+ /* carry flags used for insertion of new extents */
+ __u32 new_extent_flags;
+ /* carry flags used for paste operations */
+ __u32 paste_flags;
+ /* carry flags used for insert operations */
+ __u32 insert_flags;
+ } carry;
+};
+
+extern int reiser4_subvol_init_tree(struct reiser4_subvol *subvol,
+ const reiser4_block_nr *root_block,
+ tree_level height, node_plugin *nplug);
+extern void reiser4_done_tree(reiser4_tree * tree);
+
+/* cbk flags: options for coord_by_key() */
+typedef enum {
+ /* coord_by_key() is called for insertion. This is necessary because
+ of extents being located at the twig level. For explanation, see
+ comment just above is_next_item_internal().
+ */
+ CBK_FOR_INSERT = (1 << 0),
+ /* coord_by_key() is called with key that is known to be unique */
+ CBK_UNIQUE = (1 << 1),
+ /* coord_by_key() can trust delimiting keys. This option is not user
+ accessible. coord_by_key() will set it automatically. It will be
+ only cleared by special-case in extents-on-the-twig-level handling
+ where it is necessary to insert item with a key smaller than
+ leftmost key in a node. This is necessary because of extents being
+ located at the twig level. For explanation, see comment just above
+ is_next_item_internal().
+ */
+ CBK_TRUST_DK = (1 << 2),
+ CBK_READA = (1 << 3), /* original: readahead leaves which contain items of certain file */
+ CBK_READDIR_RA = (1 << 4), /* readdir: readahead whole directory and all its stat datas */
+ CBK_DKSET = (1 << 5),
+ CBK_EXTENDED_COORD = (1 << 6), /* coord_t is actually */
+ CBK_IN_CACHE = (1 << 7), /* node is already in cache */
+ CBK_USE_CRABLOCK = (1 << 8) /* use crab_lock instead of a long-term
+ * lock */
+} cbk_flags;
+
+/* insertion outcome. IBK = insert by key */
+typedef enum {
+ IBK_INSERT_OK = 0,
+ IBK_ALREADY_EXISTS = -EEXIST,
+ IBK_IO_ERROR = -EIO,
+ IBK_NO_SPACE = -E_NODE_FULL,
+ IBK_OOM = -ENOMEM
+} insert_result;
+
+#define IS_CBKERR(err) ((err) != CBK_COORD_FOUND && (err) != CBK_COORD_NOTFOUND)
+
+typedef int (*tree_iterate_actor_t) (reiser4_tree * tree, coord_t * coord,
+ lock_handle * lh, void *arg);
+extern int reiser4_iterate_tree(reiser4_tree * tree, coord_t * coord,
+ lock_handle * lh,
+ tree_iterate_actor_t actor, void *arg,
+ znode_lock_mode mode, int through_units_p);
+extern int get_uber_znode(reiser4_tree * tree, znode_lock_mode mode,
+ znode_lock_request pri, lock_handle * lh);
+
+/* return node plugin of @node */
+static inline node_plugin *node_plugin_by_node(const znode *
+ node /* node to query */ )
+{
+ assert("vs-213", node != NULL);
+ assert("vs-214", znode_is_loaded(node));
+
+ return node->nplug;
+}
+
+/* number of items in @node */
+static inline pos_in_node_t node_num_items(const znode * node)
+{
+ assert("nikita-2754", znode_is_loaded(node));
+ assert("nikita-2468",
+ node_plugin_by_node(node)->num_of_items(node) == node->nr_items);
+
+ return node->nr_items;
+}
+
+/* Return the number of items at the present node. Asserts coord->node !=
+ NULL. */
+static inline unsigned coord_num_items(const coord_t * coord)
+{
+ assert("jmacd-9805", coord->node != NULL);
+
+ return node_num_items(coord->node);
+}
+
+/* true if @node is empty */
+static inline int node_is_empty(const znode * node)
+{
+ return node_num_items(node) == 0;
+}
+
+typedef enum {
+ SHIFTED_SOMETHING = 0,
+ SHIFT_NO_SPACE = -E_NODE_FULL,
+ SHIFT_IO_ERROR = -EIO,
+ SHIFT_OOM = -ENOMEM,
+} shift_result;
+
+extern node_plugin *node_plugin_by_coord(const coord_t * coord);
+extern int is_coord_in_node(const coord_t * coord);
+extern int key_in_node(const reiser4_key *, const coord_t *);
+extern void coord_item_move_to(coord_t * coord, int items);
+extern void coord_unit_move_to(coord_t * coord, int units);
+
+/* there are two types of repetitive accesses (ra): intra-syscall
+ (local) and inter-syscall (global). Local ra is used when, during a
+ single syscall, we add/delete several items and units in the same place
+ in a tree. Note that plan-A fragments local ra by separating stat-data
+ and file body in key-space. Global ra is used when the user makes
+ repetitive modifications in the same place in a tree.
+
+ Our ra implementation serves the following purposes:
+ 1 it affects balancing decisions so that the next operation in a row
+ can be performed faster;
+ 2 it affects lower-level read-ahead in the page cache;
+ 3 it avoids unnecessary lookups by maintaining some state across
+ several operations (this is only for local ra);
+ 4 it leaves room for lazy micro-balancing: when we start a sequence of
+ operations they are performed without actually doing any intra-node
+ shifts, until we finish the sequence or its scope leaves the current
+ node; only then do we really pack the node (local ra only).
+*/
+
+/* another thing that can be useful is to keep a per-tree and/or
+ per-process cache of recent lookups. This cache can be organised as a
+ list of block numbers of formatted nodes sorted by the starting key in
+ the node. Balancing should invalidate the appropriate parts of this
+ cache.
+*/
+
+lookup_result coord_by_key(reiser4_tree * tree, const reiser4_key * key,
+ coord_t * coord, lock_handle * handle,
+ znode_lock_mode lock, lookup_bias bias,
+ tree_level lock_level, tree_level stop_level,
+ __u32 flags, ra_info_t *);
+
+lookup_result reiser4_object_lookup(reiser4_tree *tree,
+ struct inode *object,
+ const reiser4_key * key,
+ coord_t * coord,
+ lock_handle * lh,
+ znode_lock_mode lock_mode,
+ lookup_bias bias,
+ tree_level lock_level,
+ tree_level stop_level,
+ __u32 flags, ra_info_t * info);
+
+insert_result insert_by_key(reiser4_tree * tree, const reiser4_key * key,
+ reiser4_item_data * data, coord_t * coord,
+ lock_handle * lh,
+ tree_level stop_level, __u32 flags);
+insert_result insert_by_coord(coord_t * coord,
+ reiser4_item_data * data, const reiser4_key * key,
+ lock_handle * lh, __u32);
+insert_result insert_extent_by_coord(coord_t * coord,
+ reiser4_item_data * data,
+ const reiser4_key * key, lock_handle * lh);
+int cut_node_content(coord_t * from, coord_t * to, const reiser4_key * from_key,
+ const reiser4_key * to_key,
+ reiser4_key * smallest_removed);
+int kill_node_content(coord_t * from, coord_t * to,
+ const reiser4_key * from_key, const reiser4_key * to_key,
+ reiser4_key * smallest_removed,
+ znode * locked_left_neighbor, struct inode *inode,
+ int truncate);
+
+int reiser4_resize_item(coord_t * coord, reiser4_item_data * data,
+ reiser4_key * key, lock_handle * lh, cop_insert_flag);
+int insert_into_item(coord_t * coord, lock_handle * lh, const reiser4_key * key,
+ reiser4_item_data * data, unsigned);
+int reiser4_insert_flow(coord_t * coord, lock_handle * lh, flow_t * f);
+int find_new_child_ptr(znode * parent, znode * child, znode * left,
+ coord_t * result);
+
+int shift_right_of_but_excluding_insert_coord(coord_t * insert_coord);
+int shift_left_of_and_including_insert_coord(coord_t * insert_coord);
+
+void fake_kill_hook_tail(struct inode *, loff_t start, loff_t end, int);
+
+extern int cut_tree_worker_common(tap_t *, const reiser4_key *,
+ const reiser4_key *, reiser4_key *,
+ struct inode *, int, int *);
+extern int reiser4_cut_tree_object(reiser4_tree *, const reiser4_key *,
+ const reiser4_key *, reiser4_key *,
+ struct inode *, int, int *);
+extern int reiser4_cut_tree(reiser4_tree * tree, const reiser4_key * from,
+ const reiser4_key * to, struct inode *, int);
+
+extern int reiser4_delete_node(znode *, reiser4_key *, struct inode *, int);
+extern int check_tree_pointer(const coord_t * pointer, const znode * child);
+extern int find_new_child_ptr(znode * parent, znode * child UNUSED_ARG,
+ znode * left, coord_t * result);
+extern int find_child_ptr(znode * parent, znode * child, coord_t * result);
+extern int set_child_delimiting_keys(znode * parent, const coord_t * in_parent,
+ znode * child);
+extern znode *child_znode(const coord_t * in_parent, znode * parent,
+ int incore_p, int setup_dkeys_p);
+
+extern int cbk_cache_init(cbk_cache * cache);
+extern void cbk_cache_done(cbk_cache * cache);
+extern void cbk_cache_invalidate(const znode * node, reiser4_tree * tree);
+
+extern char *sprint_address(const reiser4_block_nr * block);
+
+#if REISER4_DEBUG
+extern void print_coord_content(const char *prefix, coord_t * p);
+extern void reiser4_print_address(const char *prefix,
+ const reiser4_block_nr * block);
+extern void print_tree_rec(const char *prefix, reiser4_tree * tree,
+ __u32 flags);
+extern void check_dkeys(znode *node);
+#else
+#define print_coord_content(p, c) noop
+#define reiser4_print_address(p, b) noop
+#endif
+
+extern void forget_znode(lock_handle * handle);
+extern int deallocate_znode(znode * node);
+
+extern int is_disk_addr_unallocated(const reiser4_block_nr * addr);
+
+/* struct used internally to pack all numerous arguments of tree lookup.
+ Used to avoid passing a lot of arguments to helper functions. */
+typedef struct cbk_handle {
+ /* tree we are in */
+ reiser4_tree *tree;
+ /* key we are going after */
+ const reiser4_key *key;
+ /* coord we will store result in */
+ coord_t *coord;
+ /* type of lock to take on target node */
+ znode_lock_mode lock_mode;
+ /* lookup bias. See comments at the declaration of lookup_bias */
+ lookup_bias bias;
+ /* lock level: level starting from which tree traversal starts taking
+ * write locks. */
+ tree_level lock_level;
+ /* level where search will stop. Either item will be found between
+ lock_level and stop_level, or CBK_COORD_NOTFOUND will be
+ returned.
+ */
+ tree_level stop_level;
+ /* level we are currently at */
+ tree_level level;
+ /* block number of @active node. Tree traversal operates on two
+ nodes: active and parent. */
+ reiser4_block_nr block;
+ /* put here error message to be printed by caller */
+ const char *error;
+ /* result passed back to caller */
+ int result;
+ /* lock handles for active and parent */
+ lock_handle *parent_lh;
+ lock_handle *active_lh;
+ reiser4_key ld_key;
+ reiser4_key rd_key;
+ /* flags, passed to the cbk routine. Bits of this bitmask are defined
+ in tree.h:cbk_flags enum. */
+ __u32 flags;
+ ra_info_t *ra_info;
+ struct inode *object;
+} cbk_handle;
+
+extern znode_lock_mode cbk_lock_mode(tree_level level, cbk_handle * h);
+
+/* eottl.c */
+extern int handle_eottl(cbk_handle *h, int *outcome);
+
+int lookup_multikey(cbk_handle * handle, int nr_keys);
+int lookup_couple(reiser4_tree * tree,
+ const reiser4_key * key1, const reiser4_key * key2,
+ coord_t * coord1, coord_t * coord2,
+ lock_handle * lh1, lock_handle * lh2,
+ znode_lock_mode lock_mode, lookup_bias bias,
+ tree_level lock_level, tree_level stop_level, __u32 flags,
+ int *result1, int *result2);
+
+static inline void read_lock_dk(reiser4_tree *tree)
+{
+ /* check that dk is not locked */
+ assert("", (LOCK_CNT_NIL(rw_locked_dk) &&
+ LOCK_CNT_NIL(read_locked_dk) &&
+ LOCK_CNT_NIL(write_locked_dk)));
+ /* check that spinlocks of lower priorities are not held */
+ assert("", LOCK_CNT_NIL(spin_locked_stack));
+
+ read_lock(&((tree)->dk_lock));
+
+ LOCK_CNT_INC(read_locked_dk);
+ LOCK_CNT_INC(rw_locked_dk);
+ LOCK_CNT_INC(spin_locked);
+}
+
+static inline void read_unlock_dk(reiser4_tree *tree)
+{
+ assert("nikita-1375", LOCK_CNT_GTZ(read_locked_dk));
+ assert("nikita-1376", LOCK_CNT_GTZ(rw_locked_dk));
+ assert("nikita-1376", LOCK_CNT_GTZ(spin_locked));
+
+ LOCK_CNT_DEC(read_locked_dk);
+ LOCK_CNT_DEC(rw_locked_dk);
+ LOCK_CNT_DEC(spin_locked);
+
+ read_unlock(&(tree->dk_lock));
+}
+
+static inline void write_lock_dk(reiser4_tree *tree)
+{
+ /* check that dk is not locked */
+ assert("", (LOCK_CNT_NIL(rw_locked_dk) &&
+ LOCK_CNT_NIL(read_locked_dk) &&
+ LOCK_CNT_NIL(write_locked_dk)));
+ /* check that spinlocks of lower priorities are not held */
+ assert("", LOCK_CNT_NIL(spin_locked_stack));
+
+ write_lock(&((tree)->dk_lock));
+
+ LOCK_CNT_INC(write_locked_dk);
+ LOCK_CNT_INC(rw_locked_dk);
+ LOCK_CNT_INC(spin_locked);
+}
+
+static inline void write_unlock_dk(reiser4_tree *tree)
+{
+ assert("nikita-1375", LOCK_CNT_GTZ(write_locked_dk));
+ assert("nikita-1376", LOCK_CNT_GTZ(rw_locked_dk));
+ assert("nikita-1376", LOCK_CNT_GTZ(spin_locked));
+
+ LOCK_CNT_DEC(write_locked_dk);
+ LOCK_CNT_DEC(rw_locked_dk);
+ LOCK_CNT_DEC(spin_locked);
+
+ write_unlock(&(tree->dk_lock));
+}
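+
+/*
+ * Usage sketch (illustrative only): delimiting keys are copied out under the
+ * dk lock rather than used through a live pointer after the lock is dropped,
+ * as add_child_ptr() in tree_mod.c does:
+ *
+ *	reiser4_key key;
+ *
+ *	read_lock_dk(znode_get_tree(parent));
+ *	key = *znode_get_ld_key(child);
+ *	read_unlock_dk(znode_get_tree(parent));
+ */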
+
+/* estimate api. Implementation is in estimate.c */
+reiser4_block_nr estimate_one_insert_item(reiser4_tree *);
+reiser4_block_nr estimate_one_insert_into_item(reiser4_tree *);
+reiser4_block_nr estimate_insert_flow(tree_level);
+reiser4_block_nr estimate_one_item_removal(reiser4_tree *);
+reiser4_block_nr calc_estimate_one_insert(tree_level);
+reiser4_block_nr estimate_dirty_cluster(struct inode *);
+reiser4_block_nr estimate_insert_cluster(struct inode *);
+reiser4_block_nr estimate_update_cluster(struct inode *);
+
+/* __REISER4_TREE_H__ */
+#endif
+
+/* Make Linus happy.
+ Local variables:
+ c-indentation-style: "K&R"
+ mode-name: "LC"
+ c-basic-offset: 8
+ tab-width: 8
+ fill-column: 120
+ scroll-step: 1
+ End:
+*/
diff -urN --no-dereference linux-5.10.2.orig/fs/reiser4/tree_mod.c linux-5.10.2/fs/reiser4/tree_mod.c
--- linux-5.10.2.orig/fs/reiser4/tree_mod.c 1970-01-01 01:00:00.000000000 +0100
+++ linux-5.10.2/fs/reiser4/tree_mod.c 2020-12-23 16:07:46.135813378 +0100
@@ -0,0 +1,391 @@
+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
+ * reiser4/README */
+
+/*
+ * Functions to add/delete new nodes to/from the tree.
+ *
+ * Functions from this file are used by carry (see carry*) to handle:
+ *
+ * . insertion of new formatted node into tree
+ *
+ * . addition of new tree root, increasing tree height
+ *
+ * . removing tree root, decreasing tree height
+ *
+ */
+
+#include "forward.h"
+#include "debug.h"
+#include "dformat.h"
+#include "key.h"
+#include "coord.h"
+#include "plugin/plugin.h"
+#include "jnode.h"
+#include "znode.h"
+#include "tree_mod.h"
+#include "block_alloc.h"
+#include "tree_walk.h"
+#include "tree.h"
+#include "super.h"
+
+#include <linux/err.h>
+
+static int add_child_ptr(znode * parent, znode * child);
+/* warning only issued if error is not -E_REPEAT */
+#define ewarning( error, ... ) \
+ if( ( error ) != -E_REPEAT ) \
+ warning( __VA_ARGS__ )
+
+/*
+ * allocate a new node at @level, immediately to the right of @brother
+ */
+znode *reiser4_new_node(znode *brother, /* existing left neighbor of new node */
+ tree_level level /* tree level at which new node is to
+ * be allocated */)
+{
+ znode *result;
+ int retcode;
+ reiser4_subvol *subv;
+ reiser4_block_nr blocknr;
+
+ assert("nikita-930", brother != NULL);
+ assert("umka-264", level < REAL_MAX_ZTREE_HEIGHT);
+
+ subv = znode_get_subvol(brother);
+ assert("edward-1735", subv != NULL);
+
+ retcode = assign_fake_blocknr_formatted(&blocknr, subv);
+ if (retcode == 0) {
+ result = zget(subv, &blocknr, NULL, level,
+ reiser4_ctx_gfp_mask_get());
+ if (IS_ERR(result)) {
+ ewarning(PTR_ERR(result), "nikita-929",
+ "Cannot allocate znode for carry: %li",
+ PTR_ERR(result));
+ return result;
+ }
+ /* cheap test, can be executed even when debugging is off */
+ if (!znode_just_created(result)) {
+ warning("nikita-2213",
+ "Allocated already existing block: %llu",
+ (unsigned long long)blocknr);
+ zput(result);
+ return ERR_PTR(RETERR(-EIO));
+ }
+
+ assert("nikita-931", result != NULL);
+ result->nplug = znode_get_tree(brother)->nplug;
+ assert("nikita-933", result->nplug != NULL);
+
+ retcode = zinit_new(result, reiser4_ctx_gfp_mask_get());
+ if (retcode == 0) {
+ ZF_SET(result, JNODE_CREATED);
+ zrelse(result);
+ } else {
+ zput(result);
+ result = ERR_PTR(retcode);
+ }
+ } else {
+ /* failure to allocate a new node during balancing.
+ This should never happen. Ever. Returning -E_REPEAT
+ is not a viable solution, because "out of disk space"
+ is not a transient error that will go away by itself.
+ */
+ ewarning(retcode, "nikita-928",
+ "Cannot allocate block for carry: %i", retcode);
+ result = ERR_PTR(retcode);
+ }
+ assert("nikita-1071", result != NULL);
+ return result;
+}
+
+/* allocate new root and add it to the tree
+
+ This helper function is called by add_new_root().
+
+*/
+znode *reiser4_add_tree_root(znode * old_root /* existing tree root */ ,
+ znode * fake /* "fake" znode */ )
+{
+ reiser4_tree *tree = znode_get_tree(old_root);
+ znode *new_root = NULL; /* to shut gcc up */
+ int result;
+
+ assert("nikita-1069", old_root != NULL);
+ assert("umka-262", fake != NULL);
+ assert("umka-263", tree != NULL);
+
+ /* "fake" znode---one always hanging just above current root. This
+ node is locked when new root is created or existing root is
+ deleted. Downward tree traversal takes lock on it before taking
+ lock on a root node. This avoids race conditions with root
+ manipulations.
+
+ */
+ assert("nikita-1348", znode_above_root(fake));
+ assert("nikita-1211", znode_is_root(old_root));
+
+ result = 0;
+ if (tree->height >= REAL_MAX_ZTREE_HEIGHT) {
+ warning("nikita-1344", "Tree is too tall: %i", tree->height);
+ /* ext2 returns -ENOSPC when it runs out of free inodes with a
+ following comment (fs/ext2/ialloc.c:441): Is it really
+ ENOSPC?
+
+ -EXFULL? -EINVAL?
+ */
+ result = RETERR(-ENOSPC);
+ } else {
+ /* Allocate a block for the new root. It's not that
+ important where it will be allocated, as the root is
+ almost always in memory. Moreover, allocate-on-flush
+ can be going on here.
+ */
+ assert("nikita-1448", znode_is_root(old_root));
+ new_root = reiser4_new_node(fake, tree->height + 1);
+ if (!IS_ERR(new_root) && (result = zload(new_root)) == 0) {
+ lock_handle rlh;
+
+ init_lh(&rlh);
+ result =
+ longterm_lock_znode(&rlh, new_root,
+ ZNODE_WRITE_LOCK,
+ ZNODE_LOCK_LOPRI);
+ if (result == 0) {
+ parent_coord_t *in_parent;
+
+ znode_make_dirty(fake);
+
+ /* new root is a child of "fake" node */
+ write_lock_tree();
+
+ ++tree->height;
+
+ /* recalculate max balance overhead */
+ tree->estimate_one_insert =
+ calc_estimate_one_insert(tree->height);
+
+ tree->root_block = *znode_get_block(new_root);
+ in_parent = &new_root->in_parent;
+ init_parent_coord(in_parent, fake);
+ /* manually insert new root into sibling
+ * list. With this all nodes involved into
+ * balancing are connected after balancing is
+ * done---useful invariant to check. */
+ sibling_list_insert_nolock(new_root, NULL);
+ write_unlock_tree();
+
+ /* insert into new root pointer to the
+ @old_root. */
+ assert("nikita-1110",
+ WITH_DATA(new_root,
+ node_is_empty(new_root)));
+ write_lock_dk(tree);
+ znode_set_ld_key(new_root, reiser4_min_key());
+ znode_set_rd_key(new_root, reiser4_max_key());
+ write_unlock_dk(tree);
+ if (REISER4_DEBUG) {
+ ZF_CLR(old_root, JNODE_LEFT_CONNECTED);
+ ZF_CLR(old_root, JNODE_RIGHT_CONNECTED);
+ ZF_SET(old_root, JNODE_ORPHAN);
+ }
+ result = add_child_ptr(new_root, old_root);
+ done_lh(&rlh);
+ }
+ zrelse(new_root);
+ }
+ }
+ if (result != 0)
+ new_root = ERR_PTR(result);
+ return new_root;
+}
+
+/* build &reiser4_item_data for inserting child pointer
+
+ Build &reiser4_item_data that can be later used to insert pointer to @child
+ in its parent.
+
+*/
+void build_child_ptr_data(znode * child /* node pointer to which will be
+ * inserted */ ,
+ reiser4_item_data * data /* where to store result */ )
+{
+ assert("nikita-1116", child != NULL);
+ assert("nikita-1117", data != NULL);
+
+ /*
+ * NOTE: use the address of the child's blocknr as the address of the data
+ * to be inserted. As a result, the data gets into the on-disk structure in
+ * CPU byte order. internal's create_hook converts it to little-endian byte
+ * order.
+ */
+ data->data = (char *)znode_get_block(child);
+ /* data -> data is kernel space */
+ data->user = 0;
+ data->length = sizeof(reiser4_block_nr);
+ /* FIXME-VS: hardcoded internal item? */
+
+ /* AUDIT: Is it possible that "item_plugin_by_id" may find nothing? */
+ data->iplug = item_plugin_by_id(NODE_POINTER_ID);
+}
+
+/* add pointer to @child into empty @parent.
+
+ This is used when pointer to old root is inserted into new root which is
+ empty.
+*/
+static int add_child_ptr(znode * parent, znode * child)
+{
+ coord_t coord;
+ reiser4_item_data data;
+ int result;
+ reiser4_key key;
+
+ assert("nikita-1111", parent != NULL);
+ assert("nikita-1112", child != NULL);
+ assert("nikita-1115",
+ znode_get_level(parent) == znode_get_level(child) + 1);
+
+ result = zload(parent);
+ if (result != 0)
+ return result;
+ assert("nikita-1113", node_is_empty(parent));
+ coord_init_first_unit(&coord, parent);
+
+ build_child_ptr_data(child, &data);
+ data.arg = NULL;
+
+ read_lock_dk(znode_get_tree(parent));
+ key = *znode_get_ld_key(child);
+ read_unlock_dk(znode_get_tree(parent));
+
+ result = node_plugin_by_node(parent)->create_item(&coord, &key, &data,
+ NULL);
+ znode_make_dirty(parent);
+ zrelse(parent);
+ return result;
+}
+
+/* actually remove tree root */
+static int reiser4_kill_root(reiser4_tree * tree /* tree from which root is
+ * being removed */,
+ znode * old_root /* root node that is being
+ * removed */ ,
+ znode * new_root /* new root---sole child of
+ * @old_root */,
+ const reiser4_block_nr * new_root_blk /* disk address of
+ * @new_root */)
+{
+ znode *uber;
+ int result;
+ lock_handle handle_for_uber;
+
+ assert("umka-265", tree != NULL);
+ assert("nikita-1198", new_root != NULL);
+ assert("nikita-1199",
+ znode_get_level(new_root) + 1 == znode_get_level(old_root));
+
+ assert("nikita-1201", znode_is_write_locked(old_root));
+
+ assert("nikita-1203",
+ disk_addr_eq(new_root_blk, znode_get_block(new_root)));
+
+ init_lh(&handle_for_uber);
+ /* obtain and lock "fake" znode protecting changes in tree height. */
+ result = get_uber_znode(tree, ZNODE_WRITE_LOCK, ZNODE_LOCK_HIPRI,
+ &handle_for_uber);
+ if (result == 0) {
+ uber = handle_for_uber.node;
+
+ znode_make_dirty(uber);
+
+ /* don't take a long-term lock on @new_root. Take a spinlock. */
+
+ write_lock_tree();
+
+ tree->root_block = *new_root_blk;
+ --tree->height;
+
+ /* recalculate max balance overhead */
+ tree->estimate_one_insert =
+ calc_estimate_one_insert(tree->height);
+
+ assert("nikita-1202",
+ tree->height == znode_get_level(new_root));
+
+ /* new root is a child of the "fake" node */
+ init_parent_coord(&new_root->in_parent, uber);
+ ++uber->c_count;
+
+ /* sibling_list_insert_nolock(new_root, NULL); */
+ write_unlock_tree();
+
+ /* reinitialise old root. */
+ result = init_znode(ZJNODE(old_root));
+ znode_make_dirty(old_root);
+ if (result == 0) {
+ assert("nikita-1279", node_is_empty(old_root));
+ ZF_SET(old_root, JNODE_HEARD_BANSHEE);
+ old_root->c_count = 0;
+ }
+ }
+ done_lh(&handle_for_uber);
+
+ return result;
+}
+
+/* remove tree root
+
+ This function removes tree root, decreasing tree height by one. Tree root
+ and its only child (that is going to become new tree root) are write locked
+ at the entry.
+
+ To remove tree root we need to take lock on special "fake" znode that
+ protects changes of tree height. See comments in reiser4_add_tree_root() for
+ more on this.
+
+ Also parent pointers have to be updated in
+ old and new root. To simplify code, function is split into two parts: outer
+ reiser4_kill_tree_root() collects all necessary arguments and calls
+ reiser4_kill_root() to do the actual job.
+
+*/
+int reiser4_kill_tree_root(znode * old_root /* tree root that we are
+ removing*/)
+{
+ int result;
+ coord_t down_link;
+ znode *new_root;
+ reiser4_tree *tree;
+
+ assert("edward-1736", znode_get_subvol(old_root) != NULL);
+ assert("nikita-1194", old_root != NULL);
+ assert("nikita-1196", znode_is_root(old_root));
+ assert("nikita-1200", node_num_items(old_root) == 1);
+ assert("nikita-1401", znode_is_write_locked(old_root));
+
+ coord_init_first_unit(&down_link, old_root);
+
+ tree = znode_get_tree(old_root);
+ new_root = child_znode(&down_link, old_root, 0, 1);
+ if (!IS_ERR(new_root)) {
+ result =
+ reiser4_kill_root(tree, old_root, new_root,
+ znode_get_block(new_root));
+ zput(new_root);
+ } else
+ result = PTR_ERR(new_root);
+
+ return result;
+}
+
+/* Make Linus happy.
+ Local variables:
+ c-indentation-style: "K&R"
+ mode-name: "LC"
+ c-basic-offset: 8
+ tab-width: 8
+ fill-column: 120
+ scroll-step: 1
+ End:
+*/
diff -urN --no-dereference linux-5.10.2.orig/fs/reiser4/tree_mod.h linux-5.10.2/fs/reiser4/tree_mod.h
--- linux-5.10.2.orig/fs/reiser4/tree_mod.h 1970-01-01 01:00:00.000000000 +0100
+++ linux-5.10.2/fs/reiser4/tree_mod.h 2020-12-23 16:07:46.135813378 +0100
@@ -0,0 +1,29 @@
+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
+ * reiser4/README */
+
+/* Functions to add/delete new nodes to/from the tree. See tree_mod.c for
+ * comments. */
+
+#if !defined( __REISER4_TREE_MOD_H__ )
+#define __REISER4_TREE_MOD_H__
+
+#include "forward.h"
+
+znode *reiser4_new_node(znode * brother, tree_level level);
+znode *reiser4_add_tree_root(znode * old_root, znode * fake);
+int reiser4_kill_tree_root(znode * old_root);
+void build_child_ptr_data(znode * child, reiser4_item_data * data);
+
+/* __REISER4_TREE_MOD_H__ */
+#endif
+
+/* Make Linus happy.
+ Local variables:
+ c-indentation-style: "K&R"
+ mode-name: "LC"
+ c-basic-offset: 8
+ tab-width: 8
+ fill-column: 120
+ scroll-step: 1
+ End:
+*/
diff -urN --no-dereference linux-5.10.2.orig/fs/reiser4/tree_walk.c linux-5.10.2/fs/reiser4/tree_walk.c
--- linux-5.10.2.orig/fs/reiser4/tree_walk.c 1970-01-01 01:00:00.000000000 +0100
+++ linux-5.10.2/fs/reiser4/tree_walk.c 2020-12-23 16:07:46.135813378 +0100
@@ -0,0 +1,922 @@
+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
+ * reiser4/README */
+
+/* Routines and macros to:
+
+ get_left_neighbor()
+
+ get_right_neighbor()
+
+ get_parent()
+
+ get_first_child()
+
+ get_last_child()
+
+ various routines to walk the whole tree and do things to it like
+ repack it, or move it to tertiary storage. Please make them as
+ generic as is reasonable.
+
+*/
+
+#include "forward.h"
+#include "debug.h"
+#include "dformat.h"
+#include "coord.h"
+#include "plugin/item/item.h"
+#include "jnode.h"
+#include "znode.h"
+#include "tree_walk.h"
+#include "tree.h"
+#include "super.h"
+
+/* These macros are used internally in tree_walk.c in an attempt to make the
+ lock_neighbor() code usable for building lock_parent(), lock_right_neighbor
+ and lock_left_neighbor */
+#define GET_NODE_BY_PTR_OFFSET(node, off) (*(znode**)(((unsigned long)(node)) + (off)))
+#define FIELD_OFFSET(name) offsetof(znode, name)
+#define PARENT_PTR_OFFSET FIELD_OFFSET(in_parent.node)
+#define LEFT_PTR_OFFSET FIELD_OFFSET(left)
+#define RIGHT_PTR_OFFSET FIELD_OFFSET(right)
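+
+/*
+ * Illustrative note (not in the original sources): with these macros,
+ *
+ *	GET_NODE_BY_PTR_OFFSET(node, LEFT_PTR_OFFSET)
+ *
+ * is just another way of spelling node->left, so lock_parent(),
+ * lock_left_neighbor and lock_right_neighbor all become instances of
+ * lock_neighbor() that differ only in the ptr_offset they pass.
+ */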
+
+/* This is the generic procedure to get and lock a `generic' neighbor (left or
+ right neighbor or parent). It implements the common algorithm for all cases
+ of getting a lock on a neighbor node; only the znode structure field differs
+ in each case. It is parameterized by the ptr_offset argument, which is the
+ byte offset of the pointer to the desired neighbor within the current node's
+ znode structure. This function should be called with the tree lock held */
+static int lock_neighbor(
+ /* resulting lock handle */
+ lock_handle * result,
+ /* znode to lock */
+ znode * node,
+ /* pointer to neighbor (or parent) znode field offset, in bytes from
+ the base address of znode structure */
+ int ptr_offset,
+ /* lock mode for longterm_lock_znode call */
+ znode_lock_mode mode,
+ /* lock request for longterm_lock_znode call */
+ znode_lock_request req,
+ /* GN_* flags */
+ int flags, int rlocked)
+{
+ int ret;
+ znode *neighbor;
+
+ assert("umka-236", node != NULL);
+ assert("umka-237", znode_get_tree(node) != NULL);
+ assert_rw_locked(&(znode_get_tree(node)->tree_lock));
+
+ if (flags & GN_TRY_LOCK)
+ req |= ZNODE_LOCK_NONBLOCK;
+ if (flags & GN_SAME_ATOM)
+ req |= ZNODE_LOCK_DONT_FUSE;
+
+ /* get the neighbor's address by using the sibling link; quit the while
+ loop (and return) if the link is not available. */
+ while (1) {
+ neighbor = GET_NODE_BY_PTR_OFFSET(node, ptr_offset);
+
+ /* return -E_NO_NEIGHBOR if parent or side pointer is NULL or if
+ * node pointed by it is not connected.
+ *
+ * However, GN_ALLOW_NOT_CONNECTED option masks "connected"
+ * check and allows passing reference to not connected znode to
+ * subsequent longterm_lock_znode() call. This kills possible
+ * busy loop if we are trying to get longterm lock on locked but
+ * not yet connected parent node. */
+ if (neighbor == NULL || !((flags & GN_ALLOW_NOT_CONNECTED)
+ || znode_is_connected(neighbor))) {
+ return RETERR(-E_NO_NEIGHBOR);
+ }
+
+ /* protect it from deletion. */
+ zref(neighbor);
+
+ rlocked ? read_unlock_tree() : write_unlock_tree();
+
+ ret = longterm_lock_znode(result, neighbor, mode, req);
+
+ /* The lock handle obtains its own reference, release the one from above. */
+ zput(neighbor);
+
+ rlocked ? read_lock_tree() : write_lock_tree();
+
+ /* restart if the node we got a reference to is being
+ invalidated. We should not take a reference to this node
+ again. */
+ if (ret == -EINVAL)
+ continue;
+ if (ret)
+ return ret;
+
+ /* check if neighbor link still points to just locked znode;
+ the link could have been changed while the process slept. */
+ if (neighbor == GET_NODE_BY_PTR_OFFSET(node, ptr_offset))
+ return 0;
+
+ /* znode was locked by mistake; unlock it and restart locking
+ process from beginning. */
+ rlocked ? read_unlock_tree() : write_unlock_tree();
+ longterm_unlock_znode(result);
+ rlocked ? read_lock_tree() : write_lock_tree();
+ }
+}
+
+/* get parent node with longterm lock, accepts GN* flags. */
+int reiser4_get_parent_flags(lock_handle * lh /* resulting lock handle */ ,
+ znode * node /* child node */ ,
+ znode_lock_mode mode
+ /* type of lock: read or write */ ,
+ int flags /* GN_* flags */ )
+{
+ int result;
+
+ read_lock_tree();
+ result = lock_neighbor(lh, node, PARENT_PTR_OFFSET, mode,
+ ZNODE_LOCK_HIPRI, flags, 1);
+ read_unlock_tree();
+ return result;
+}
+
+/* wrapper function to lock right or left neighbor depending on GN_GO_LEFT
+ bit in @flags parameter */
+/* Audited by: umka (2002.06.14) */
+static inline int
+lock_side_neighbor(lock_handle * result,
+ znode * node, znode_lock_mode mode, int flags, int rlocked)
+{
+ int ret;
+ int ptr_offset;
+ znode_lock_request req;
+
+ if (flags & GN_GO_LEFT) {
+ ptr_offset = LEFT_PTR_OFFSET;
+ req = ZNODE_LOCK_LOPRI;
+ } else {
+ ptr_offset = RIGHT_PTR_OFFSET;
+ req = ZNODE_LOCK_HIPRI;
+ }
+
+ ret =
+ lock_neighbor(result, node, ptr_offset, mode, req, flags, rlocked);
+
+ if (ret == -E_NO_NEIGHBOR) /* if we walk left or right, -E_NO_NEIGHBOR does
+ * not guarantee that the neighbor is absent from
+ * the tree; in this case we return -ENOENT, which
+ * means the neighbor was at least not found in
+ * the cache */
+ return RETERR(-ENOENT);
+
+ return ret;
+}
+
+#if REISER4_DEBUG
+
+int check_sibling_list(znode * node)
+{
+ znode *scan;
+ znode *next;
+
+ assert("nikita-3283", LOCK_CNT_GTZ(write_locked_tree));
+
+ if (node == NULL)
+ return 1;
+
+ if (ZF_ISSET(node, JNODE_RIP))
+ return 1;
+
+ assert("nikita-3270", node != NULL);
+ assert_rw_write_locked(&(znode_get_tree(node)->tree_lock));
+
+ for (scan = node; znode_is_left_connected(scan); scan = next) {
+ next = scan->left;
+ if (next != NULL && !ZF_ISSET(next, JNODE_RIP)) {
+ assert("nikita-3271", znode_is_right_connected(next));
+ assert("nikita-3272", next->right == scan);
+ } else
+ break;
+ }
+ for (scan = node; znode_is_right_connected(scan); scan = next) {
+ next = scan->right;
+ if (next != NULL && !ZF_ISSET(next, JNODE_RIP)) {
+ assert("nikita-3273", znode_is_left_connected(next));
+ assert("nikita-3274", next->left == scan);
+ } else
+ break;
+ }
+ return 1;
+}
+
+#endif
+
+/* Znode sibling pointers maintenance. */
+
+/* Znode sibling pointers are established between any neighboring nodes which
+ are in cache. There are two znode state bits (JNODE_LEFT_CONNECTED,
+ JNODE_RIGHT_CONNECTED); if the left or right sibling pointer contains an
+ actual value (even NULL), the corresponding JNODE_*_CONNECTED bit is set.
+
+ Reiser4 tree operations which may allocate new znodes (CBK, tree balancing)
+ take care of searching for znode neighbors (a hash table lookup may be
+ required), establishing sibling pointers between them and setting the
+ JNODE_*_CONNECTED state bits. */
+
+/* adjusting of sibling pointers and `connected' states for two
+ neighbors; works if one neighbor is NULL (was not found). */
+
+/* FIXME-VS: this is unstatic-ed to use in tree.c in prepare_twig_cut */
+void link_left_and_right(znode * left, znode * right)
+{
+ assert("nikita-3275", check_sibling_list(left));
+ assert("nikita-3275", check_sibling_list(right));
+
+ if (left != NULL) {
+ if (left->right == NULL) {
+ left->right = right;
+ ZF_SET(left, JNODE_RIGHT_CONNECTED);
+
+ ON_DEBUG(left->right_version =
+ atomic_inc_return(&delim_key_version);
+ );
+
+ } else if (ZF_ISSET(left->right, JNODE_HEARD_BANSHEE)
+ && left->right != right) {
+
+ ON_DEBUG(left->right->left_version =
+ atomic_inc_return(&delim_key_version);
+ left->right_version =
+ atomic_inc_return(&delim_key_version););
+
+ left->right->left = NULL;
+ left->right = right;
+ ZF_SET(left, JNODE_RIGHT_CONNECTED);
+ } else
+ /*
+ * there is a race condition in renew_sibling_link()
+ * and the assertions below check that it is the only
+ * one there. Thread T1 calls renew_sibling_link()
+ * without the GN_NO_ALLOC flag. zlook() doesn't find
+ * the neighbor node, but before T1 gets to
+ * link_left_and_right(), another thread T2 creates
+ * the neighbor node and connects it. The check for
+ * left->right == NULL above protects T1 from
+ * overwriting the correct left->right pointer
+ * installed by T2.
+ */
+ assert("nikita-3302",
+ right == NULL || left->right == right);
+ }
+ if (right != NULL) {
+ if (right->left == NULL) {
+ right->left = left;
+ ZF_SET(right, JNODE_LEFT_CONNECTED);
+
+ ON_DEBUG(right->left_version =
+ atomic_inc_return(&delim_key_version);
+ );
+
+ } else if (ZF_ISSET(right->left, JNODE_HEARD_BANSHEE)
+ && right->left != left) {
+
+ ON_DEBUG(right->left->right_version =
+ atomic_inc_return(&delim_key_version);
+ right->left_version =
+ atomic_inc_return(&delim_key_version););
+
+ right->left->right = NULL;
+ right->left = left;
+ ZF_SET(right, JNODE_LEFT_CONNECTED);
+
+ } else
+ assert("nikita-3303",
+ left == NULL || right->left == left);
+ }
+ assert("nikita-3275", check_sibling_list(left));
+ assert("nikita-3275", check_sibling_list(right));
+}
+
+/* Audited by: umka (2002.06.14) */
+static void link_znodes(znode * first, znode * second, int to_left)
+{
+ if (to_left)
+ link_left_and_right(second, first);
+ else
+ link_left_and_right(first, second);
+}
+
+/* get the next coord unit position (to the left or to the right, depending
+ on the GN_GO_LEFT bit in flags) in the horizontal direction, even across a
+ node boundary. Should be called under the tree lock; the lock protects the
+ nonexistence of a sibling link at the parent level if lock_side_neighbor()
+ fails with -ENOENT. */
+static int far_next_coord(coord_t * coord, lock_handle * handle, int flags)
+{
+ int ret;
+ znode *node;
+ reiser4_tree *tree;
+
+ assert("umka-243", coord != NULL);
+ assert("umka-244", handle != NULL);
+ assert("zam-1069", handle->node == NULL);
+
+ ret =
+ (flags & GN_GO_LEFT) ? coord_prev_unit(coord) :
+ coord_next_unit(coord);
+ if (!ret)
+ return 0;
+
+ ret =
+ lock_side_neighbor(handle, coord->node, ZNODE_READ_LOCK, flags, 0);
+ if (ret)
+ return ret;
+
+ node = handle->node;
+ tree = znode_get_tree(node);
+ write_unlock_tree();
+
+ coord_init_zero(coord);
+
+ /* We avoid synchronous read here if it is specified by flag. */
+ if ((flags & GN_ASYNC) && znode_page(handle->node) == NULL) {
+ ret = jstartio(ZJNODE(handle->node));
+ if (!ret)
+ ret = -E_REPEAT;
+ goto error_locked;
+ }
+
+ /* the corresponding zrelse() should be called by the clients of
+ far_next_coord(), at the place where this node gets unlocked. */
+ ret = zload(handle->node);
+ if (ret)
+ goto error_locked;
+
+ if (flags & GN_GO_LEFT)
+ coord_init_last_unit(coord, node);
+ else
+ coord_init_first_unit(coord, node);
+
+ if (0) {
+ error_locked:
+ longterm_unlock_znode(handle);
+ }
+ write_lock_tree();
+ return ret;
+}
+
+/* Very significant function which performs a step in the horizontal direction
+ when a sibling pointer is not available. Actually, it is the only function
+ which does it.
+ Note: this function does not restore locking status at exit; the
+ caller should take care of proper unlocking and zrelsing */
+static int
+renew_sibling_link(coord_t * coord, lock_handle * handle, znode * child,
+ tree_level level, int flags, int *nr_locked)
+{
+ int ret;
+ int to_left = flags & GN_GO_LEFT;
+ reiser4_block_nr da;
+ /* parent of the neighbor node; we set it to the parent until we detect
+ that the child and the neighbor node do not share one parent */
+ znode *side_parent = coord->node;
+ reiser4_subvol *subv = ZJNODE(child)->subvol;
+ reiser4_tree *tree = znode_get_tree(child);
+ znode *neighbor = NULL;
+
+ assert("umka-245", coord != NULL);
+ assert("umka-246", handle != NULL);
+ assert("umka-247", child != NULL);
+ assert("umka-303", tree != NULL);
+
+ init_lh(handle);
+ write_lock_tree();
+ ret = far_next_coord(coord, handle, flags);
+
+ if (ret) {
+ if (ret != -ENOENT) {
+ write_unlock_tree();
+ return ret;
+ }
+ } else {
+ item_plugin *iplug;
+
+ if (handle->node != NULL) {
+ (*nr_locked)++;
+ side_parent = handle->node;
+ }
+
+ /* does the coord object point to an internal item? We do not
+ support sibling pointers between znodes for formatted and
+ unformatted nodes and return -E_NO_NEIGHBOR in that case. */
+ iplug = item_plugin_by_coord(coord);
+ if (!item_is_internal(coord)) {
+ link_znodes(child, NULL, to_left);
+ write_unlock_tree();
+ /* we know there can't be formatted neighbor */
+ return RETERR(-E_NO_NEIGHBOR);
+ }
+ write_unlock_tree();
+
+ iplug->s.internal.down_link(coord, NULL, &da);
+
+ if (flags & GN_NO_ALLOC)
+ neighbor = zlook(tree, &da);
+ else
+ neighbor = zget(subv, &da, side_parent, level,
+ reiser4_ctx_gfp_mask_get());
+
+ if (IS_ERR(neighbor)) {
+ ret = PTR_ERR(neighbor);
+ return ret;
+ }
+
+ if (neighbor)
+ /* update delimiting keys */
+ set_child_delimiting_keys(coord->node, coord, neighbor);
+
+ write_lock_tree();
+ }
+
+ if (likely(neighbor == NULL ||
+ (znode_get_level(child) == znode_get_level(neighbor)
+ && child != neighbor)))
+ link_znodes(child, neighbor, to_left);
+ else {
+ warning("nikita-3532",
+ "Sibling nodes on the different levels: %i != %i\n",
+ znode_get_level(child), znode_get_level(neighbor));
+ ret = RETERR(-EIO);
+ }
+
+ write_unlock_tree();
+
+ /* if GN_NO_ALLOC isn't set we keep reference to neighbor znode */
+ if (neighbor != NULL && (flags & GN_NO_ALLOC))
+ /* atomic_dec(&ZJNODE(neighbor)->x_count); */
+ zput(neighbor);
+
+ return ret;
+}
+
+/* This function is for establishing a one-sided sibling relation. */
+/* Audited by: umka (2002.06.14) */
+static int connect_one_side(coord_t * coord, znode * node, int flags)
+{
+ coord_t local;
+ lock_handle handle;
+ int nr_locked;
+ int ret;
+
+ assert("umka-248", coord != NULL);
+ assert("umka-249", node != NULL);
+
+ coord_dup_nocheck(&local, coord);
+
+ init_lh(&handle);
+
+ ret =
+ renew_sibling_link(&local, &handle, node, znode_get_level(node),
+ flags | GN_NO_ALLOC, &nr_locked);
+
+ if (handle.node != NULL) {
+ /* complementary operations for zload() and lock() in far_next_coord() */
+ zrelse(handle.node);
+ longterm_unlock_znode(&handle);
+ }
+
+ /* we catch error codes which are not interesting for us because we
+ run renew_sibling_link() only for znode connection. */
+ if (ret == -ENOENT || ret == -E_NO_NEIGHBOR)
+ return 0;
+
+ return ret;
+}
+
+/* if @child is not in `connected' state, performs hash searches for left and
+ right neighbor nodes and establishes horizontal sibling links */
+/* Audited by: umka (2002.06.14), umka (2002.06.15) */
+int connect_znode(coord_t * parent_coord, znode * child)
+{
+ int ret = 0;
+
+ assert("zam-330", parent_coord != NULL);
+ assert("zam-331", child != NULL);
+ assert("zam-332", parent_coord->node != NULL);
+ assert("umka-305", znode_get_tree(child) != NULL);
+
+ /* it is trivial to `connect' root znode because it can't have
+ neighbors */
+ if (znode_above_root(parent_coord->node)) {
+ child->left = NULL;
+ child->right = NULL;
+ ZF_SET(child, JNODE_LEFT_CONNECTED);
+ ZF_SET(child, JNODE_RIGHT_CONNECTED);
+
+ ON_DEBUG(child->left_version =
+ atomic_inc_return(&delim_key_version);
+ child->right_version =
+ atomic_inc_return(&delim_key_version););
+
+ return 0;
+ }
+
+ /* load parent node */
+ coord_clear_iplug(parent_coord);
+ ret = zload(parent_coord->node);
+
+ if (ret != 0)
+ return ret;
+
+ /* protect `connected' state check by tree_lock */
+ read_lock_tree();
+
+ if (!znode_is_right_connected(child)) {
+ read_unlock_tree();
+ /* connect right (default is right) */
+ ret = connect_one_side(parent_coord, child, GN_NO_ALLOC);
+ if (ret)
+ goto zrelse_and_ret;
+
+ read_lock_tree();
+ }
+
+ ret = znode_is_left_connected(child);
+
+ read_unlock_tree();
+
+ if (!ret) {
+ ret =
+ connect_one_side(parent_coord, child,
+ GN_NO_ALLOC | GN_GO_LEFT);
+ } else
+ ret = 0;
+
+ zrelse_and_ret:
+ zrelse(parent_coord->node);
+
+ return ret;
+}
+
+/* this function is like renew_sibling_link() but allocates the neighbor node
+ if it doesn't exist and `connects' it. It may require making two steps in
+ the horizontal direction: the first one to find/allocate the neighbor node,
+ the second one to find the neighbor's neighbor in order to connect the
+ freshly allocated znode. */
+/* Audited by: umka (2002.06.14), umka (2002.06.15) */
+static int
+renew_neighbor(coord_t * coord, znode * node, tree_level level, int flags)
+{
+ coord_t local;
+ lock_handle empty[2];
+ znode *neighbor = NULL;
+ int nr_locked = 0;
+ int ret;
+
+ assert("umka-250", coord != NULL);
+ assert("umka-251", node != NULL);
+ assert("umka-307", znode_get_tree(node) != NULL);
+ assert("umka-308", level <= znode_get_tree(node)->height);
+
+ /* umka (2002.06.14)
+ There should probably be a check here for the validity of the given "level".
+ Something like assert("xxx-yyy", level < REAL_MAX_ZTREE_HEIGHT);
+ */
+
+ coord_dup(&local, coord);
+
+ ret =
+ renew_sibling_link(&local, &empty[0], node, level,
+ flags & ~GN_NO_ALLOC, &nr_locked);
+ if (ret)
+ goto out;
+
+ /* tree lock is not needed here because we keep parent node(s) locked
+ and reference to neighbor znode incremented */
+ neighbor = (flags & GN_GO_LEFT) ? node->left : node->right;
+
+ read_lock_tree();
+ ret = znode_is_connected(neighbor);
+ read_unlock_tree();
+ if (ret) {
+ ret = 0;
+ goto out;
+ }
+
+ ret =
+ renew_sibling_link(&local, &empty[nr_locked], neighbor, level,
+ flags | GN_NO_ALLOC, &nr_locked);
+ /* second renew_sibling_link() call is used for znode connection only,
+ so we can live with these errors */
+ if (-ENOENT == ret || -E_NO_NEIGHBOR == ret)
+ ret = 0;
+
+ out:
+
+ for (--nr_locked; nr_locked >= 0; --nr_locked) {
+ zrelse(empty[nr_locked].node);
+ longterm_unlock_znode(&empty[nr_locked]);
+ }
+
+ if (neighbor != NULL)
+ /* decrement znode reference counter without actually
+ releasing it. */
+ atomic_dec(&ZJNODE(neighbor)->x_count);
+
+ return ret;
+}
+
+/*
+ reiser4_get_neighbor() -- lock node's neighbor.
+
+ reiser4_get_neighbor() locks the node's neighbor (the left or the right one,
+ depending on the given parameter) using the sibling link to it. If the
+ sibling link is not available (i.e. the neighbor znode is not in cache) and
+ the flags allow reading blocks, we go one level up for information about the
+ neighbor's disk address. We lock the node's parent; if it is the common
+ parent of both 'node' and its neighbor, the neighbor's disk address is in
+ the next (to the left or to the right) down link from the link that points
+ to the original node. If not, we need to lock the parent's neighbor, read
+ its content and take the first (last) downlink with the neighbor's disk
+ address. That locking could be done by using the sibling link and the
+ lock_neighbor() function, if the sibling link exists. Otherwise we have to
+ go up another level until we find a common parent or a valid sibling link.
+ Then go down allocating/connecting/locking/reading nodes until the neighbor
+ of the first one is locked.
+
+ @neighbor: result lock handle,
+ @node: a node which we lock neighbor of,
+ @lock_mode: lock mode {LM_READ, LM_WRITE},
+ @flags: logical OR of {GN_*} (see description above) subset.
+
+ @return: 0 if success, negative value if lock was impossible due to an error
+ or lack of neighbor node.
+*/
+
+/* Audited by: umka (2002.06.14), umka (2002.06.15) */
+int
+reiser4_get_neighbor(lock_handle * neighbor, znode * node,
+ znode_lock_mode lock_mode, int flags)
+{
+ lock_handle path[REAL_MAX_ZTREE_HEIGHT];
+
+ coord_t coord;
+
+ tree_level base_level;
+ tree_level h = 0;
+ int ret;
+
+ assert("umka-252", znode_get_tree(node) != NULL);
+ assert("umka-253", neighbor != NULL);
+ assert("umka-254", node != NULL);
+
+ base_level = znode_get_level(node);
+
+ assert("umka-310", base_level <= znode_get_tree(node)->height);
+
+ coord_init_zero(&coord);
+
+ again:
+ /* first, we try to use simple lock_neighbor() which requires sibling
+ link existence */
+ read_lock_tree();
+ ret = lock_side_neighbor(neighbor, node, lock_mode, flags, 1);
+ read_unlock_tree();
+ if (!ret) {
+ /* load znode content if it was specified */
+ if (flags & GN_LOAD_NEIGHBOR) {
+ ret = zload(node);
+ if (ret)
+ longterm_unlock_znode(neighbor);
+ }
+ return ret;
+ }
+
+ /* only -ENOENT means we may look upward and try to connect
+ @node with its neighbor (if @flags allow us to do it) */
+ if (ret != -ENOENT || !(flags & GN_CAN_USE_UPPER_LEVELS))
+ return ret;
+
+ /* before establishing the sibling link we lock the parent node; this
+ is required for renew_neighbor() to work. */
+ init_lh(&path[0]);
+ ret = reiser4_get_parent(&path[0], node, ZNODE_READ_LOCK);
+ if (ret)
+ return ret;
+ if (znode_above_root(path[0].node)) {
+ longterm_unlock_znode(&path[0]);
+ return RETERR(-E_NO_NEIGHBOR);
+ }
+
+ while (1) {
+ znode *child = (h == 0) ? node : path[h - 1].node;
+ znode *parent = path[h].node;
+
+ ret = zload(parent);
+ if (ret)
+ break;
+
+ ret = find_child_ptr(parent, child, &coord);
+
+ if (ret) {
+ zrelse(parent);
+ break;
+ }
+
+ /* try to establish missing sibling link */
+ ret = renew_neighbor(&coord, child, h + base_level, flags);
+
+ zrelse(parent);
+
+ switch (ret) {
+ case 0:
+ /* unlocking of parent znode prevents simple
+ deadlock situation */
+ done_lh(&path[h]);
+
+ /* depending on the tree level we are at, we repeat the first
+ locking attempt ... */
+ if (h == 0)
+ goto again;
+
+ /* ... or repeat establishing of sibling link at
+ one level below. */
+ --h;
+ break;
+
+ case -ENOENT:
+ /* sibling link is not available -- we go
+ upward. */
+ init_lh(&path[h + 1]);
+ ret =
+ reiser4_get_parent(&path[h + 1], parent,
+ ZNODE_READ_LOCK);
+ if (ret)
+ goto fail;
+ ++h;
+ if (znode_above_root(path[h].node)) {
+ ret = RETERR(-E_NO_NEIGHBOR);
+ goto fail;
+ }
+ break;
+
+ case -E_DEADLOCK:
+ /* there was a lock request from a hi-pri locker. If
+ possible, we unlock the last parent node and
+ re-lock it again. */
+ for (; reiser4_check_deadlock(); h--) {
+ done_lh(&path[h]);
+ if (h == 0)
+ goto fail;
+ }
+
+ break;
+
+ default: /* other errors. */
+ goto fail;
+ }
+ }
+ fail:
+ ON_DEBUG(check_lock_node_data(node));
+ ON_DEBUG(check_lock_data());
+
+ /* unlock path */
+ do {
+ /* FIXME-Zam: when we get here from case -E_DEADLOCK's goto
+ fail; path[0] is already done_lh-ed, therefore
+ longterm_unlock_znode(&path[h]); is not applicable */
+ done_lh(&path[h]);
+ --h;
+ } while (h + 1 != 0);
+
+ return ret;
+}
+
+/* remove node from sibling list */
+/* Audited by: umka (2002.06.14) */
+void sibling_list_remove(znode * node)
+{
+ reiser4_tree *tree;
+
+ tree = znode_get_tree(node);
+ assert("umka-255", node != NULL);
+ assert_rw_write_locked(&(tree->tree_lock));
+ assert("nikita-3275", check_sibling_list(node));
+
+ write_lock_dk(tree);
+ if (znode_is_right_connected(node) && node->right != NULL &&
+ znode_is_left_connected(node) && node->left != NULL) {
+ assert("zam-32245",
+ keyeq(znode_get_rd_key(node),
+ znode_get_ld_key(node->right)));
+ znode_set_rd_key(node->left, znode_get_ld_key(node->right));
+ }
+ write_unlock_dk(tree);
+
+ if (znode_is_right_connected(node) && node->right != NULL) {
+ assert("zam-322", znode_is_left_connected(node->right));
+ node->right->left = node->left;
+ ON_DEBUG(node->right->left_version =
+ atomic_inc_return(&delim_key_version);
+ );
+ }
+ if (znode_is_left_connected(node) && node->left != NULL) {
+ assert("zam-323", znode_is_right_connected(node->left));
+ node->left->right = node->right;
+ ON_DEBUG(node->left->right_version =
+ atomic_inc_return(&delim_key_version);
+ );
+ }
+
+ ZF_CLR(node, JNODE_LEFT_CONNECTED);
+ ZF_CLR(node, JNODE_RIGHT_CONNECTED);
+ ON_DEBUG(node->left = node->right = NULL;
+ node->left_version = atomic_inc_return(&delim_key_version);
+ node->right_version = atomic_inc_return(&delim_key_version););
+ assert("nikita-3276", check_sibling_list(node));
+}
+
+/* disconnect node from sibling list */
+void sibling_list_drop(znode * node)
+{
+ znode *right;
+ znode *left;
+
+ assert("nikita-2464", node != NULL);
+ assert("nikita-3277", check_sibling_list(node));
+
+ right = node->right;
+ if (right != NULL) {
+ assert("nikita-2465", znode_is_left_connected(right));
+ right->left = NULL;
+ ON_DEBUG(right->left_version =
+ atomic_inc_return(&delim_key_version);
+ );
+ }
+ left = node->left;
+ if (left != NULL) {
+ assert("zam-323", znode_is_right_connected(left));
+ left->right = NULL;
+ ON_DEBUG(left->right_version =
+ atomic_inc_return(&delim_key_version);
+ );
+ }
+ ZF_CLR(node, JNODE_LEFT_CONNECTED);
+ ZF_CLR(node, JNODE_RIGHT_CONNECTED);
+ ON_DEBUG(node->left = node->right = NULL;
+ node->left_version = atomic_inc_return(&delim_key_version);
+ node->right_version = atomic_inc_return(&delim_key_version););
+}
+
+/* Insert a new node into the sibling list. Regular balancing inserts the new
+ node after (to the right of) an existing and locked node (@before), except
+ in the one case of adding a new tree root node. @before should be NULL in
+ that case. */
+void sibling_list_insert_nolock(znode * new, znode * before)
+{
+ assert("zam-334", new != NULL);
+ assert("nikita-3298", !znode_is_left_connected(new));
+ assert("nikita-3299", !znode_is_right_connected(new));
+ assert("nikita-3300", new->left == NULL);
+ assert("nikita-3301", new->right == NULL);
+ assert("nikita-3278", check_sibling_list(new));
+ assert("nikita-3279", check_sibling_list(before));
+
+ if (before != NULL) {
+ assert("zam-333", znode_is_connected(before));
+ new->right = before->right;
+ new->left = before;
+ ON_DEBUG(new->right_version =
+ atomic_inc_return(&delim_key_version);
+ new->left_version =
+ atomic_inc_return(&delim_key_version););
+ if (before->right != NULL) {
+ before->right->left = new;
+ ON_DEBUG(before->right->left_version =
+ atomic_inc_return(&delim_key_version);
+ );
+ }
+ before->right = new;
+ ON_DEBUG(before->right_version =
+ atomic_inc_return(&delim_key_version);
+ );
+ } else {
+ new->right = NULL;
+ new->left = NULL;
+ ON_DEBUG(new->right_version =
+ atomic_inc_return(&delim_key_version);
+ new->left_version =
+ atomic_inc_return(&delim_key_version););
+ }
+ ZF_SET(new, JNODE_LEFT_CONNECTED);
+ ZF_SET(new, JNODE_RIGHT_CONNECTED);
+ assert("nikita-3280", check_sibling_list(new));
+ assert("nikita-3281", check_sibling_list(before));
+}
+
+/*
+ Local variables:
+ c-indentation-style: "K&R"
+ mode-name: "LC"
+ c-basic-offset: 8
+ tab-width: 8
+ fill-column: 80
+ End:
+*/
diff -urN --no-dereference linux-5.10.2.orig/fs/reiser4/tree_walk.h linux-5.10.2/fs/reiser4/tree_walk.h
--- linux-5.10.2.orig/fs/reiser4/tree_walk.h 1970-01-01 01:00:00.000000000 +0100
+++ linux-5.10.2/fs/reiser4/tree_walk.h 2020-12-23 16:07:46.136813392 +0100
@@ -0,0 +1,125 @@
+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
+
+/* definitions of reiser4 tree walk functions */
+
+#ifndef __FS_REISER4_TREE_WALK_H__
+#define __FS_REISER4_TREE_WALK_H__
+
+#include "debug.h"
+#include "forward.h"
+
+/* establishes horizontal links between cached znodes */
+int connect_znode(coord_t * coord, znode * node);
+
+/* tree traversal functions (reiser4_get_parent(), reiser4_get_neighbor())
+ have the following common arguments:
+
+ return codes:
+
+ @return : 0 - OK,
+
+ZAM-FIXME-HANS: wrong return code name. Change them all.
+ -ENOENT - neighbor is not in cache, which is detected by the
+ absence of a sibling link.
+
+ -E_NO_NEIGHBOR - we are sure that the neighbor (or parent) node cannot be
+ found (because we are the left-/right-most node of the
+ tree, for example). Also, this return code is used by
+ reiser4_get_parent() when we see no parent link -- it
+ means that our node is the root node.
+
+ -E_DEADLOCK - deadlock detected (a request from a high-priority
+ process was received); other error codes conform to
+ /usr/include/asm/errno.h .
+*/
+
+int
+reiser4_get_parent_flags(lock_handle * result, znode * node,
+ znode_lock_mode mode, int flags);
+
+/* bits definition for reiser4_get_neighbor function `flags' arg. */
+typedef enum {
+ /* If the sibling pointer is NULL, this flag allows get_neighbor() to try
+ * to find a not-allocated, not-connected neighbor by going through upper
+ * levels */
+ GN_CAN_USE_UPPER_LEVELS = 0x1,
+ /* locking left neighbor instead of right one */
+ GN_GO_LEFT = 0x2,
+ /* automatically load neighbor node content */
+ GN_LOAD_NEIGHBOR = 0x4,
+ /* return -E_REPEAT if can't lock */
+ GN_TRY_LOCK = 0x8,
+ /* used internally in tree_walk.c, causes renew_sibling to not
+ allocate neighbor znode, but only search for it in znode cache */
+ GN_NO_ALLOC = 0x10,
+ /* do not go across atom boundaries */
+ GN_SAME_ATOM = 0x20,
+ /* allow locking not-connected nodes */
+ GN_ALLOW_NOT_CONNECTED = 0x40,
+ /* Avoid synchronous jload, instead, call jstartio() and return -E_REPEAT. */
+ GN_ASYNC = 0x80
+} znode_get_neigbor_flags;
+
+/* A commonly used wrapper for reiser4_get_parent_flags(). */
+static inline int reiser4_get_parent(lock_handle * result, znode * node,
+ znode_lock_mode mode)
+{
+ return reiser4_get_parent_flags(result, node, mode,
+ GN_ALLOW_NOT_CONNECTED);
+}
+
+int reiser4_get_neighbor(lock_handle * neighbor, znode * node,
+ znode_lock_mode lock_mode, int flags);
+
+/* there are wrappers for most common usages of reiser4_get_neighbor() */
+static inline int
+reiser4_get_left_neighbor(lock_handle * result, znode * node, int lock_mode,
+ int flags)
+{
+ return reiser4_get_neighbor(result, node, lock_mode,
+ flags | GN_GO_LEFT);
+}
+
+static inline int
+reiser4_get_right_neighbor(lock_handle * result, znode * node, int lock_mode,
+ int flags)
+{
+ ON_DEBUG(check_lock_node_data(node));
+ ON_DEBUG(check_lock_data());
+ return reiser4_get_neighbor(result, node, lock_mode,
+ flags & (~GN_GO_LEFT));
+}
+
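+/* An illustrative sketch (not part of the original header), assuming the
+ * lock_handle helpers init_lh()/done_lh() and the lock mode ZNODE_READ_LOCK
+ * from lock.h; a typical caller combines the GN_* flags with one of the
+ * wrappers above and handles the return codes documented earlier:
+ *
+ *	lock_handle right;
+ *	init_lh(&right);
+ *	ret = reiser4_get_right_neighbor(&right, node, ZNODE_READ_LOCK,
+ *					 GN_CAN_USE_UPPER_LEVELS |
+ *					 GN_LOAD_NEIGHBOR);
+ *	if (ret == 0) {
+ *		... use right.node ...
+ *		done_lh(&right);
+ *	} else if (ret == -E_NO_NEIGHBOR)
+ *		... @node is the rightmost node on its level ...
+ *	else if (ret == -E_REPEAT || ret == -E_DEADLOCK)
+ *		... release locks and restart the operation ...
+ */
+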
+extern void sibling_list_remove(znode * node);
+extern void sibling_list_drop(znode * node);
+extern void sibling_list_insert_nolock(znode * new, znode * before);
+extern void link_left_and_right(znode * left, znode * right);
+
+/* Functions called by tree_walk() when tree_walk() ... */
+struct tree_walk_actor {
+ /* ... meets a formatted node, */
+ int (*process_znode) (tap_t *, void *);
+ /* ... meets an extent, */
+ int (*process_extent) (tap_t *, void *);
+ /* ... begins tree traversal or repeats it after -E_REPEAT was returned by
+ * node or extent processing functions. */
+ int (*before) (void *);
+};
+
+#if REISER4_DEBUG
+int check_sibling_list(znode * node);
+#else
+#define check_sibling_list(n) (1)
+#endif
+
+#endif /* __FS_REISER4_TREE_WALK_H__ */
+
+/*
+ Local variables:
+ c-indentation-style: "K&R"
+ mode-name: "LC"
+ c-basic-offset: 8
+ tab-width: 8
+ fill-column: 120
+ End:
+*/
diff -urN --no-dereference linux-5.10.2.orig/fs/reiser4/txnmgr.c linux-5.10.2/fs/reiser4/txnmgr.c
--- linux-5.10.2.orig/fs/reiser4/txnmgr.c 1970-01-01 01:00:00.000000000 +0100
+++ linux-5.10.2/fs/reiser4/txnmgr.c 2020-12-23 16:12:07.563664794 +0100
@@ -0,0 +1,3556 @@
+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
+ * reiser4/README */
+
+/* Joshua MacDonald wrote the first draft of this code. */
+
+/* ZAM-LONGTERM-FIXME-HANS: The locking in this file is badly designed, and a
+filesystem scales only as well as its worst locking design. You need to
+substantially restructure this code. Josh was not as experienced a programmer
+as you. Particularly review how the locking style differs from what you did
+for znodes using hi-lo priority locking, and present to me an opinion on
+whether the differences are well founded. */
+
+/* I cannot help but to disagree with the sentiment above. Locking of
+ * transaction manager is _not_ badly designed, and, at the very least, is not
+ * the scaling bottleneck. Scaling bottleneck is _exactly_ hi-lo priority
+ * locking on znodes, especially on the root node of the tree. --nikita,
+ * 2003.10.13 */
+
+/* The txnmgr is a set of interfaces that keep track of atoms and transcrash handles. The
+ txnmgr processes capture_block requests and manages the relationship between jnodes and
+ atoms through the various stages of a transcrash, and it also oversees the fusion and
+ capture-on-copy processes. The main difficulty with this task is maintaining a
+ deadlock-free lock ordering between atoms and jnodes/handles. The reason for the
+ difficulty is that jnodes, handles, and atoms contain pointer circles, and the cycle
+ must be broken. The main requirement is that atom-fusion be deadlock free, so once you
+ hold the atom_lock you may then wait to acquire any jnode or handle lock. This implies
+ that any time you check the atom-pointer of a jnode or handle and then try to lock that
+ atom, you must use trylock() and possibly reverse the order.
+
+ This code implements the design documented at:
+
+ http://namesys.com/txn-doc.html
+
+ZAM-FIXME-HANS: update v4.html to contain all of the information present in the above (but updated), and then remove the
+above document and reference the new. Be sure to provide some credit to Josh. I already have some writings on this
+topic in v4.html, but they are lacking in details present in the above. Cure that. Remember to write for the bright 12
+year old --- define all technical terms used.
+
+*/
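+
+/* An illustrative sketch (not part of the original comment) of the trylock
+   idiom described above; the real implementations are txnh_get_atom() and
+   jnode_get_atom() further down in this file:
+
+	spin_lock_jnode(node);
+	atom = node->atom;
+	if (atom != NULL && !spin_trylock_atom(atom)) {
+		atomic_inc(&atom->refcount);	// keep the atom alive
+		spin_unlock_jnode(node);	// reverse the order:
+		spin_lock_atom(atom);		//   atom first,
+		spin_lock_jnode(node);		//   then jnode again
+		// if node->atom changed meanwhile, drop both locks and retry
+	}
+*/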
+
+/* Thoughts on the external transaction interface:
+
+ In the current code, a TRANSCRASH handle is created implicitly by reiser4_init_context() (which
+ creates state that lasts for the duration of a system call and is called at the start
+ of ReiserFS methods implementing VFS operations), and closed by reiser4_exit_context(),
+ occupying the scope of a single system call. We wish to give certain applications an
+ interface to begin and close (commit) transactions. Since our implementation of
+ transactions does not yet support isolation, allowing an application to open a
+ transaction implies trusting it to later close the transaction. Part of the
+ transaction interface will be aimed at enabling that trust, but the interface for
+ actually using transactions is fairly narrow.
+
+ BEGIN_TRANSCRASH: Returns a transcrash identifier. It should be possible to translate
+ this identifier into a string that a shell-script could use, allowing you to start a
+ transaction by issuing a command. Once open, the transcrash should be set in the task
+ structure, and there should be options (I suppose) to allow it to be carried across
+ fork/exec. A transcrash has several options:
+
+ - READ_FUSING or WRITE_FUSING: The default policy is for txn-capture to capture only
+ on writes (WRITE_FUSING) and allow "dirty reads". If the application wishes to
+ capture on reads as well, it should set READ_FUSING.
+
+ - TIMEOUT: Since a non-isolated transcrash cannot be undone, every transcrash must
+ eventually close (or else the machine must crash). If the application dies an
+ unexpected death with an open transcrash, for example, or if it hangs for a long
+ duration, one solution (to avoid crashing the machine) is to simply close it anyway.
+ This is a dangerous option, but it is one way to solve the problem until isolated
+ transcrashes are available for untrusted applications.
+
+ It seems to be what databases do, though it is unclear how one avoids a DoS attack
+ creating a vulnerability based on resource starvation. Guaranteeing that some
+ minimum amount of computational resources are made available would seem more correct
+ than guaranteeing some amount of time. When we again have someone to code the work,
+ this issue should be considered carefully. -Hans
+
+ RESERVE_BLOCKS: A running transcrash should indicate to the transaction manager how
+ many dirty blocks it expects. The reserve_blocks interface should be called at a point
+ where it is safe for the application to fail, because the system may not be able to
+ grant the allocation and the application must be able to back-out. For this reason,
+ the number of reserve-blocks can also be passed as an argument to BEGIN_TRANSCRASH, but
+ the application may also wish to extend the allocation after beginning its transcrash.
+
+ CLOSE_TRANSCRASH: The application closes the transcrash when it is finished making
+ modifications that require transaction protection. When isolated transactions are
+ supported the CLOSE operation is replaced by either COMMIT or ABORT. For example, if a
+ RESERVE_BLOCKS call fails for the application, it should "abort" by calling
+ CLOSE_TRANSCRASH, even though it really commits any changes that were made (which is
+ why, for safety, the application should call RESERVE_BLOCKS before making any changes).
+
+ For actually implementing these out-of-system-call-scoped transcrashes, the
+ reiser4_context has a "txn_handle *trans" pointer that may be set to an open
+ transcrash. Currently there are no dynamically-allocated transcrashes, but there is a
+ "struct kmem_cache *_txnh_slab" created for that purpose in this file.
+*/
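+
+/* An illustrative sketch (not part of the original comment) of the implicit
+   per-system-call transcrash described above, assuming the usual
+   reiser4_init_context()/reiser4_exit_context() pair:
+
+	ctx = reiser4_init_context(sb);	// reiser4_txn_begin(): txnh on stack
+	... the VFS operation captures nodes into ctx->trans->atom ...
+	reiser4_exit_context(ctx);	// reiser4_txn_end(): commit_txnh()
+*/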
+
+/* Extending the other system call interfaces for future transaction features:
+
+ Specialized applications may benefit from passing flags to the ordinary system call
+ interface such as read(), write(), or stat(). For example, the application specifies
+ WRITE_FUSING by default but wishes to add that a certain read() command should be
+ treated as READ_FUSING. But which read? Is it the directory-entry read, the stat-data
+ read, or the file-data read? These issues are straightforward, but there are a lot of
+ them and adding the necessary flags-passing code will be tedious.
+
+ When supporting isolated transactions, there is a corresponding READ_MODIFY_WRITE (RMW)
+ flag, which specifies that although it is a read operation being requested, a
+ write-lock should be taken. The reason is that read-locks are shared while write-locks
+ are exclusive, so taking a read-lock when a later-write is known in advance will often
+ leads to deadlock. If a reader knows it will write later, it should issue read
+ requests with the RMW flag set.
+*/
+
+/*
+ The znode/atom deadlock avoidance.
+
+ FIXME(Zam): writing of this comment is in progress.
+
+ The atom's special stage ASTAGE_CAPTURE_WAIT introduces a kind of long-term
+ atom locking, which makes the reiser4 locking scheme more complex. It had
+ deadlocks until we implemented deadlock avoidance algorithms. Those deadlocks
+ looked like the following: one stopped thread waits for a long-term lock on a
+ znode, while the thread owning that lock waits until fusion with another atom
+ is allowed.
+
+ The source of the deadlocks is the optimization of not capturing index nodes
+ for read. Let's prove it. Suppose we had a dumb node capturing scheme that
+ unconditionally captures each block before locking it.
+
+ That scheme has no deadlocks. Let's begin with a thread whose atom is in
+ ASTAGE_CAPTURE_WAIT and which waits for a znode lock. The thread cannot be
+ waiting for a capture because its stage allows fusion with any atom except
+ those currently being committed. A process of atom commit cannot deadlock
+ because the atom commit procedure does not acquire locks and does not fuse
+ with other atoms. Reiser4 does the capturing right before going to sleep
+ inside the longterm_lock_znode() function, which means the znode we want to
+ lock is already captured and its atom is in the ASTAGE_CAPTURE_WAIT stage.
+ Continuing the analysis, we see that no process in the sequence may wait for
+ atom fusion. Thereby there are no deadlocks of the described kind.
+
+ The capturing optimization makes such deadlocks possible. A thread can wait
+ for a lock whose owner did not capture that node. The lock owner's current
+ atom is not fused with the first atom and does not reach the
+ ASTAGE_CAPTURE_WAIT state. A deadlock is possible when that atom meets
+ another one which is already in ASTAGE_CAPTURE_WAIT.
+
+ The deadlock avoidance scheme includes two algorithms:
+
+ The first algorithm is used when a thread captures a node which is locked
+ but not captured by another thread. Such nodes are marked MISSED_IN_CAPTURE
+ at the moment we skip their capturing. If such a node (marked
+ MISSED_IN_CAPTURE) is being captured by a thread whose current atom is in
+ ASTAGE_CAPTURE_WAIT, the routine which forces all lock owners to join the
+ current atom is executed.
+
+ The second algorithm does not allow skipping the capturing of already
+ captured nodes.
+
+ Together the two algorithms prevent waiting for a long-term lock without
+ fusing with the atoms of all lock owners; such waiting is the key ingredient
+ of atom/znode locking deadlocks.
+*/
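+
+/* An illustrative two-thread scenario (not part of the original comment):
+   thread A long-term locks index znode Z for read without capturing it (the
+   optimization above), so Z is marked MISSED_IN_CAPTURE; A's atom is atom_A.
+   Thread B, whose atom atom_B is in ASTAGE_CAPTURE_WAIT, tries to capture and
+   lock Z and blocks on A's lock.  If A then tries to capture a node of atom_B,
+   A must wait until fusion with atom_B is allowed -- a cycle.  The first
+   algorithm breaks it: because Z is MISSED_IN_CAPTURE, B's capture path (see
+   fuse_not_fused_lock_owners()) forces the lock owner A to join B's atom. */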
+
+/*
+ * Transactions and mmap(2).
+ *
+ * 1. Transactions are not supported for accesses through mmap(2), because
+ * this would effectively amount to user-level transactions whose duration
+ * is beyond control of the kernel.
+ *
+ * 2. That said, we still want to preserve some decency with regard to
+ * mmap(2). During a normal write(2) call, the following sequence of events
+ * happens:
+ *
+ * 1. page is created;
+ *
+ * 2. jnode is created, dirtied and captured into current atom.
+ *
+ * 3. extent is inserted and modified.
+ *
+ * Steps (2) and (3) take place under long term lock on the twig node.
+ *
+ * When a file is accessed through mmap(2), the page is always created
+ * during a page fault.
+ * After this (in reiser4_readpage_dispatch()->reiser4_readpage_extent()):
+ *
+ * 1. if access is made to non-hole page new jnode is created, (if
+ * necessary)
+ *
+ * 2. if access is made to the hole page, jnode is not created (XXX
+ * not clear why).
+ *
+ * Also, even if page is created by write page fault it is not marked
+ * dirty immediately by handle_mm_fault(). Probably this is to avoid races
+ * with page write-out.
+ *
+ * Dirty bit installed by hardware is only transferred to the struct page
+ * later, when page is unmapped (in zap_pte_range(), or
+ * try_to_unmap_one()).
+ *
+ * So, with mmap(2) we have to handle following irksome situations:
+ *
+ * 1. there exists modified page (clean or dirty) without jnode
+ *
+ * 2. there exists modified page (clean or dirty) with clean jnode
+ *
+ * 3. clean page which is a part of atom can be transparently modified
+ * at any moment through mapping without becoming dirty.
+ *
+ * (1) and (2) can lead to the out-of-memory situation: ->writepage()
+ * doesn't know what to do with such pages and ->sync_sb()/->writepages()
+ * don't see them, because these methods operate on atoms.
+ *
+ * (3) can lead to the loss of data: suppose we have a dirty page with a
+ * dirty jnode captured by some atom. As part of early flush (for
+ * example) page was written out. Dirty bit was cleared on both page and
+ * jnode. After this page is modified through mapping, but kernel doesn't
+ * notice and just discards page and jnode as part of commit. (XXX
+ * actually it doesn't, because to reclaim page ->releasepage() has to be
+ * called and before this dirty bit will be transferred to the struct
+ * page).
+ *
+ */
+
+#include "debug.h"
+#include "txnmgr.h"
+#include "jnode.h"
+#include "znode.h"
+#include "block_alloc.h"
+#include "tree.h"
+#include "wander.h"
+#include "ktxnmgrd.h"
+#include "super.h"
+#include "page_cache.h"
+#include "reiser4.h"
+#include "vfs_ops.h"
+#include "inode.h"
+#include "flush.h"
+#include "discard.h"
+#include "plugin/volume/volume.h"
+
+#include <asm/atomic.h>
+#include <linux/types.h>
+#include <linux/fs.h>
+#include <linux/mm.h>
+#include <linux/slab.h>
+#include <linux/pagemap.h>
+#include <linux/writeback.h>
+#include <linux/swap.h> /* for totalram_pages */
+
+static void free_atom(txn_atom * atom);
+
+static int commit_txnh(txn_handle * txnh);
+
+static void wakeup_atom_waitfor_list(txn_atom * atom);
+static void wakeup_atom_waiting_list(txn_atom * atom);
+
+static void capture_assign_txnh_nolock(txn_atom * atom, txn_handle * txnh);
+
+static void capture_assign_block_nolock(txn_atom * atom, jnode * node);
+
+static void fuse_not_fused_lock_owners(txn_handle * txnh, znode * node);
+
+static int capture_init_fusion(jnode * node, txn_handle * txnh,
+ txn_capture mode);
+
+static int capture_fuse_wait(txn_handle *, txn_atom *, txn_atom *, txn_capture);
+
+static void capture_fuse_into(txn_atom * small, txn_atom * large);
+
+void reiser4_invalidate_list(struct list_head *);
+
+/* GENERIC STRUCTURES */
+
+typedef struct _txn_wait_links txn_wait_links;
+
+struct _txn_wait_links {
+ lock_stack *_lock_stack;
+ struct list_head _fwaitfor_link;
+ struct list_head _fwaiting_link;
+ int (*waitfor_cb) (txn_atom * atom, struct _txn_wait_links * wlinks);
+ int (*waiting_cb) (txn_atom * atom, struct _txn_wait_links * wlinks);
+};
+
+/* FIXME: In theory, we should be using the slab cache init & destructor
+ methods instead of, e.g., jnode_init, etc. */
+static struct kmem_cache *_atom_slab = NULL;
+/* this is for user-visible, cross system-call transactions. */
+static struct kmem_cache *_txnh_slab = NULL;
+static struct kmem_cache *_abi_slab = NULL;
+
+struct atom_brick_info *alloc_atom_brick_info(void)
+{
+ return kmem_cache_alloc(_abi_slab, reiser4_ctx_gfp_mask_get());
+}
+
+void free_atom_brick_info(struct atom_brick_info *abi)
+{
+ assert("edward-1979", abi != NULL);
+
+ kmem_cache_free(_abi_slab, abi);
+}
+
+struct atom_brick_info *find_atom_brick_info(const struct rb_root *root,
+ u32 brick_id)
+{
+ struct rb_node *node = root->rb_node;
+
+ while (node) {
+ struct atom_brick_info *abi =
+ rb_entry(node, struct atom_brick_info, node);
+
+ if (abi->brick_id > brick_id)
+ node = node->rb_left;
+ else if (abi->brick_id < brick_id)
+ node = node->rb_right;
+ else
+ return abi;
+ }
+ return NULL;
+}
+
+#if REISER4_DEBUG
+void __check_atom_brick_info(struct rb_root *root)
+{
+ struct rb_node *node;
+
+ for (node = rb_first(root);
+ node;
+ node = rb_next(node)) {
+ struct atom_brick_info *abi;
+
+ abi = rb_entry(node, struct atom_brick_info, node);
+ assert("edward-2007",
+ abi == find_atom_brick_info(root, abi->brick_id));
+ }
+}
+
+void check_atom_brick_info(txn_atom *atom)
+{
+ struct rb_root *root = &atom->bricks_info;
+ struct rb_node *node;
+
+ for (node = rb_first(root);
+ node;
+ node = rb_next(node)) {
+ atom->abi = rb_entry(node, struct atom_brick_info, node);
+ atom->abi_found =
+ find_atom_brick_info(root, atom->abi->brick_id);
+ assert("edward-2008", atom->abi == atom->abi_found);
+ }
+}
+#endif /* REISER4_DEBUG */
+
+/**
+ * Try to insert item @this into rb-tree @root.
+ * Return NULL on success. Otherwise, return the existing item with the same key.
+ */
+struct atom_brick_info *insert_atom_brick_info(struct rb_root *root,
+ struct atom_brick_info *this)
+{
+ struct rb_node *parent = NULL;
+ struct rb_node **pos = &(root->rb_node);
+
+ while (*pos) {
+ struct atom_brick_info *abi;
+
+ abi = rb_entry(*pos, struct atom_brick_info, node);
+ parent = *pos;
+
+ if (this->brick_id < abi->brick_id)
+ pos = &((*pos)->rb_left);
+ else if (this->brick_id > abi->brick_id)
+ pos = &((*pos)->rb_right);
+ else
+ return abi;
+ }
+ rb_link_node(&this->node, parent, pos);
+ rb_insert_color(&this->node, root);
+
+ __check_atom_brick_info(root);
+
+ return NULL;
+}
+
+/**
+ * On success: 0 is returned and @this points
+ * to the existing or newly inserted atom brick info.
+ */
+int __check_insert_atom_brick_info(txn_atom **atom, u32 brick_id,
+ struct atom_brick_info **this)
+{
+ struct atom_brick_info *abi;
+
+ assert("edward-2009", atom != NULL);
+ assert("edward-2010", *atom != NULL);
+ assert("edward-2011", this != NULL);
+ assert_spin_locked(&((*atom)->alock));
+
+ if (brick_id == METADATA_SUBVOL_ID) {
+ /*
+ * It is known to be preallocated
+ */
+ *this = atom_meta_brick_info(*atom);
+ return 0;
+ }
+ abi = find_atom_brick_info(&((*atom)->bricks_info), brick_id);
+ if (abi == NULL) {
+ /*
+ * Insert a new item to the tree
+ */
+ spin_unlock_atom(*atom);
+ abi = alloc_atom_brick_info();
+ if (abi == NULL)
+ return -ENOMEM;
+ init_atom_brick_info(abi, brick_id);
+ *atom = get_current_atom_locked();
+ *this = insert_atom_brick_info(&(*atom)->bricks_info, abi);
+ if (*this != NULL) {
+ /*
+ * someone has already inserted
+ * an item with such key after we
+ * unlocked the atom
+ */
+ free_atom_brick_info(abi);
+ return 0;
+ }
+ }
+ *this = abi;
+ return 0;
+}
+
+int check_insert_atom_brick_info(u32 brick_id, struct atom_brick_info **this)
+{
+ int ret;
+ txn_atom *atom;
+
+ atom = get_current_atom_locked();
+ ret = __check_insert_atom_brick_info(&atom, brick_id, this);
+ if (ret)
+ return ret;
+ spin_unlock_atom(atom);
+ return 0;
+}
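+
+/* An illustrative sketch (not part of the original code): a caller that needs
+ * per-brick accounting for brick @id in the current atom would typically do
+ *
+ *	struct atom_brick_info *abi;
+ *	int ret = check_insert_atom_brick_info(id, &abi);
+ *	if (ret)
+ *		return ret;	// -ENOMEM is the only failure mode here
+ *	// @abi now points to the existing or freshly inserted item; its
+ *	// counters (e.g. nr_blocks_allocated) are updated under the atom lock
+ *
+ * note that the atom spinlock has been dropped again on return.
+ */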
+
+static void done_atom_bricks_info(txn_atom *atom)
+{
+ struct rb_root *root;
+
+ root = &atom->bricks_info;
+ /*
+ * remove pre-allocated info
+ */
+ rb_erase(&atom->mabi.node, root);
+ RB_CLEAR_NODE(&atom->mabi.node);
+
+ while (!RB_EMPTY_ROOT(root)) {
+ struct rb_node *node;
+ struct atom_brick_info *abi;
+
+ node = rb_first(root);
+ abi = rb_entry(node, struct atom_brick_info, node);
+
+ rb_erase(&abi->node, root);
+ RB_CLEAR_NODE(&abi->node);
+ free_atom_brick_info(abi);
+ }
+}
+
+#if REISER4_DEBUG
+void check_atom_flush_reserved(txn_atom *atom)
+{
+ struct rb_node *node;
+
+ assert_spin_locked(&(atom->alock));
+
+ check_atom_brick_info(atom);
+
+ spin_lock_reiser4_super(get_current_super_private());
+
+ for (node = rb_first(&atom->bricks_info);
+ node;
+ node = rb_next(node)) {
+ struct atom_brick_info *abi;
+
+ abi = rb_entry(node, struct atom_brick_info, node);
+ assert("edward-2012",
+ abi->atom_flush_reserved <=
+ current_origin(abi->brick_id)->blocks_flush_reserved);
+ }
+ spin_unlock_reiser4_super(get_current_super_private());
+}
+#endif
+
+/**
+ * merge items representing the atom's per-brick info
+ */
+static void fuse_abi(txn_atom *from, txn_atom *to)
+{
+ struct rb_node *node;
+ struct atom_brick_info *mabi_from, *mabi_to;
+ /*
+ * start from fusing data of pre-allocated items
+ */
+ mabi_from = atom_meta_brick_info(from);
+ mabi_to = atom_meta_brick_info(to);
+
+ mabi_to->atom_flush_reserved += mabi_from->atom_flush_reserved;
+ mabi_to->nr_blocks_allocated += mabi_from->nr_blocks_allocated;
+
+ mabi_from->atom_flush_reserved = 0;
+ mabi_from->nr_blocks_allocated = 0;
+
+ node = rb_next(&mabi_from->node);
+ while (node) {
+ struct rb_node *node_from;
+ struct atom_brick_info *abi_from;
+ struct atom_brick_info *abi_to;
+
+ node_from = node;
+ node = rb_next(node);
+
+ /* try to move the item to the @to's tree */
+
+ rb_erase(node_from, &from->bricks_info);
+ RB_CLEAR_NODE(node_from);
+
+ abi_from = rb_entry(node_from, struct atom_brick_info, node);
+ abi_to = insert_atom_brick_info(&to->bricks_info, abi_from);
+
+ if (abi_to != NULL) {
+ /*
+ * can't insert: an item with such brick_id
+ * already exists in the @to's rb-tree, so
+ * simply update the existing item, and
+ * release the item that we wanted to insert
+ */
+ assert("edward-2013", abi_to->brick_id != 0);
+ assert("edward-2014", abi_from->brick_id == abi_to->brick_id);
+
+ abi_to->atom_flush_reserved += abi_from->atom_flush_reserved;
+ abi_to->nr_blocks_allocated += abi_from->nr_blocks_allocated;
+ free_atom_brick_info(abi_from);
+ }
+ }
+ /*
+ * after fusion the @from's rb-tree contains only pre-allocated item
+ */
+ assert("edward-2015",
+ from->bricks_info.rb_node != NULL &&
+ from->bricks_info.rb_node->rb_left == NULL &&
+ from->bricks_info.rb_node->rb_right == NULL);
+
+ check_atom_brick_info(to);
+}
+
+/**
+ * init_txnmgr_static - create transaction manager slab caches
+ *
+ * Initializes caches of txn-atoms and txn_handle. It is part of reiser4 module
+ * initialization.
+ */
+int init_txnmgr_static(void)
+{
+ assert("jmacd-600", _atom_slab == NULL);
+ assert("jmacd-601", _txnh_slab == NULL);
+ assert("edward-2016", _abi_slab == NULL);
+
+ ON_DEBUG(atomic_set(&flush_cnt, 0));
+
+ _atom_slab = kmem_cache_create("txn_atom", sizeof(txn_atom), 0,
+ SLAB_HWCACHE_ALIGN |
+ SLAB_RECLAIM_ACCOUNT, NULL);
+ if (_atom_slab == NULL)
+ return RETERR(-ENOMEM);
+
+ _txnh_slab = kmem_cache_create("txn_handle", sizeof(txn_handle), 0,
+ SLAB_HWCACHE_ALIGN, NULL);
+ if (_txnh_slab == NULL) {
+ kmem_cache_destroy(_atom_slab);
+ _atom_slab = NULL;
+ return RETERR(-ENOMEM);
+ }
+
+ _abi_slab = kmem_cache_create("atom_brick_info",
+ sizeof(struct atom_brick_info), 0,
+ SLAB_HWCACHE_ALIGN |
+ SLAB_RECLAIM_ACCOUNT, NULL);
+ if (_abi_slab == NULL) {
+ kmem_cache_destroy(_atom_slab);
+ kmem_cache_destroy(_txnh_slab);
+ _atom_slab = NULL;
+ _txnh_slab = NULL;
+ return RETERR(-ENOMEM);
+ }
+ return 0;
+}
+
+/**
+ * done_txnmgr_static - delete txn_atom and txn_handle caches
+ *
+ * This is called on reiser4 module unloading or system shutdown.
+ */
+void done_txnmgr_static(void)
+{
+ destroy_reiser4_cache(&_atom_slab);
+ destroy_reiser4_cache(&_txnh_slab);
+ destroy_reiser4_cache(&_abi_slab);
+}
+
+/**
+ * init_txnmgr - initialize a new transaction manager
+ * @mgr: pointer to transaction manager embedded in reiser4 super block
+ *
+ * This is called on mount. Makes necessary initializations.
+ */
+void reiser4_init_txnmgr(txn_mgr *mgr)
+{
+ assert("umka-169", mgr != NULL);
+
+ mgr->atom_count = 0;
+ mgr->id_count = 1;
+ INIT_LIST_HEAD(&mgr->atoms_list);
+ spin_lock_init(&mgr->tmgr_lock);
+ mutex_init(&mgr->commit_mutex);
+}
+
+/**
+ * reiser4_done_txnmgr - stop transaction manager
+ * @mgr: pointer to transaction manager embedded in reiser4 super block
+ *
+ * This is called on umount. Does sanity checks.
+ */
+void reiser4_done_txnmgr(txn_mgr *mgr)
+{
+ assert("umka-170", mgr != NULL);
+ assert("umka-1701", list_empty_careful(&mgr->atoms_list));
+ assert("umka-1702", mgr->atom_count == 0);
+}
+
+/* Initialize a transaction handle. */
+/* Audited by: umka (2002.06.13) */
+static void txnh_init(txn_handle * txnh, txn_mode mode)
+{
+ assert("umka-171", txnh != NULL);
+
+ txnh->mode = mode;
+ txnh->atom = NULL;
+ reiser4_ctx_gfp_mask_set();
+ txnh->flags = 0;
+ spin_lock_init(&txnh->hlock);
+ INIT_LIST_HEAD(&txnh->txnh_link);
+}
+
+#if REISER4_DEBUG
+/* Check if a transaction handle is clean. */
+static int txnh_isclean(txn_handle * txnh)
+{
+ assert("umka-172", txnh != NULL);
+ return txnh->atom == NULL &&
+ LOCK_CNT_NIL(spin_locked_txnh);
+}
+#endif
+
+/* Initialize an atom. */
+static void init_atom(txn_atom * atom)
+{
+ int level;
+
+ assert("umka-173", atom != NULL);
+
+ atom->stage = ASTAGE_FREE;
+ atom->start_time = jiffies;
+ /*
+ * init the set of per-brick info and populate it
+ * with a pre-allocated item for the meta-data brick
+ */
+ atom->bricks_info = RB_ROOT;
+ init_atom_brick_info(&atom->mabi, METADATA_SUBVOL_ID);
+ insert_atom_brick_info(&atom->bricks_info, &atom->mabi);
+
+ for (level = 0; level < REAL_MAX_ZTREE_HEIGHT + 1; level += 1)
+ INIT_LIST_HEAD(ATOM_DIRTY_LIST(atom, level));
+
+ INIT_LIST_HEAD(ATOM_CLEAN_LIST(atom));
+ INIT_LIST_HEAD(ATOM_OVRWR_LIST(atom));
+ INIT_LIST_HEAD(ATOM_WB_LIST(atom));
+ INIT_LIST_HEAD(&atom->inodes);
+ spin_lock_init(&(atom->alock));
+ /* list of transaction handles */
+ INIT_LIST_HEAD(&atom->txnh_list);
+ /* link to transaction manager's list of atoms */
+ INIT_LIST_HEAD(&atom->atom_link);
+ INIT_LIST_HEAD(&atom->fwaitfor_list);
+ INIT_LIST_HEAD(&atom->fwaiting_list);
+ atom_dset_init(atom);
+ init_atom_fq_parts(atom);
+}
+
+#if REISER4_DEBUG
+/* Check if an atom is clean. */
+static int atom_isclean(txn_atom * atom)
+{
+ int level;
+
+ assert("umka-174", atom != NULL);
+
+ for (level = 0; level < REAL_MAX_ZTREE_HEIGHT + 1; level += 1) {
+ if (!list_empty_careful(ATOM_DIRTY_LIST(atom, level))) {
+ return 0;
+ }
+ }
+
+ return atom->stage == ASTAGE_FREE &&
+ atom->txnh_count == 0 &&
+ atom->capture_count == 0 &&
+ atomic_read(&atom->refcount) == 0 &&
+ (&atom->atom_link == atom->atom_link.next &&
+ &atom->atom_link == atom->atom_link.prev) &&
+ list_empty_careful(&atom->txnh_list) &&
+ list_empty_careful(ATOM_CLEAN_LIST(atom)) &&
+ list_empty_careful(ATOM_OVRWR_LIST(atom)) &&
+ list_empty_careful(ATOM_WB_LIST(atom)) &&
+ list_empty_careful(&atom->fwaitfor_list) &&
+ list_empty_careful(&atom->fwaiting_list) &&
+ atom_fq_parts_are_clean(atom);
+}
+#endif
+
+/* Begin a transaction in this context. Currently this uses the reiser4_context's
+ trans_in_ctx, which means that transaction handles are stack-allocated. Eventually
+ this will be extended to allow transaction handles to span several contexts. */
+/* Audited by: umka (2002.06.13) */
+void reiser4_txn_begin(reiser4_context * context)
+{
+ assert("jmacd-544", context->trans == NULL);
+
+ context->trans = &context->trans_in_ctx;
+
+ /* FIXME_LATER_JMACD Currently there's no way to begin a TXN_READ_FUSING
+ transcrash. Default should be TXN_WRITE_FUSING. Also, the _trans variable is
+ stack allocated right now, but we would like to allow for dynamically allocated
+ transcrashes that span multiple system calls.
+ */
+ txnh_init(context->trans, TXN_WRITE_FUSING);
+}
+
+/* Finish a transaction handle context. */
+int reiser4_txn_end(reiser4_context * context)
+{
+ long ret = 0;
+ txn_handle *txnh;
+
+ assert("umka-283", context != NULL);
+ assert("nikita-3012", reiser4_schedulable());
+ assert("vs-24", context == get_current_context());
+ assert("nikita-2967", lock_stack_isclean(get_current_lock_stack()));
+
+ txnh = context->trans;
+ if (txnh != NULL) {
+ if (txnh->atom != NULL)
+ ret = commit_txnh(txnh);
+ assert("jmacd-633", txnh_isclean(txnh));
+ context->trans = NULL;
+ }
+ return ret;
+}
+
+void reiser4_txn_restart(reiser4_context * context)
+{
+ reiser4_txn_end(context);
+ reiser4_preempt_point();
+ reiser4_txn_begin(context);
+}
+
+void reiser4_txn_restart_current(void)
+{
+ reiser4_txn_restart(get_current_context());
+}
+
+/* TXN_ATOM */
+
+/* Get the atom belonging to a txnh, which is not locked on entry. Return with the txnh locked,
+ and the atom locked too if it is not NULL. This performs the necessary spin_trylock to break
+ the lock-ordering cycle. May return NULL. */
+static txn_atom *txnh_get_atom(txn_handle * txnh)
+{
+ txn_atom *atom;
+
+ assert("umka-180", txnh != NULL);
+ assert_spin_not_locked(&(txnh->hlock));
+
+ while (1) {
+ spin_lock_txnh(txnh);
+ atom = txnh->atom;
+
+ if (atom == NULL)
+ break;
+
+ if (spin_trylock_atom(atom))
+ break;
+
+ atomic_inc(&atom->refcount);
+
+ spin_unlock_txnh(txnh);
+ spin_lock_atom(atom);
+ spin_lock_txnh(txnh);
+
+ if (txnh->atom == atom) {
+ atomic_dec(&atom->refcount);
+ break;
+ }
+
+ spin_unlock_txnh(txnh);
+ atom_dec_and_unlock(atom);
+ }
+
+ return atom;
+}
+
+/* Get the current atom and spinlock it if current atom present. May return NULL */
+txn_atom *get_current_atom_locked_nocheck(void)
+{
+ reiser4_context *cx;
+ txn_atom *atom;
+ txn_handle *txnh;
+
+ cx = get_current_context();
+ assert("zam-437", cx != NULL);
+
+ txnh = cx->trans;
+ assert("zam-435", txnh != NULL);
+
+ atom = txnh_get_atom(txnh);
+
+ spin_unlock_txnh(txnh);
+ return atom;
+}
+
+/* Get the atom belonging to a jnode, which is initially locked. Return with
+ both jnode and atom locked. This performs the necessary spin_trylock to
+ break the lock-ordering cycle. Assumes the jnode is already locked, and
+ returns NULL if atom is not set. */
+txn_atom *jnode_get_atom(jnode * node)
+{
+ txn_atom *atom;
+
+ assert("umka-181", node != NULL);
+
+ while (1) {
+ assert_spin_locked(&(node->guard));
+
+ atom = node->atom;
+ /* node is not in any atom */
+ if (atom == NULL)
+ break;
+
+ /* If atom is not locked, grab the lock and return */
+ if (spin_trylock_atom(atom))
+ break;
+
+ /* At least one jnode belongs to this atom; that guarantees that
+ * atom->refcount > 0, so we can safely increment the refcount. */
+ atomic_inc(&atom->refcount);
+ spin_unlock_jnode(node);
+
+ /* re-acquire spin locks in the right order */
+ spin_lock_atom(atom);
+ spin_lock_jnode(node);
+
+ /* check if node still points to the same atom. */
+ if (node->atom == atom) {
+ atomic_dec(&atom->refcount);
+ break;
+ }
+
+ /* releasing of atom lock and reference requires not holding
+ * locks on jnodes. */
+ spin_unlock_jnode(node);
+
+ /* We are not sure that this atom has extra references except our
+ * own, so we should call the proper function, which may free the
+ * atom if the last reference is released. */
+ atom_dec_and_unlock(atom);
+
+ /* lock jnode again for getting valid node->atom pointer
+ * value. */
+ spin_lock_jnode(node);
+ }
+
+ return atom;
+}
+
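+/* An illustrative sketch (not part of the original comment) of the calling
+   convention of jnode_get_atom(); same_slum_check() below follows it:
+
+	spin_lock_jnode(node);
+	atom = jnode_get_atom(node);	// returns with jnode + atom locked
+	if (atom != NULL) {
+		... inspect the atom under both locks ...
+		spin_unlock_atom(atom);
+	}
+	spin_unlock_jnode(node);
+*/
+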
+/**
+ * Returns true if @node is dirty and part of the same atom as
+ * one of its neighbors. Used by flush code to indicate whether
+ * the next node (in some direction) is suitable for flushing
+ */
+int same_slum_check(jnode *node, jnode *check, int alloc_check, int alloc_value)
+{
+ int compat;
+ txn_atom *atom;
+
+ assert("umka-182", node != NULL);
+ assert("umka-183", check != NULL);
+ /*
+ * Not sure what this function is supposed to do if supplied
+ * with @check that is neither formatted nor unformatted (bitmap
+ * or so)
+ */
+ assert("nikita-2373", jnode_is_znode(check) ||
+ jnode_is_unformatted(check));
+ /*
+ * Need a lock on CHECK to get its atom and to check various state bits.
+ * Don't need a lock on NODE once we get the atom lock.
+ *
+ * It is not enough to lock two nodes and check (node->atom ==
+ * check->atom) because atom could be locked and being fused at that
+ * moment, jnodes of the atom of that state (being fused) can point to
+ * different objects, but the atom is the same.
+ */
+ spin_lock_jnode(check);
+ atom = jnode_get_atom(check);
+
+ if (atom == NULL)
+ compat = 0;
+ else {
+ compat = (node->atom == atom && JF_ISSET(check, JNODE_DIRTY));
+ if (compat && jnode_is_znode(check))
+ compat &= znode_is_connected(JZNODE(check));
+ if (compat && alloc_check)
+ compat &= (alloc_value == jnode_is_flushprepped(check));
+ spin_unlock_atom(atom);
+ }
+ spin_unlock_jnode(check);
+ return compat;
+}
+
+/**
+ * Decrement the atom's reference count and if it falls to zero, free it
+ */
+void atom_dec_and_unlock(txn_atom * atom)
+{
+ txn_mgr *mgr = &get_super_private(reiser4_get_current_sb())->tmgr;
+
+ assert("umka-186", atom != NULL);
+ assert_spin_locked(&(atom->alock));
+ assert("zam-1039", atomic_read(&atom->refcount) > 0);
+
+ if (atomic_dec_and_test(&atom->refcount)) {
+ /*
+ * take txnmgr lock and atom lock in proper order
+ */
+ if (!spin_trylock_txnmgr(mgr)) {
+ /*
+ * This atom should exist after we re-acquire its
+ * spinlock, so we increment its reference counter
+ */
+ atomic_inc(&atom->refcount);
+ spin_unlock_atom(atom);
+ spin_lock_txnmgr(mgr);
+ spin_lock_atom(atom);
+
+ if (!atomic_dec_and_test(&atom->refcount)) {
+ spin_unlock_atom(atom);
+ spin_unlock_txnmgr(mgr);
+ return;
+ }
+ }
+ assert_spin_locked(&(mgr->tmgr_lock));
+ free_atom(atom);
+ spin_unlock_txnmgr(mgr);
+ } else
+ spin_unlock_atom(atom);
+}
+
+static txn_atom *__alloc_atom(void)
+{
+ txn_atom *atom;
+
+ atom = kmem_cache_alloc(_atom_slab, reiser4_ctx_gfp_mask_get());
+ if (atom == NULL)
+ return NULL;
+ memset(atom, 0, sizeof(txn_atom));
+ return atom;
+}
+
+static void __free_atom(txn_atom *atom)
+{
+ kmem_cache_free(_atom_slab, atom);
+}
+
+
+/* Create new atom and connect it to given transaction handle. This adds the
+ atom to the transaction manager's list and sets its reference count to 1, an
+ artificial reference which is kept until it commits. We play strange games
+ to avoid allocation under jnode & txnh spinlocks.*/
+
+static int atom_begin_and_assign_to_txnh(txn_atom ** atom_alloc, txn_handle * txnh)
+{
+ txn_atom *atom;
+ txn_mgr *mgr;
+
+ if (REISER4_DEBUG && sb_rdonly(reiser4_get_current_sb())) {
+ warning("nikita-3366", "Creating atom on read-only fs");
+ dump_stack();
+ }
+ if (*atom_alloc == NULL) {
+ *atom_alloc = __alloc_atom();
+
+ if (*atom_alloc == NULL)
+ return RETERR(-ENOMEM);
+ }
+ /*
+ * and, also, txnmgr spin lock should be taken
+ * before jnode and txnh locks
+ */
+ mgr = &get_super_private(reiser4_get_current_sb())->tmgr;
+ spin_lock_txnmgr(mgr);
+ spin_lock_txnh(txnh);
+
+ /* Check whether new atom still needed */
+ if (txnh->atom != NULL) {
+ /* NOTE-NIKITA probably it is rather better to free
+ * atom_alloc here than thread it up to reiser4_try_capture() */
+
+ spin_unlock_txnh(txnh);
+ spin_unlock_txnmgr(mgr);
+
+ return -E_REPEAT;
+ }
+
+ atom = *atom_alloc;
+ *atom_alloc = NULL;
+
+ init_atom(atom);
+ assert("jmacd-17", atom_isclean(atom));
+ /*
+ * lock ordering is broken here. It is ok, as long as @atom is new
+ * and inaccessible for others. We can't use spin_lock_atom or
+ * spin_lock(&atom->alock) because they care about locking
+ * dependencies. spin_trylock_atom() doesn't.
+ */
+ check_me("", spin_trylock_atom(atom));
+
+ /* add atom to the end of transaction manager's list of atoms */
+ list_add_tail(&atom->atom_link, &mgr->atoms_list);
+ atom->atom_id = mgr->id_count++;
+ mgr->atom_count += 1;
+
+ /* Release txnmgr lock */
+ spin_unlock_txnmgr(mgr);
+
+ /* One reference until it commits. */
+ atomic_inc(&atom->refcount);
+ atom->stage = ASTAGE_CAPTURE_FUSE;
+ atom->super = reiser4_get_current_sb();
+ capture_assign_txnh_nolock(atom, txnh);
+
+ spin_unlock_atom(atom);
+ spin_unlock_txnh(txnh);
+
+ return -E_REPEAT;
+}
+
+/**
+ * In some rare cases we need atom to exist before capturing
+ * any nodes
+ */
+int reiser4_create_atom(void)
+{
+ txn_atom *atom_alloc = NULL;
+ txn_handle *txnh = get_current_context()->trans;
+ int ret;
+
+ do {
+ spin_lock_txnh(txnh);
+ if (txnh->atom == NULL) {
+ spin_unlock_txnh(txnh);
+ /*
+ * assign empty atom to the txnh and repeat
+ */
+ ret = atom_begin_and_assign_to_txnh(&atom_alloc, txnh);
+ } else {
+ spin_unlock_txnh(txnh);
+ ret = 0;
+ }
+ } while (ret == -E_REPEAT);
+ return ret;
+}
+
+/* Return true if an atom is currently "open". */
+static int atom_isopen(const txn_atom * atom)
+{
+ assert("umka-185", atom != NULL);
+
+ return atom->stage > 0 && atom->stage < ASTAGE_PRE_COMMIT;
+}
+
+/**
+ * Return the number of pointers to this atom that must be
+ * updated during fusion. This approximates the amount of work to be done.
+ * Fusion chooses the atom with fewer pointers to fuse into the atom with
+ * more pointers
+ */
+static int atom_pointer_count(const txn_atom * atom)
+{
+ assert("umka-187", atom != NULL);
+ /*
+ * This is a measure of the amount of work needed
+ * to fuse this atom into another one
+ */
+ return atom->txnh_count + atom->capture_count;
+}
+
+/**
+ * Called holding the atom lock, this removes the atom
+ * from the transaction manager list and frees it
+ */
+static void free_atom(txn_atom * atom)
+{
+ txn_mgr *mgr = &get_super_private(reiser4_get_current_sb())->tmgr;
+
+ assert("umka-188", atom != NULL);
+ assert_spin_locked(&(atom->alock));
+
+ /* Remove from the txn_mgr's atom list */
+ assert_spin_locked(&(mgr->tmgr_lock));
+ mgr->atom_count -= 1;
+ list_del_init(&atom->atom_link);
+
+ /* Clean the atom */
+ assert("jmacd-16",
+ (atom->stage == ASTAGE_INVALID || atom->stage == ASTAGE_DONE));
+ atom->stage = ASTAGE_FREE;
+
+ atom_dset_destroy(atom);
+
+ assert("jmacd-16", atom_isclean(atom));
+
+ done_atom_bricks_info(atom);
+
+ spin_unlock_atom(atom);
+
+ __free_atom(atom);
+}
+
+static int atom_is_dotard(const txn_atom * atom)
+{
+ return time_after(jiffies, atom->start_time +
+ get_current_super_private()->tmgr.atom_max_age);
+}
+
+static int atom_can_be_committed(txn_atom * atom)
+{
+ assert_spin_locked(&(atom->alock));
+ assert("zam-885", atom->txnh_count > atom->nr_waiters);
+ return atom->txnh_count == atom->nr_waiters + 1;
+}
+
+/* Return true if an atom should commit now. This is determined by aging, atom
+ size or atom flags. */
+static int atom_should_commit(const txn_atom * atom)
+{
+ assert("umka-189", atom != NULL);
+ return
+ (atom->flags & ATOM_FORCE_COMMIT) ||
+ ((unsigned)atom_pointer_count(atom) >
+ get_current_super_private()->tmgr.atom_max_size)
+ || atom_is_dotard(atom);
+}
+
+/* return 1 if current atom exists and requires commit. */
+int current_atom_should_commit(void)
+{
+ txn_atom *atom;
+ int result = 0;
+
+ atom = get_current_atom_locked_nocheck();
+ if (atom) {
+ result = atom_should_commit(atom);
+ spin_unlock_atom(atom);
+ }
+ return result;
+}
+
+static int atom_should_commit_asap(const txn_atom * atom)
+{
+ unsigned int captured;
+ unsigned int pinnedpages;
+
+ assert("nikita-3309", atom != NULL);
+
+ captured = (unsigned)atom->capture_count;
+ pinnedpages = (captured >> PAGE_SHIFT) * sizeof(znode);
+
+ return (pinnedpages > (totalram_pages() >> 3)) || (atom->flushed > 100);
+}
+
+static jnode *find_first_dirty_in_list(struct list_head *head, int flags)
+{
+ jnode *first_dirty;
+
+ list_for_each_entry(first_dirty, head, capture_link) {
+ if (!(flags & JNODE_FLUSH_COMMIT)) {
+ /*
+ * skip jnodes which "heard banshee" or have active
+ * I/O
+ */
+ if (JF_ISSET(first_dirty, JNODE_HEARD_BANSHEE) ||
+ JF_ISSET(first_dirty, JNODE_WRITEBACK))
+ continue;
+ }
+ return first_dirty;
+ }
+ return NULL;
+}
+
+/**
+ * Get first dirty node from the atom's dirty_nodes[n] lists;
+ * return NULL if atom has no dirty nodes on atom's lists
+ */
+jnode *find_first_dirty_jnode(txn_atom *atom, int flags)
+{
+ jnode *first_dirty;
+ tree_level level;
+
+ assert_spin_locked(&(atom->alock));
+ /*
+ * The flush starts from LEAF_LEVEL (=1)
+ */
+ for (level = 1; level < REAL_MAX_ZTREE_HEIGHT + 1; level += 1) {
+ if (list_empty_careful(ATOM_DIRTY_LIST(atom, level)))
+ continue;
+
+ first_dirty =
+ find_first_dirty_in_list(ATOM_DIRTY_LIST(atom, level),
+ flags);
+ if (first_dirty)
+ return first_dirty;
+ }
+ /*
+ * znode-above-root is on the list #0
+ */
+ return find_first_dirty_in_list(ATOM_DIRTY_LIST(atom, 0), flags);
+}
+
+static void dispatch_wb_list(txn_atom *atom, flush_queue_t *fq)
+{
+ jnode *cur;
+
+ assert("zam-905", atom_is_protected(atom));
+
+ cur = list_entry(ATOM_WB_LIST(atom)->next, jnode, capture_link);
+
+ while (ATOM_WB_LIST(atom) != &cur->capture_link) {
+
+ jnode *next = list_entry(cur->capture_link.next,
+ jnode, capture_link);
+ spin_lock_jnode(cur);
+ if (!JF_ISSET(cur, JNODE_WRITEBACK)) {
+ if (JF_ISSET(cur, JNODE_DIRTY)) {
+ queue_jnode(fq, cur);
+ } else {
+ /*
+ * move jnode to atom's clean list
+ */
+ list_move_tail(&cur->capture_link,
+ ATOM_CLEAN_LIST(atom));
+ }
+ }
+ spin_unlock_jnode(cur);
+ cur = next;
+ }
+}
+
+/**
+ * Scan current atom->writeback_nodes list,
+ * re-submit dirty and !writeback jnodes to disk
+ */
+static int submit_wb_list(void)
+{
+ int ret;
+ flush_queue_t *fq;
+
+ fq = get_fq_for_current_atom();
+ if (IS_ERR(fq))
+ return PTR_ERR(fq);
+
+ dispatch_wb_list(fq->atom, fq);
+ spin_unlock_atom(fq->atom);
+
+ ret = reiser4_write_fq(fq, NULL, 1);
+ reiser4_fq_put(fq);
+
+ return ret;
+}
+
+/**
+ * Wait completion of all writes,
+ * re-submit atom writeback list if needed
+ */
+static int current_atom_complete_writes(void)
+{
+ int ret;
+ /*
+ * Each jnode from that list was modified and dirtied when it had i/o
+ * request running already. After i/o completion we have to resubmit
+ * them to disk again
+ */
+ ret = submit_wb_list();
+ if (ret < 0)
+ return ret;
+ /*
+ * Wait all i/o completion
+ */
+ ret = current_atom_finish_all_fq();
+ if (ret)
+ return ret;
+ /*
+ * Scan wb list again; all i/o should be completed, we re-submit dirty
+ * nodes to disk
+ */
+ ret = submit_wb_list();
+ if (ret < 0)
+ return ret;
+ /*
+ * Wait all nodes we just submitted
+ */
+ return current_atom_finish_all_fq();
+}
+
+#if REISER4_DEBUG
+
+static void reiser4_info_atom(const char *prefix, const txn_atom * atom)
+{
+ if (atom == NULL) {
+ printk("%s: no atom\n", prefix);
+ return;
+ }
+
+ printk("%s: refcount: %i id: %i flags: %x txnh_count: %i"
+ " capture_count: %i stage: %x start: %lu, flushed: %i\n", prefix,
+ atomic_read(&atom->refcount), atom->atom_id, atom->flags,
+ atom->txnh_count, atom->capture_count, atom->stage,
+ atom->start_time, atom->flushed);
+}
+#else /* REISER4_DEBUG */
+static inline void reiser4_info_atom(const char *prefix,
+ const txn_atom *atom) {}
+#endif /* REISER4_DEBUG */
+
+#define TOOMANYFLUSHES (1 << 13)
+
+/* Called with the atom locked and no open "active" transaction handles except
+ ours, this function calls flush_current_atom() until all dirty nodes are
+ processed. Then it initiates commit processing.
+
+ Called by the single remaining open "active" txnh, which is closing. Other
+ open txnhs belong to processes which wait for atom commit in the commit_txnh()
+ routine. They are counted as "waiters" in atom->nr_waiters. Therefore as
+ long as we hold the atom lock none of the jnodes can be captured and/or
+ locked.
+
+ Return value is an error code if commit fails.
+*/
+static int commit_current_atom(long *nr_submitted, txn_atom ** atom)
+{
+ long ret = 0;
+ reiser4_super_info_data *sbinfo = get_current_super_private();
+ int flushiters; /* how many times jnode_flush() was called as a part
+ of attempt to commit this atom. */
+
+ assert("zam-888", atom != NULL && *atom != NULL);
+ assert_spin_locked(&((*atom)->alock));
+ assert("zam-887", get_current_context()->trans->atom == *atom);
+ assert("jmacd-151", atom_isopen(*atom));
+
+ assert("nikita-3184",
+ get_current_super_private()->delete_mutex_owner != current);
+
+ for (flushiters = 0;; ++flushiters) {
+ ret =
+ flush_current_atom(JNODE_FLUSH_WRITE_BLOCKS |
+ JNODE_FLUSH_COMMIT,
+ LONG_MAX /* nr_to_write */ ,
+ nr_submitted, atom, NULL);
+ if (ret != -E_REPEAT)
+ break;
+ /*
+ * if atom's dirty list contains one znode which is
+ * HEARD_BANSHEE and is locked we have to allow lock
+ * owner to continue and uncapture that znode
+ */
+ reiser4_preempt_point();
+
+ *atom = get_current_atom_locked();
+
+ if (flushiters > TOOMANYFLUSHES && IS_POW(flushiters)) {
+ warning("nikita-3176",
+ "Flushing like mad: %i", flushiters);
+ reiser4_info_atom("atom", *atom);
+ DEBUGON(flushiters > (1 << 20));
+ }
+ }
+ if (ret)
+ return ret;
+ assert_spin_locked(&((*atom)->alock));
+
+ if (!atom_can_be_committed(*atom)) {
+ spin_unlock_atom(*atom);
+ return RETERR(-E_REPEAT);
+ }
+ if ((*atom)->capture_count == 0)
+ goto done;
+ /*
+ * Up to this point we have been flushing, and after each flush call we
+ * returned -E_REPEAT. Now we can commit. We cannot return -E_REPEAT
+ * at this point; commit should be successful
+ */
+ reiser4_atom_set_stage(*atom, ASTAGE_PRE_COMMIT);
+ ON_DEBUG(((*atom)->committer = current));
+ spin_unlock_atom(*atom);
+
+ ret = current_atom_complete_writes();
+ if (ret)
+ return ret;
+
+ assert("zam-906", list_empty(ATOM_WB_LIST(*atom)));
+ /*
+ * isolate critical code path which should be executed by only one
+ * thread using tmgr mutex
+ */
+ mutex_lock(&sbinfo->tmgr.commit_mutex);
+
+ ret = reiser4_write_logs(nr_submitted);
+ if (ret < 0)
+ reiser4_panic("zam-597", "write log failed (%ld)\n", ret);
+ /*
+ * The atom->ovrwr_nodes list is processed under commit mutex held
+ * because of bitmap nodes which are captured by special way in
+ * reiser4_pre_commit_hook_bitmap(), that way does not include
+ * capture_fuse_wait() as a capturing of other nodes does -- the commit
+ * mutex is used for transaction isolation instead
+ */
+ assert("edward-1774", list_empty(ATOM_OVRWR_LIST(*atom)));
+
+ mutex_unlock(&sbinfo->tmgr.commit_mutex);
+
+ reiser4_invalidate_list(ATOM_CLEAN_LIST(*atom));
+ reiser4_invalidate_list(ATOM_WB_LIST(*atom));
+ assert("zam-927", list_empty(&(*atom)->inodes));
+
+ spin_lock_atom(*atom);
+ done:
+ reiser4_atom_set_stage(*atom, ASTAGE_DONE);
+ ON_DEBUG((*atom)->committer = NULL);
+
+ /* Atom's state changes, so wake up everybody waiting for this
+ event. */
+ wakeup_atom_waiting_list(*atom);
+ /*
+ * Decrement the "until commit" reference, at least one txnh
+ * (the caller) is still open
+ */
+ atomic_dec(&(*atom)->refcount);
+
+ assert("jmacd-1070", atomic_read(&(*atom)->refcount) > 0);
+ assert("jmacd-1062", (*atom)->capture_count == 0);
+ BUG_ON((*atom)->capture_count != 0);
+ assert_spin_locked(&((*atom)->alock));
+
+ return ret;
+}
+
+/* TXN_TXNH */
+
+/**
+ * force_commit_atom - commit current atom and wait commit completion
+ * @txnh:
+ *
+ * Commits current atom and wait commit completion; current atom and @txnh have
+ * to be spinlocked before call, this function unlocks them on exit.
+ */
+int force_commit_atom(txn_handle *txnh)
+{
+ txn_atom *atom;
+
+ assert("zam-837", txnh != NULL);
+ assert_spin_locked(&(txnh->hlock));
+ assert("nikita-2966", lock_stack_isclean(get_current_lock_stack()));
+
+ atom = txnh->atom;
+
+ assert("zam-834", atom != NULL);
+ assert_spin_locked(&(atom->alock));
+
+ /*
+ * Set flags for atom and txnh: forcing atom commit and waiting for
+ * commit completion
+ */
+ txnh->flags |= TXNH_WAIT_COMMIT;
+ atom->flags |= ATOM_FORCE_COMMIT;
+
+ spin_unlock_txnh(txnh);
+ spin_unlock_atom(atom);
+
+ /* commit is here */
+ reiser4_txn_restart_current();
+ return 0;
+}
+
+int force_commit_current_atom(void)
+{
+ txn_atom *atom;
+ txn_handle *th;
+
+ th = get_current_context()->trans;
+ atom = get_current_atom_locked();
+ assert("vpf-1906", atom != NULL);
+ spin_lock_txnh(th);
+ return force_commit_atom(th);
+}
+
+/* Called to force commit of any outstanding atoms. @commit_all_atoms controls
+ * whether we commit all atoms, including new ones which are created after this
+ * function is called. */
+int txnmgr_force_commit_all(struct super_block *super, int commit_all_atoms)
+{
+ int ret;
+ txn_atom *atom;
+ txn_mgr *mgr;
+ txn_handle *txnh;
+ unsigned long start_time = jiffies;
+ reiser4_context *ctx = get_current_context();
+
+ assert("nikita-2965", lock_stack_isclean(get_current_lock_stack()));
+ assert("nikita-3058", reiser4_commit_check_locks());
+
+ reiser4_txn_restart_current();
+
+ mgr = &get_super_private(super)->tmgr;
+
+ txnh = ctx->trans;
+
+ again:
+
+ spin_lock_txnmgr(mgr);
+
+ list_for_each_entry(atom, &mgr->atoms_list, atom_link) {
+ spin_lock_atom(atom);
+
+ /* Commit any atom which can be committed. If @commit_all_atoms
+ * is not set we commit only atoms which were created before
+ * this call started. */
+ if (commit_all_atoms
+ || time_before_eq(atom->start_time, start_time)) {
+ if (atom->stage <= ASTAGE_POST_COMMIT) {
+ spin_unlock_txnmgr(mgr);
+
+ if (atom->stage < ASTAGE_PRE_COMMIT) {
+ spin_lock_txnh(txnh);
+ /* Add force-context txnh */
+ capture_assign_txnh_nolock(atom, txnh);
+ ret = force_commit_atom(txnh);
+ if (ret)
+ return ret;
+ } else
+ /* wait atom commit */
+ reiser4_atom_wait_event(atom);
+
+ goto again;
+ }
+ }
+
+ spin_unlock_atom(atom);
+ }
+ assert("edward-2273",
+ ergo(commit_all_atoms,
+ reiser4_volume_fake_allocated(super) == 0));
+
+ spin_unlock_txnmgr(mgr);
+ return 0;
+}
+
+/* check whether commit_some_atoms() can commit @atom. Locking is up to the
+ * caller */
+static int atom_is_committable(txn_atom * atom)
+{
+ return
+ atom->stage < ASTAGE_PRE_COMMIT &&
+ atom->txnh_count == atom->nr_waiters && atom_should_commit(atom);
+}
+
+/* called periodically from ktxnmgrd to commit old atoms. Releases ktxnmgrd spin
+ * lock at exit */
+int commit_some_atoms(txn_mgr * mgr)
+{
+ int ret = 0;
+ txn_atom *atom;
+ txn_handle *txnh;
+ reiser4_context *ctx;
+ struct list_head *pos, *tmp;
+
+ ctx = get_current_context();
+ assert("nikita-2444", ctx != NULL);
+
+ txnh = ctx->trans;
+ spin_lock_txnmgr(mgr);
+
+ /*
+ * this is to avoid a gcc complaint that atom might be used
+ * uninitialized
+ */
+ atom = NULL;
+
+ /* look for atom to commit */
+ list_for_each_safe(pos, tmp, &mgr->atoms_list) {
+ atom = list_entry(pos, txn_atom, atom_link);
+ /*
+ * first test without taking atom spin lock, whether it is
+ * eligible for committing at all
+ */
+ if (atom_is_committable(atom)) {
+ /* now, take spin lock and re-check */
+ spin_lock_atom(atom);
+ if (atom_is_committable(atom))
+ break;
+ spin_unlock_atom(atom);
+ }
+ }
+
+ ret = (&mgr->atoms_list == pos);
+ spin_unlock_txnmgr(mgr);
+
+ if (ret) {
+ /* nothing found */
+ spin_unlock(&mgr->daemon->guard);
+ return 0;
+ }
+
+ spin_lock_txnh(txnh);
+
+ BUG_ON(atom == NULL);
+ /* Set the atom to force committing */
+ atom->flags |= ATOM_FORCE_COMMIT;
+
+ /* Add force-context txnh */
+ capture_assign_txnh_nolock(atom, txnh);
+
+ spin_unlock_txnh(txnh);
+ spin_unlock_atom(atom);
+
+ /* we are about to release daemon spin lock, notify daemon it
+ has to rescan atoms */
+ mgr->daemon->rescan = 1;
+ spin_unlock(&mgr->daemon->guard);
+ reiser4_txn_restart_current();
+ return 0;
+}
+
+static int txn_try_to_fuse_small_atom(txn_mgr * tmgr, txn_atom * atom)
+{
+ int atom_stage;
+ txn_atom *atom_2;
+ int repeat;
+
+ assert("zam-1051", atom->stage < ASTAGE_PRE_COMMIT);
+
+ atom_stage = atom->stage;
+ repeat = 0;
+
+ if (!spin_trylock_txnmgr(tmgr)) {
+ atomic_inc(&atom->refcount);
+ spin_unlock_atom(atom);
+ spin_lock_txnmgr(tmgr);
+ spin_lock_atom(atom);
+ repeat = 1;
+ if (atom->stage != atom_stage) {
+ spin_unlock_txnmgr(tmgr);
+ atom_dec_and_unlock(atom);
+ return -E_REPEAT;
+ }
+ atomic_dec(&atom->refcount);
+ }
+
+ list_for_each_entry(atom_2, &tmgr->atoms_list, atom_link) {
+ if (atom == atom_2)
+ continue;
+ /*
+ * if trylock does not succeed we just do not fuse with that
+ * atom.
+ */
+ if (spin_trylock_atom(atom_2)) {
+ if (atom_2->stage < ASTAGE_PRE_COMMIT) {
+ spin_unlock_txnmgr(tmgr);
+ capture_fuse_into(atom_2, atom);
+ /* all locks are lost we can only repeat here */
+ return -E_REPEAT;
+ }
+ spin_unlock_atom(atom_2);
+ }
+ }
+ atom->flags |= ATOM_CANCEL_FUSION;
+ spin_unlock_txnmgr(tmgr);
+ if (repeat) {
+ spin_unlock_atom(atom);
+ return -E_REPEAT;
+ }
+ return 0;
+}
+
+/* Calls jnode_flush() for the current atom if it exists; if not, just takes
+ another atom and calls jnode_flush() for it. If the current transaction handle
+ already has an assigned atom (the current atom) we have to close the current
+ transaction prior to switching to another atom, or do something with the
+ current atom. This code tries to flush the current atom.
+
+ flush_some_atom() is called as part of memory clearing process. It is
+ invoked from balance_dirty_pages(), pdflushd, and entd.
+
+ If we can flush no nodes, atom is committed, because this frees memory.
+
+ If atom is too large or too old it is committed also.
+*/
+int
+flush_some_atom(jnode * start, long *nr_submitted, const struct writeback_control *wbc,
+ int flags)
+{
+ reiser4_context *ctx = get_current_context();
+ txn_mgr *tmgr = &get_super_private(ctx->super)->tmgr;
+ txn_handle *txnh = ctx->trans;
+ txn_atom *atom;
+ int ret;
+
+ BUG_ON(wbc->nr_to_write == 0);
+ BUG_ON(*nr_submitted != 0);
+ assert("zam-1042", txnh != NULL);
+repeat:
+ if (txnh->atom == NULL) {
+ /* current atom is not available, take first from txnmgr */
+ spin_lock_txnmgr(tmgr);
+
+ /* traverse the list of all atoms */
+ list_for_each_entry(atom, &tmgr->atoms_list, atom_link) {
+ /* lock atom before checking its state */
+ spin_lock_atom(atom);
+ /*
+ * we need an atom which is not being committed and
+ * which has no flushers (jnode_flush() adds one flusher
+ * at the beginning and subtract one at the end).
+ */
+ if (atom->stage < ASTAGE_PRE_COMMIT &&
+ atom->nr_flushers == 0) {
+ spin_lock_txnh(txnh);
+ capture_assign_txnh_nolock(atom, txnh);
+ spin_unlock_txnh(txnh);
+
+ goto found;
+ }
+
+ spin_unlock_atom(atom);
+ }
+
+ /*
+ * Write throttling is the case when no atom can be
+ * flushed/committed. */
+ if (!ctx->flush_bd_task) {
+ list_for_each_entry(atom, &tmgr->atoms_list, atom_link) {
+ spin_lock_atom(atom);
+ /* Repeat the check from the above. */
+ if (atom->stage < ASTAGE_PRE_COMMIT
+ && atom->nr_flushers == 0) {
+ spin_lock_txnh(txnh);
+ capture_assign_txnh_nolock(atom, txnh);
+ spin_unlock_txnh(txnh);
+
+ goto found;
+ }
+ if (atom->stage <= ASTAGE_POST_COMMIT) {
+ spin_unlock_txnmgr(tmgr);
+ /*
+ * we just wait until atom's flusher
+ * makes progress in flushing or
+ * committing the atom
+ */
+ reiser4_atom_wait_event(atom);
+ goto repeat;
+ }
+ spin_unlock_atom(atom);
+ }
+ }
+ spin_unlock_txnmgr(tmgr);
+ return 0;
+ found:
+ spin_unlock_txnmgr(tmgr);
+ } else
+ atom = get_current_atom_locked();
+
+ BUG_ON(atom->super != ctx->super);
+ assert("vs-35", atom->super == ctx->super);
+ if (start) {
+ spin_lock_jnode(start);
+ ret = (atom == start->atom) ? 1 : 0;
+ spin_unlock_jnode(start);
+ if (ret == 0)
+ start = NULL;
+ }
+ ret = flush_current_atom(flags, wbc->nr_to_write, nr_submitted, &atom, start);
+ if (ret == 0) {
+ /* flush_current_atom returns 0 only if it submitted nothing
+ for write */
+ BUG_ON(*nr_submitted != 0);
+ if (*nr_submitted == 0 || atom_should_commit_asap(atom)) {
+ if (atom->capture_count < tmgr->atom_min_size &&
+ !(atom->flags & ATOM_CANCEL_FUSION)) {
+ ret = txn_try_to_fuse_small_atom(tmgr, atom);
+ if (ret == -E_REPEAT) {
+ reiser4_preempt_point();
+ goto repeat;
+ }
+ }
+ /* if early flushing could not make more nodes clean,
+ * or atom is too old/large,
+ * we force current atom to commit */
+ /* wait for commit completion but only if this
+ * wouldn't stall pdflushd and ent thread. */
+ if (!ctx->entd)
+ txnh->flags |= TXNH_WAIT_COMMIT;
+ atom->flags |= ATOM_FORCE_COMMIT;
+ }
+ spin_unlock_atom(atom);
+ } else if (ret == -E_REPEAT) {
+ if (*nr_submitted == 0) {
+ /* let others who hamper flushing (hold long-term locks,
+ for instance) free the way for flush */
+ reiser4_preempt_point();
+ goto repeat;
+ }
+ ret = 0;
+ }
+/*
+ if (*nr_submitted > wbc->nr_to_write)
+ warning("", "asked for %ld, written %ld\n", wbc->nr_to_write, *nr_submitted);
+*/
+ reiser4_txn_restart(ctx);
+
+ return ret;
+}
+
+/* Remove processed nodes from the atom's clean list (thereby removing them from the transaction). */
+void reiser4_invalidate_list(struct list_head *head)
+{
+ while (!list_empty(head)) {
+ jnode *node;
+
+ node = list_entry(head->next, jnode, capture_link);
+ spin_lock_jnode(node);
+ reiser4_uncapture_block(node);
+ jput(node);
+ }
+}
+
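+/* Initialize wait links for the current thread: record its lock stack and
+   reset the list links and wake-up callbacks. */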
+static void init_wlinks(txn_wait_links * wlinks)
+{
+ wlinks->_lock_stack = get_current_lock_stack();
+ INIT_LIST_HEAD(&wlinks->_fwaitfor_link);
+ INIT_LIST_HEAD(&wlinks->_fwaiting_link);
+ wlinks->waitfor_cb = NULL;
+ wlinks->waiting_cb = NULL;
+}
+
+/* Add the current thread to the atom's waitfor list and wait for somebody to wake us up. */
+void reiser4_atom_wait_event(txn_atom * atom)
+{
+ txn_wait_links _wlinks;
+
+ assert_spin_locked(&(atom->alock));
+ assert("nikita-3156",
+ lock_stack_isclean(get_current_lock_stack()) ||
+ atom->nr_running_queues > 0);
+
+ init_wlinks(&_wlinks);
+ list_add_tail(&_wlinks._fwaitfor_link, &atom->fwaitfor_list);
+ atomic_inc(&atom->refcount);
+ spin_unlock_atom(atom);
+
+ reiser4_prepare_to_sleep(_wlinks._lock_stack);
+ reiser4_go_to_sleep(_wlinks._lock_stack);
+
+ spin_lock_atom(atom);
+ list_del(&_wlinks._fwaitfor_link);
+ atom_dec_and_unlock(atom);
+}
+
+void reiser4_atom_set_stage(txn_atom * atom, txn_stage stage)
+{
+ assert("nikita-3535", atom != NULL);
+ assert_spin_locked(&(atom->alock));
+ assert("nikita-3536", stage <= ASTAGE_INVALID);
+ /* Excelsior! */
+ assert("nikita-3537", stage >= atom->stage);
+ if (atom->stage != stage) {
+ atom->stage = stage;
+ reiser4_atom_send_event(atom);
+ }
+}
+
+/* wake all threads which wait for an event */
+void reiser4_atom_send_event(txn_atom * atom)
+{
+ assert_spin_locked(&(atom->alock));
+ wakeup_atom_waitfor_list(atom);
+}
+
+/* Informs the txn manager code that the owner of this txn_handle should wait for atom commit
+   completion (for example, because it is doing fsync(2)) */
+static int should_wait_commit(txn_handle * h)
+{
+ return h->flags & TXNH_WAIT_COMMIT;
+}
+
+typedef struct commit_data {
+ txn_atom *atom;
+ txn_handle *txnh;
+ long nr_written;
+	/* as an optimization we start committing the atom by first trying to
+	 * flush it a few times without switching into ASTAGE_CAPTURE_WAIT.
+	 * This reduces stalls due to other threads waiting for the atom in
+	 * the ASTAGE_CAPTURE_WAIT stage. ->preflush is the counter of these
+	 * preliminary flushes. */
+ int preflush;
+	/* have we incremented atom->nr_waiters and waited on the atom? */
+	int wait;
+	/* set when commit_current_atom() failed with a hard error */
+	int failed;
+	/* should ktxnmgrd be woken up to commit this atom asynchronously? */
+	int wake_ktxnmgrd_up;
+} commit_data;
+
+/*
+ * Called from commit_txnh() repeatedly, until either error happens, or atom
+ * commits successfully.
+ */
+static int try_commit_txnh(commit_data * cd)
+{
+ int result;
+
+ assert("nikita-2968", lock_stack_isclean(get_current_lock_stack()));
+
+ /* Get the atom and txnh locked. */
+ cd->atom = txnh_get_atom(cd->txnh);
+ assert("jmacd-309", cd->atom != NULL);
+ spin_unlock_txnh(cd->txnh);
+
+ if (cd->wait) {
+ cd->atom->nr_waiters--;
+ cd->wait = 0;
+ }
+
+ if (cd->atom->stage == ASTAGE_DONE)
+ return 0;
+
+ if (cd->failed)
+ return 0;
+
+ if (atom_should_commit(cd->atom)) {
+ /* if atom is _very_ large schedule it for commit as soon as
+ * possible. */
+ if (atom_should_commit_asap(cd->atom)) {
+ /*
+ * When atom is in PRE_COMMIT or later stage following
+ * invariant (encoded in atom_can_be_committed())
+ * holds: there is exactly one non-waiter transaction
+ * handle opened on this atom. When thread wants to
+ * wait until atom commits (for example sync()) it
+ * waits on atom event after increasing
+			 * atom->nr_waiters (see below in this function). It
+ * cannot be guaranteed that atom is already committed
+ * after receiving event, so loop has to be
+ * re-started. But if atom switched into PRE_COMMIT
+ * stage and became too large, we cannot change its
+ * state back to CAPTURE_WAIT (atom stage can only
+ * increase monotonically), hence this check.
+ */
+ if (cd->atom->stage < ASTAGE_CAPTURE_WAIT)
+ reiser4_atom_set_stage(cd->atom,
+ ASTAGE_CAPTURE_WAIT);
+ cd->atom->flags |= ATOM_FORCE_COMMIT;
+ }
+ if (cd->txnh->flags & TXNH_DONT_COMMIT) {
+ /*
+ * this thread (transaction handle that is) doesn't
+ * want to commit atom. Notify waiters that handle is
+ * closed. This can happen, for example, when we are
+ * under VFS directory lock and don't want to commit
+ * atom right now to avoid stalling other threads
+ * working in the same directory.
+ */
+
+ /* Wake the ktxnmgrd up if the ktxnmgrd is needed to
+ * commit this atom: no atom waiters and only one
+ * (our) open transaction handle. */
+ cd->wake_ktxnmgrd_up =
+ cd->atom->txnh_count == 1 &&
+ cd->atom->nr_waiters == 0;
+ reiser4_atom_send_event(cd->atom);
+ result = 0;
+ } else if (!atom_can_be_committed(cd->atom)) {
+ if (should_wait_commit(cd->txnh)) {
+ /* sync(): wait for commit */
+ cd->atom->nr_waiters++;
+ cd->wait = 1;
+ reiser4_atom_wait_event(cd->atom);
+ result = RETERR(-E_REPEAT);
+ } else {
+ result = 0;
+ }
+ } else if (cd->preflush > 0 && !is_current_ktxnmgrd()) {
+ /*
+ * optimization: flush atom without switching it into
+ * ASTAGE_CAPTURE_WAIT.
+ *
+ * But don't do this for ktxnmgrd, because ktxnmgrd
+ * should never block on atom fusion.
+ */
+ result = flush_current_atom(JNODE_FLUSH_WRITE_BLOCKS,
+ LONG_MAX, &cd->nr_written,
+ &cd->atom, NULL);
+ if (result == 0) {
+ spin_unlock_atom(cd->atom);
+ cd->preflush = 0;
+ result = RETERR(-E_REPEAT);
+			} else	/* Atom wasn't flushed
+				 * completely. Rinse. Repeat. */
+ --cd->preflush;
+ } else {
+			/* We change the atom state to ASTAGE_CAPTURE_WAIT to
+			   prevent atom fusion and count ourselves as an active
+			   flusher */
+ reiser4_atom_set_stage(cd->atom, ASTAGE_CAPTURE_WAIT);
+ cd->atom->flags |= ATOM_FORCE_COMMIT;
+
+ result =
+ commit_current_atom(&cd->nr_written, &cd->atom);
+ if (result != 0 && result != -E_REPEAT)
+ cd->failed = 1;
+ }
+ } else
+ result = 0;
+
+#if REISER4_DEBUG
+ if (result == 0)
+ assert_spin_locked(&(cd->atom->alock));
+#endif
+
+ /* perfectly valid assertion, except that when atom/txnh is not locked
+ * fusion can take place, and cd->atom points nowhere. */
+ /*
+ assert("jmacd-1028", ergo(result != 0, spin_atom_is_not_locked(cd->atom)));
+ */
+ return result;
+}
+
+/* Called to commit a transaction handle. This decrements the atom's number of open
+   handles and, if it is the last handle to commit and the atom should commit, initiates
+   atom commit. If commit does not fail, the number of written blocks is returned. */
+static int commit_txnh(txn_handle * txnh)
+{
+ commit_data cd;
+ assert("umka-192", txnh != NULL);
+
+ memset(&cd, 0, sizeof cd);
+ cd.txnh = txnh;
+ cd.preflush = 10;
+
+ /* calls try_commit_txnh() until either atom commits, or error
+ * happens */
+ while (try_commit_txnh(&cd) != 0)
+ reiser4_preempt_point();
+
+ spin_lock_txnh(txnh);
+
+ cd.atom->txnh_count -= 1;
+ txnh->atom = NULL;
+ /* remove transaction handle from atom's list of transaction handles */
+ list_del_init(&txnh->txnh_link);
+
+ spin_unlock_txnh(txnh);
+ atom_dec_and_unlock(cd.atom);
+	/* if we don't want to do the commit in the current thread
+	 * (TXNH_DONT_COMMIT is set, probably because it takes time), that
+	 * work is done asynchronously by the ktxnmgrd daemon. */
+ if (cd.wake_ktxnmgrd_up)
+ ktxnmgrd_kick(&get_current_super_private()->tmgr);
+
+ return 0;
+}
+
+/* TRY_CAPTURE */
+
+/* This routine attempts a single block-capture request. It may return -E_REPEAT if some
+ condition indicates that the request should be retried, and it may block if the
+ txn_capture mode does not include the TXN_CAPTURE_NONBLOCKING request flag.
+
+ This routine encodes the basic logic of block capturing described by:
+
+ http://namesys.com/v4/v4.html
+
+ Our goal here is to ensure that any two blocks that contain dependent modifications
+ should commit at the same time. This function enforces this discipline by initiating
+ fusion whenever a transaction handle belonging to one atom requests to read or write a
+ block belonging to another atom (TXN_CAPTURE_WRITE or TXN_CAPTURE_READ_ATOMIC).
+
+ In addition, this routine handles the initial assignment of atoms to blocks and
+ transaction handles. These are possible outcomes of this function:
+
+ 1. The block and handle are already part of the same atom: return immediate success
+
+ 2. The block is assigned but the handle is not: call capture_assign_txnh to assign
+ the handle to the block's atom.
+
+ 3. The handle is assigned but the block is not: call capture_assign_block to assign
+ the block to the handle's atom.
+
+ 4. Both handle and block are assigned, but to different atoms: call capture_init_fusion
+ to fuse atoms.
+
+ 5. Neither block nor handle are assigned: create a new atom and assign them both.
+
+ 6. A read request for a non-captured block: return immediate success.
+
+ This function acquires and releases the handle's spinlock. This function is called
+ under the jnode lock and if the return value is 0, it returns with the jnode lock still
+ held. If the return is -E_REPEAT or some other error condition, the jnode lock is
+   released. The external interface (reiser4_try_capture) manages re-acquiring the jnode
+ lock in the failure case.
+*/
+int try_capture_block(
+ txn_handle * txnh, jnode * node, txn_capture mode,
+ txn_atom ** atom_alloc)
+{
+ txn_atom *block_atom;
+ txn_atom *txnh_atom;
+
+ /* Should not call capture for READ_NONCOM requests, handled in reiser4_try_capture. */
+ assert("jmacd-567", CAPTURE_TYPE(mode) != TXN_CAPTURE_READ_NONCOM);
+
+ /* FIXME-ZAM-HANS: FIXME_LATER_JMACD Should assert that atom->tree ==
+ * node->tree somewhere. */
+ assert("umka-194", txnh != NULL);
+ assert("umka-195", node != NULL);
+
+ /* The jnode is already locked! Being called from reiser4_try_capture(). */
+ assert_spin_locked(&(node->guard));
+ block_atom = node->atom;
+
+	/* Get the txnh spinlock; this allows us to compare txn_atom pointers, but it
+	   doesn't let us touch the atoms themselves. */
+ spin_lock_txnh(txnh);
+ txnh_atom = txnh->atom;
+	/* Capturing proceeds along one of four branches, depending on which of
+	   the two atoms (the block's atom, node->atom, and the handle's atom,
+	   txnh->atom) exist. */
+ if (txnh_atom == NULL) {
+ if (block_atom == NULL) {
+ spin_unlock_txnh(txnh);
+ spin_unlock_jnode(node);
+ /* assign empty atom to the txnh and repeat */
+ return atom_begin_and_assign_to_txnh(atom_alloc, txnh);
+ } else {
+ atomic_inc(&block_atom->refcount);
+ /* node spin-lock isn't needed anymore */
+ spin_unlock_jnode(node);
+ if (!spin_trylock_atom(block_atom)) {
+ spin_unlock_txnh(txnh);
+ spin_lock_atom(block_atom);
+ spin_lock_txnh(txnh);
+ }
+ /* re-check state after getting txnh and the node
+ * atom spin-locked */
+ if (node->atom != block_atom || txnh->atom != NULL) {
+ spin_unlock_txnh(txnh);
+ atom_dec_and_unlock(block_atom);
+ return RETERR(-E_REPEAT);
+ }
+ atomic_dec(&block_atom->refcount);
+ if (block_atom->stage > ASTAGE_CAPTURE_WAIT ||
+ (block_atom->stage == ASTAGE_CAPTURE_WAIT &&
+ block_atom->txnh_count != 0))
+ return capture_fuse_wait(txnh, block_atom, NULL, mode);
+ capture_assign_txnh_nolock(block_atom, txnh);
+ spin_unlock_txnh(txnh);
+ spin_unlock_atom(block_atom);
+ return RETERR(-E_REPEAT);
+ }
+ } else {
+		/* It is time to perform a deadlock prevention check over the
+		   node we want to capture. It is possible this node was locked
+		   for read without capturing it. The optimization which allows
+		   this helps us keep atoms independent as long as possible, but
+		   it may cause lock/fuse deadlock problems.
+
+		   A number of similar deadlock situations with locked but not
+		   captured nodes were found. In each situation there are two
+		   or more threads: one of them does flushing while another one
+		   does routine balancing or tree lookup. The flushing thread
+		   (F) sleeps in a long-term lock request for node (N), while
+		   another thread (A) sleeps trying to capture some node already
+		   belonging to F's atom, which is in a state that prevents
+		   immediate fusion.
+
+		   Deadlocks of this kind cannot happen if node N was properly
+		   captured by thread A: thread F fuses atoms before locking,
+		   therefore the current atoms of threads F and A become the
+		   same atom and thread A may proceed. This does not work if
+		   node N was not captured, because the atom fusion does not
+		   happen.
+
+		   The following scheme solves the deadlock: if
+		   longterm_lock_znode locks and does not capture a znode, that
+		   znode is marked as MISSED_IN_CAPTURE. A node marked this way
+		   is processed by the code below, which restores the missed
+		   capture and fuses the current atoms of all the node's lock
+		   owners by calling the fuse_not_fused_lock_owners() function. */
+ if (JF_ISSET(node, JNODE_MISSED_IN_CAPTURE)) {
+ JF_CLR(node, JNODE_MISSED_IN_CAPTURE);
+ if (jnode_is_znode(node) && znode_is_locked(JZNODE(node))) {
+ spin_unlock_txnh(txnh);
+ spin_unlock_jnode(node);
+ fuse_not_fused_lock_owners(txnh, JZNODE(node));
+ return RETERR(-E_REPEAT);
+ }
+ }
+ if (block_atom == NULL) {
+ atomic_inc(&txnh_atom->refcount);
+ spin_unlock_txnh(txnh);
+ if (!spin_trylock_atom(txnh_atom)) {
+ spin_unlock_jnode(node);
+ spin_lock_atom(txnh_atom);
+ spin_lock_jnode(node);
+ }
+ if (txnh->atom != txnh_atom || node->atom != NULL
+ || JF_ISSET(node, JNODE_IS_DYING)) {
+ spin_unlock_jnode(node);
+ atom_dec_and_unlock(txnh_atom);
+ return RETERR(-E_REPEAT);
+ }
+ atomic_dec(&txnh_atom->refcount);
+ capture_assign_block_nolock(txnh_atom, node);
+ spin_unlock_atom(txnh_atom);
+ } else {
+ if (txnh_atom != block_atom) {
+ if (mode & TXN_CAPTURE_DONT_FUSE) {
+ spin_unlock_txnh(txnh);
+ spin_unlock_jnode(node);
+ /* we are in a "no-fusion" mode and @node is
+ * already part of transaction. */
+ return RETERR(-E_NO_NEIGHBOR);
+ }
+ return capture_init_fusion(node, txnh, mode);
+ }
+ spin_unlock_txnh(txnh);
+ }
+ }
+ return 0;
+}
+
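+/* Compute the txn_capture request mode for @node from the requested znode
+   lock mode and the caller-supplied flags. Returns 0 when no capture is
+   needed (a read lock on a node that is not part of any atom). */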
+static txn_capture
+build_capture_mode(jnode * node, znode_lock_mode lock_mode, txn_capture flags)
+{
+ txn_capture cap_mode;
+
+ assert_spin_locked(&(node->guard));
+
+ /* FIXME_JMACD No way to set TXN_CAPTURE_READ_MODIFY yet. */
+
+ if (lock_mode == ZNODE_WRITE_LOCK) {
+ cap_mode = TXN_CAPTURE_WRITE;
+ } else if (node->atom != NULL) {
+ cap_mode = TXN_CAPTURE_WRITE;
+ } else if (0 && /* txnh->mode == TXN_READ_FUSING && */
+ jnode_get_level(node) == LEAF_LEVEL) {
+ /* NOTE-NIKITA TXN_READ_FUSING is not currently used */
+ /* We only need a READ_FUSING capture at the leaf level. This
+ is because the internal levels of the tree (twigs included)
+		   are redundant from the point of view of the user who asked
+		   for a read-fusing transcrash. The user only wants to read-fuse
+ atoms due to reading uncommitted data that another user has
+ written. It is the file system that reads/writes the
+ internal tree levels, the user only reads/writes leaves. */
+ cap_mode = TXN_CAPTURE_READ_ATOMIC;
+ } else {
+ /* In this case (read lock at a non-leaf) there's no reason to
+ * capture. */
+ /* cap_mode = TXN_CAPTURE_READ_NONCOM; */
+ return 0;
+ }
+
+ cap_mode |= (flags & (TXN_CAPTURE_NONBLOCKING | TXN_CAPTURE_DONT_FUSE));
+ assert("nikita-3186", cap_mode != 0);
+ return cap_mode;
+}
+
+/* This is an external interface to try_capture_block(), it calls
+ try_capture_block() repeatedly as long as -E_REPEAT is returned.
+
+ @node: node to capture,
+ @lock_mode: read or write lock is used in capture mode calculation,
+ @flags: see txn_capture flags enumeration,
+ @can_coc : can copy-on-capture
+
+ @return: 0 - node was successfully captured, -E_REPEAT - capture request
+ cannot be processed immediately as it was requested in flags,
+ < 0 - other errors.
+*/
+int reiser4_try_capture(jnode *node, znode_lock_mode lock_mode,
+ txn_capture flags)
+{
+ txn_atom *atom_alloc = NULL;
+ txn_capture cap_mode;
+ txn_handle *txnh = get_current_context()->trans;
+ int ret;
+
+ assert_spin_locked(&(node->guard));
+
+ repeat:
+ if (JF_ISSET(node, JNODE_IS_DYING))
+ return RETERR(-EINVAL);
+ if (node->atom != NULL && txnh->atom == node->atom)
+ return 0;
+ cap_mode = build_capture_mode(node, lock_mode, flags);
+ if (cap_mode == 0 ||
+ (!(cap_mode & TXN_CAPTURE_WTYPES) && node->atom == NULL)) {
+ /* Mark this node as "MISSED". It helps in further deadlock
+ * analysis */
+ if (jnode_is_znode(node))
+ JF_SET(node, JNODE_MISSED_IN_CAPTURE);
+ return 0;
+ }
+ /* Repeat try_capture as long as -E_REPEAT is returned. */
+ ret = try_capture_block(txnh, node, cap_mode, &atom_alloc);
+ /* Regardless of non_blocking:
+
+ If ret == 0 then jnode is still locked.
+ If ret != 0 then jnode is unlocked.
+ */
+#if REISER4_DEBUG
+ if (ret == 0)
+ assert_spin_locked(&(node->guard));
+ else
+ assert_spin_not_locked(&(node->guard));
+#endif
+ assert_spin_not_locked(&(txnh->guard));
+
+ if (ret == -E_REPEAT) {
+ /* E_REPEAT implies all locks were released, therefore we need
+ to take the jnode's lock again. */
+ spin_lock_jnode(node);
+
+ /* Although this may appear to be a busy loop, it is not.
+ There are several conditions that cause E_REPEAT to be
+ returned by the call to try_capture_block, all cases
+ indicating some kind of state change that means you should
+ retry the request and will get a different result. In some
+ cases this could be avoided with some extra code, but
+ generally it is done because the necessary locks were
+ released as a result of the operation and repeating is the
+ simplest thing to do (less bug potential). The cases are:
+ atom fusion returns E_REPEAT after it completes (jnode and
+ txnh were unlocked); race conditions in assign_block,
+ assign_txnh, and init_fusion return E_REPEAT (trylock
+ failure); after going to sleep in capture_fuse_wait
+ (request was blocked but may now succeed). I'm not quite
+ sure how capture_copy works yet, but it may also return
+ E_REPEAT. When the request is legitimately blocked, the
+ requestor goes to sleep in fuse_wait, so this is not a busy
+ loop. */
+ /* NOTE-NIKITA: still don't understand:
+
+ try_capture_block->capture_assign_txnh->spin_trylock_atom->E_REPEAT
+
+ looks like busy loop?
+ */
+ goto repeat;
+ }
+ /*
+ * free extra atom object that was possibly allocated by
+ * try_capture_block().
+ *
+ * Do this before acquiring jnode spin lock to
+ * minimize time spent under lock. --nikita
+ */
+ if (atom_alloc != NULL)
+ __free_atom(atom_alloc);
+ if (ret != 0) {
+ if (ret == -E_BLOCK) {
+ assert("nikita-3360",
+ cap_mode & TXN_CAPTURE_NONBLOCKING);
+ ret = -E_REPEAT;
+ }
+
+ /* Failure means jnode is not locked. FIXME_LATER_JMACD May
+ want to fix the above code to avoid releasing the lock and
+		   re-acquiring it, but there are cases where failure occurs
+ when the lock is not held, and those cases would need to be
+ modified to re-take the lock. */
+ spin_lock_jnode(node);
+ }
+
+ /* Jnode is still locked. */
+ assert_spin_locked(&(node->guard));
+ return ret;
+}
+
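+/* Drop the extra reference taken on each of two locked atoms and release
+   their spinlocks. Used to back out when a two-atom operation (such as
+   fusion) detects a race and must be retried. */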
+static void release_two_atoms(txn_atom *one, txn_atom *two)
+{
+ spin_unlock_atom(one);
+ atom_dec_and_unlock(two);
+ spin_lock_atom(one);
+ atom_dec_and_unlock(one);
+}
+
+/* This function sets up a call to try_capture_block and repeats as long as -E_REPEAT is
+ returned by that routine. The txn_capture request mode is computed here depending on
+ the transaction handle's type and the lock request. This is called from the depths of
+ the lock manager with the jnode lock held and it always returns with the jnode lock
+ held.
+*/
+
+/* fuse all 'active' atoms of lock owners of given node. */
+static void fuse_not_fused_lock_owners(txn_handle * txnh, znode * node)
+{
+ lock_handle *lh;
+ int repeat;
+ txn_atom *atomh, *atomf;
+ reiser4_context *me = get_current_context();
+ reiser4_context *ctx = NULL;
+
+ assert_spin_not_locked(&(ZJNODE(node)->guard));
+ assert_spin_not_locked(&(txnh->hlock));
+
+ repeat:
+ repeat = 0;
+ atomh = txnh_get_atom(txnh);
+ spin_unlock_txnh(txnh);
+ assert("zam-692", atomh != NULL);
+
+ spin_lock_zlock(&node->lock);
+ /* inspect list of lock owners */
+ list_for_each_entry(lh, &node->lock.owners, owners_link) {
+ ctx = get_context_by_lock_stack(lh->owner);
+ if (ctx == me)
+ continue;
+		/* below we use two assumptions to avoid additional spin-locks
+		   when checking the condition:
+
+		   1) if the lock stack holds a lock, the transaction must be
+		   open, i.e. ctx->trans != NULL;
+
+		   2) reading the well-aligned ctx->trans->atom is atomic; if it
+		   equals the address of the spin-locked atomh, we take it that
+		   the atoms are the same and nothing has to be captured. */
+ if (atomh != ctx->trans->atom) {
+ reiser4_wake_up(lh->owner);
+ repeat = 1;
+ break;
+ }
+ }
+ if (repeat) {
+ if (!spin_trylock_txnh(ctx->trans)) {
+ spin_unlock_zlock(&node->lock);
+ spin_unlock_atom(atomh);
+ goto repeat;
+ }
+ atomf = ctx->trans->atom;
+ if (atomf == NULL) {
+ capture_assign_txnh_nolock(atomh, ctx->trans);
+ /* release zlock lock _after_ assigning the atom to the
+ * transaction handle, otherwise the lock owner thread
+ * may unlock all znodes, exit kernel context and here
+ * we would access an invalid transaction handle. */
+ spin_unlock_zlock(&node->lock);
+ spin_unlock_atom(atomh);
+ spin_unlock_txnh(ctx->trans);
+ goto repeat;
+ }
+ assert("zam-1059", atomf != atomh);
+ spin_unlock_zlock(&node->lock);
+ atomic_inc(&atomh->refcount);
+ atomic_inc(&atomf->refcount);
+ spin_unlock_txnh(ctx->trans);
+ if (atomf > atomh) {
+ spin_lock_atom_nested(atomf);
+ } else {
+ spin_unlock_atom(atomh);
+ spin_lock_atom(atomf);
+ spin_lock_atom_nested(atomh);
+ }
+ if (atomh == atomf || !atom_isopen(atomh) || !atom_isopen(atomf)) {
+ release_two_atoms(atomf, atomh);
+ goto repeat;
+ }
+ atomic_dec(&atomh->refcount);
+ atomic_dec(&atomf->refcount);
+ capture_fuse_into(atomf, atomh);
+ goto repeat;
+ }
+ spin_unlock_zlock(&node->lock);
+ spin_unlock_atom(atomh);
+}
+
+/* This is the interface to capture unformatted nodes via their struct page
+ reference. Currently it is only used in reiser4_invalidatepage */
+int try_capture_page_to_invalidate(struct page *pg)
+{
+ int ret;
+ jnode *node;
+
+ assert("umka-292", pg != NULL);
+ assert("nikita-2597", PageLocked(pg));
+
+ node = jnode_by_page(pg);
+ BUG_ON(node == NULL);
+ jref(node);
+
+ spin_lock_jnode(node);
+ unlock_page(pg);
+
+ ret = reiser4_try_capture(node, ZNODE_WRITE_LOCK, 0);
+ spin_unlock_jnode(node);
+ jput(node);
+ lock_page(pg);
+ return ret;
+}
+
+/* This informs the transaction manager when a node is deleted. Add the block to the
+ atom's delete set and uncapture the block.
+
+VS-FIXME-HANS: this E_REPEAT paradigm clutters the code and creates a need for
+explanations. find all the functions that use it, and unless there is some very
+good reason to use it (I have not noticed one so far and I doubt it exists, but maybe somewhere somehow....),
+move the loop to inside the function.
+
+VS-FIXME-HANS: can this code be at all streamlined? In particular, can you lock and unlock the jnode fewer times?
+ */
+void reiser4_uncapture_page(struct page *pg)
+{
+ jnode *node;
+ txn_atom *atom;
+
+ assert("umka-199", pg != NULL);
+ assert("nikita-3155", PageLocked(pg));
+
+ clear_page_dirty_for_io(pg);
+
+ reiser4_wait_page_writeback(pg);
+
+ node = jprivate(pg);
+ BUG_ON(node == NULL);
+
+ spin_lock_jnode(node);
+
+ atom = jnode_get_atom(node);
+ if (atom == NULL) {
+ assert("jmacd-7111", !JF_ISSET(node, JNODE_DIRTY));
+ spin_unlock_jnode(node);
+ return;
+ }
+
+	/* We can remove a jnode from the transaction even if it is on a flush
+	 * queue's prepped list; we only need to be sure that the flush queue
+	 * is not being written by reiser4_write_fq().  reiser4_write_fq() does
+	 * not use the atom spin lock to protect the prepped nodes list;
+	 * instead it increments the atom's nr_running_queues counter for the
+	 * time the prepped list is not protected by the spin lock.  Here we
+	 * check this counter when we want to remove a jnode from a flush queue
+	 * and, if the counter is not zero, wait for all reiser4_write_fq()
+	 * calls for this atom to complete. This is not significant overhead. */
+ while (JF_ISSET(node, JNODE_FLUSH_QUEUED) && atom->nr_running_queues) {
+ spin_unlock_jnode(node);
+ /*
+ * at this moment we want to wait for "atom event", viz. wait
+ * until @node can be removed from flush queue. But
+ * reiser4_atom_wait_event() cannot be called with page locked,
+ * because it deadlocks with jnode_extent_write(). Unlock page,
+ * after making sure (through get_page()) that it cannot
+ * be released from memory.
+ */
+ get_page(pg);
+ unlock_page(pg);
+ reiser4_atom_wait_event(atom);
+ lock_page(pg);
+ /*
+		 * page may have been detached by ->writepage()->releasepage().
+ */
+ reiser4_wait_page_writeback(pg);
+ spin_lock_jnode(node);
+ put_page(pg);
+ atom = jnode_get_atom(node);
+/* VS-FIXME-HANS: improve the commenting in this function */
+ if (atom == NULL) {
+ spin_unlock_jnode(node);
+ return;
+ }
+ }
+ reiser4_uncapture_block(node);
+ spin_unlock_atom(atom);
+ jput(node);
+}
+
+/**
+ * This is used in extent's kill hook to uncapture and unhash jnodes
+ * attached to inode's tree of jnodes.
+ * Besides, this is used to release resources (except detaching jnode's
+ * page) during data migration caused by operations on logical volumes.
+ */
+void reiser4_uncapture_jnode(jnode *node)
+{
+ txn_atom *atom;
+
+ assert_spin_locked(&(node->guard));
+
+ atom = jnode_get_atom(node);
+ if (atom == NULL) {
+ assert("jmacd-7111", !JF_ISSET(node, JNODE_DIRTY));
+ spin_unlock_jnode(node);
+ return;
+ }
+
+ reiser4_uncapture_block(node);
+ spin_unlock_atom(atom);
+ jput(node);
+}
+
+/* No-locking version of assign_txnh. Sets the transaction handle's atom pointer,
+ increases atom refcount and txnh_count, adds to txnh_list. */
+static void capture_assign_txnh_nolock(txn_atom *atom, txn_handle *txnh)
+{
+ assert("umka-200", atom != NULL);
+ assert("umka-201", txnh != NULL);
+
+ assert_spin_locked(&(txnh->hlock));
+ assert_spin_locked(&(atom->alock));
+ assert("jmacd-824", txnh->atom == NULL);
+ assert("nikita-3540", atom_isopen(atom));
+ BUG_ON(txnh->atom != NULL);
+
+ atomic_inc(&atom->refcount);
+ txnh->atom = atom;
+ reiser4_ctx_gfp_mask_set();
+ list_add_tail(&txnh->txnh_link, &atom->txnh_list);
+ atom->txnh_count += 1;
+}
+
+/* No-locking version of assign_block. Sets the block's atom pointer, references the
+ block, adds it to the clean or dirty capture_jnode list, increments capture_count. */
+static void capture_assign_block_nolock(txn_atom *atom, jnode *node)
+{
+ assert("umka-202", atom != NULL);
+ assert("umka-203", node != NULL);
+ assert_spin_locked(&(node->guard));
+ assert_spin_locked(&(atom->alock));
+ assert("jmacd-323", node->atom == NULL);
+ BUG_ON(!list_empty_careful(&node->capture_link));
+ assert("nikita-3470", !JF_ISSET(node, JNODE_DIRTY));
+
+ /* Pointer from jnode to atom is not counted in atom->refcount. */
+ node->atom = atom;
+
+ list_add_tail(&node->capture_link, ATOM_CLEAN_LIST(atom));
+ atom->capture_count += 1;
+ /* reference to jnode is acquired by atom. */
+ jref(node);
+
+ ON_DEBUG(count_jnode(atom, node, NOT_CAPTURED, CLEAN_LIST, 1));
+
+ LOCK_CNT_INC(t_refs);
+}
+
+/**
+ * Common code for dirtying both unformatted jnodes and formatted znodes.
+ * Pre-condition: atom brick header should be already allocated.
+ */
+static void do_jnode_make_dirty(jnode *node, txn_atom *atom)
+{
+ assert_spin_locked(&(node->guard));
+ assert_spin_locked(&(atom->alock));
+ assert("jmacd-3981", !JF_ISSET(node, JNODE_DIRTY));
+ assert("edward-2226", node->subvol != NULL);
+ assert("edward-2017",
+ find_atom_brick_info(&atom->bricks_info,
+ jnode_get_subvol(node)->id) != NULL);
+
+ JF_SET(node, JNODE_DIRTY);
+
+ if (!JF_ISSET(node, JNODE_CLUSTER_PAGE))
+ get_current_context()->nr_marked_dirty++;
+
+	/* We grab2flush_reserve one additional block only if the node was
+	   not CREATED and jnode_flush did not sort it into either the
+	   relocate set or the overwrite set. If the node is in the overwrite
+	   or relocate set, we assume that the atom's flush reserved counter
+	   was already adjusted. */
+ if (!JF_ISSET(node, JNODE_CREATED) && !JF_ISSET(node, JNODE_RELOC)
+ && !JF_ISSET(node, JNODE_OVRWR) && jnode_is_leaf(node)
+ && !jnode_is_cluster_page(node)) {
+ assert("vs-1093", !reiser4_blocknr_is_fake(&node->blocknr));
+ assert("vs-1506", *jnode_get_block(node) != 0);
+ /*
+ * this will make a record to the atom brick header
+ */
+ grabbed2flush_reserved_nolock(atom, (__u64) 1,
+ jnode_get_subvol(node));
+ JF_SET(node, JNODE_FLUSH_RESERVED);
+ }
+
+ if (!JF_ISSET(node, JNODE_FLUSH_QUEUED)) {
+		/* Sometimes a node is set dirty before being captured -- the case for
+		   new jnodes. In that case the jnode will be added to the appropriate
+		   list in capture_assign_block_nolock. Another reason not to re-link
+		   the jnode here is that it is on a flush queue (see flush.c for
+		   details) */
+
+ int level = jnode_get_level(node);
+
+ assert("nikita-3152", !JF_ISSET(node, JNODE_OVRWR));
+ assert("zam-654", atom->stage < ASTAGE_PRE_COMMIT);
+ assert("nikita-2607", 0 <= level);
+ assert("nikita-2606", level <= REAL_MAX_ZTREE_HEIGHT);
+
+ /* move node to atom's dirty list */
+ list_move_tail(&node->capture_link, ATOM_DIRTY_LIST(atom, level));
+ ON_DEBUG(count_jnode
+ (atom, node, NODE_LIST(node), DIRTY_LIST, 1));
+ }
+}
+
+/* Set the dirty status for this (spin locked) jnode. */
+void jnode_make_dirty_locked(jnode * node)
+{
+ assert("umka-204", node != NULL);
+ assert("edward-2227", node->subvol != NULL);
+ assert_spin_locked(&(node->guard));
+
+ if (REISER4_DEBUG && rofs_jnode(node)) {
+ warning("nikita-3365", "Dirtying jnode on rofs");
+ dump_stack();
+ }
+
+ /* Fast check for already dirty node */
+ if (!JF_ISSET(node, JNODE_DIRTY)) {
+ txn_atom *atom;
+
+ atom = jnode_get_atom(node);
+ assert("vs-1094", atom);
+ /* Check jnode dirty status again because node spin lock might
+ * be released inside jnode_get_atom(). */
+ if (likely(!JF_ISSET(node, JNODE_DIRTY)))
+ do_jnode_make_dirty(node, atom);
+ spin_unlock_atom(atom);
+ }
+}
+
+/* Set the dirty status for this znode. */
+void znode_make_dirty(znode * z)
+{
+ jnode *node;
+ struct page *page;
+
+ assert("umka-204", z != NULL);
+ assert("nikita-3290", znode_above_root(z) || znode_is_loaded(z));
+ assert("nikita-3560", znode_is_write_locked(z));
+
+ node = ZJNODE(z);
+ /* znode is longterm locked, we can check dirty bit without spinlock */
+ if (JF_ISSET(node, JNODE_DIRTY)) {
+ /* znode is dirty already. All we have to do is to change znode version */
+ z->version = znode_build_version(znode_get_tree(z));
+ return;
+ }
+
+ spin_lock_jnode(node);
+ jnode_make_dirty_locked(node);
+ page = jnode_page(node);
+ if (page != NULL) {
+ /* this is useful assertion (allows one to check that no
+ * modifications are lost due to update of in-flight page),
+ * but it requires locking on page to check PG_writeback
+ * bit. */
+ /* assert("nikita-3292",
+ !PageWriteback(page) || ZF_ISSET(z, JNODE_WRITEBACK)); */
+ get_page(page);
+
+ /* jnode lock is not needed for the rest of
+ * znode_set_dirty(). */
+ spin_unlock_jnode(node);
+ /* reiser4 file write code calls set_page_dirty for
+ * unformatted nodes, for formatted nodes we do it here. */
+ set_page_dirty_notag(page);
+ put_page(page);
+ /* bump version counter in znode */
+ z->version = znode_build_version(znode_get_tree(z));
+ } else {
+ assert("zam-596", znode_above_root(JZNODE(node)));
+ spin_unlock_jnode(node);
+ }
+
+ assert("nikita-1900", znode_is_write_locked(z));
+ assert("jmacd-9777", node->atom != NULL);
+}
+
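+/* Force commit of @atom on behalf of the current transaction handle: if the
+   atom has not reached ASTAGE_PRE_COMMIT, attach the handle to it and force a
+   commit; if the atom is already committing, wait for an atom event and ask
+   the caller to repeat. */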
+int reiser4_sync_atom(txn_atom *atom)
+{
+ int result;
+ txn_handle *txnh;
+
+ txnh = get_current_context()->trans;
+
+ result = 0;
+ if (atom != NULL) {
+ if (atom->stage < ASTAGE_PRE_COMMIT) {
+ spin_lock_txnh(txnh);
+ capture_assign_txnh_nolock(atom, txnh);
+ result = force_commit_atom(txnh);
+ } else if (atom->stage < ASTAGE_POST_COMMIT) {
+ /* wait atom commit */
+ reiser4_atom_wait_event(atom);
+ /* try once more */
+ result = RETERR(-E_REPEAT);
+ } else
+ spin_unlock_atom(atom);
+ }
+ return result;
+}
+
+#if REISER4_DEBUG
+
+/* move a jnode from one list to another;
+   call this after atom->capture_count is updated */
+void
+count_jnode(txn_atom * atom, jnode * node, atom_list old_list,
+ atom_list new_list, int check_lists)
+{
+ struct list_head *pos;
+
+ assert("zam-1018", atom_is_protected(atom));
+ assert_spin_locked(&(node->guard));
+ assert("", NODE_LIST(node) == old_list);
+
+ switch (NODE_LIST(node)) {
+ case NOT_CAPTURED:
+ break;
+ case DIRTY_LIST:
+ assert("", atom->dirty > 0);
+ atom->dirty--;
+ break;
+ case CLEAN_LIST:
+ assert("", atom->clean > 0);
+ atom->clean--;
+ break;
+ case FQ_LIST:
+ assert("", atom->fq > 0);
+ atom->fq--;
+ break;
+ case WB_LIST:
+ assert("", atom->wb > 0);
+ atom->wb--;
+ break;
+ case OVRWR_LIST:
+ assert("", atom->ovrwr > 0);
+ atom->ovrwr--;
+ break;
+ default:
+ impossible("", "");
+ }
+
+ switch (new_list) {
+ case NOT_CAPTURED:
+ break;
+ case DIRTY_LIST:
+ atom->dirty++;
+ break;
+ case CLEAN_LIST:
+ atom->clean++;
+ break;
+ case FQ_LIST:
+ atom->fq++;
+ break;
+ case WB_LIST:
+ atom->wb++;
+ break;
+ case OVRWR_LIST:
+ atom->ovrwr++;
+ break;
+ default:
+ impossible("", "");
+ }
+ ASSIGN_NODE_LIST(node, new_list);
+ if (0 && check_lists) {
+ int count;
+ tree_level level;
+
+ count = 0;
+
+ /* flush queue list */
+ /* reiser4_check_fq(atom); */
+
+ /* dirty list */
+ count = 0;
+ for (level = 0; level < REAL_MAX_ZTREE_HEIGHT + 1; level += 1) {
+ list_for_each(pos, ATOM_DIRTY_LIST(atom, level))
+ count++;
+ }
+ if (count != atom->dirty)
+ warning("", "dirty counter %d, real %d\n", atom->dirty,
+ count);
+
+ /* clean list */
+ count = 0;
+ list_for_each(pos, ATOM_CLEAN_LIST(atom))
+ count++;
+ if (count != atom->clean)
+ warning("", "clean counter %d, real %d\n", atom->clean,
+ count);
+
+ /* wb list */
+ count = 0;
+ list_for_each(pos, ATOM_WB_LIST(atom))
+ count++;
+ if (count != atom->wb)
+ warning("", "wb counter %d, real %d\n", atom->wb,
+ count);
+
+ /* overwrite list */
+ count = 0;
+ list_for_each(pos, ATOM_OVRWR_LIST(atom))
+ count++;
+
+ if (count != atom->ovrwr)
+ warning("", "ovrwr counter %d, real %d\n", atom->ovrwr,
+ count);
+ }
+ assert("vs-1624", atom->num_queued == atom->fq);
+ if (atom->capture_count !=
+ atom->dirty + atom->clean + atom->ovrwr + atom->wb + atom->fq) {
+ printk
+ ("count %d, dirty %d clean %d ovrwr %d wb %d fq %d\n",
+ atom->capture_count, atom->dirty, atom->clean, atom->ovrwr,
+ atom->wb, atom->fq);
+ assert("vs-1622",
+ atom->capture_count ==
+ atom->dirty + atom->clean + atom->ovrwr + atom->wb +
+ atom->fq);
+ }
+}
+
+#endif
+
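+/* Make the superblock update of brick @subv part of the current transaction:
+   grab one reserved block, write-lock the uber znode of @subv's tree, make
+   sure the atom has a brick info record for @subv, and dirty the uber znode. */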
+int capture_brick_super(reiser4_subvol *subv)
+{
+ int result;
+ /*
+ * Grab space for a superblock copy update
+ */
+ result = reiser4_grab_space_force((__u64)1, BA_RESERVED, subv);
+ if (result != 0)
+ return result;
+ {
+ znode *uber;
+ lock_handle lh;
+ struct atom_brick_info *abi;
+
+ init_lh(&lh);
+ result = get_uber_znode(&subv->tree,
+ ZNODE_WRITE_LOCK, ZNODE_LOCK_LOPRI,
+ &lh);
+ if (result)
+ return result;
+
+ result = check_insert_atom_brick_info(subv->id, &abi);
+ if (result) {
+ done_lh(&lh);
+ return result;
+ }
+ uber = lh.node;
+ znode_make_dirty(uber);
+ done_lh(&lh);
+ }
+ return 0;
+}
+
+/**
+ * Wakeup every handle on the atom's WAITFOR list
+ */
+static void wakeup_atom_waitfor_list(txn_atom * atom)
+{
+ txn_wait_links *wlinks;
+
+ assert("umka-210", atom != NULL);
+
+ /* atom is locked */
+ list_for_each_entry(wlinks, &atom->fwaitfor_list, _fwaitfor_link) {
+ if (wlinks->waitfor_cb == NULL ||
+ wlinks->waitfor_cb(atom, wlinks))
+ /* Wake up. */
+ reiser4_wake_up(wlinks->_lock_stack);
+ }
+}
+
+/*
+ * Wakeup every handle on the atom's WAITING list
+ */
+static void wakeup_atom_waiting_list(txn_atom * atom)
+{
+ txn_wait_links *wlinks;
+
+ assert("umka-211", atom != NULL);
+
+ /* atom is locked */
+ list_for_each_entry(wlinks, &atom->fwaiting_list, _fwaiting_link) {
+ if (wlinks->waiting_cb == NULL ||
+ wlinks->waiting_cb(atom, wlinks))
+ /* Wake up. */
+ reiser4_wake_up(wlinks->_lock_stack);
+ }
+}
+
+/**
+ * helper function used by capture_fuse_wait() to avoid "spurious wake-ups"
+ */
+static int wait_for_fusion(txn_atom * atom, txn_wait_links * wlinks)
+{
+ assert("nikita-3330", atom != NULL);
+ assert_spin_locked(&(atom->alock));
+
+ /* atom->txnh_count == 1 is for waking waiters up if we are releasing
+ * last transaction handle. */
+ return atom->stage != ASTAGE_CAPTURE_WAIT || atom->txnh_count == 1;
+}
+
+/* The general purpose of this function is to wait on the first of two possible events.
+ The situation is that a handle (and its atom atomh) is blocked trying to capture a
+ block (i.e., node) but the node's atom (atomf) is in the CAPTURE_WAIT state. The
+ handle's atom (atomh) is not in the CAPTURE_WAIT state. However, atomh could fuse with
+ another atom or, due to age, enter the CAPTURE_WAIT state itself, at which point it
+ needs to unblock the handle to avoid deadlock. When the txnh is unblocked it will
+ proceed and fuse the two atoms in the CAPTURE_WAIT state.
+
+ In other words, if either atomh or atomf change state, the handle will be awakened,
+ thus there are two lists per atom: WAITING and WAITFOR.
+
+   This is also called by capture_assign_txnh with (atomh == NULL) to wait for atomf to
+   close when the handle is not yet assigned to an atom of its own.
+
+ Lock ordering in this method: all four locks are held: JNODE_LOCK, TXNH_LOCK,
+ BOTH_ATOM_LOCKS. Result: all four locks are released.
+*/
+static int capture_fuse_wait(txn_handle * txnh, txn_atom * atomf,
+ txn_atom * atomh, txn_capture mode)
+{
+ int ret;
+ txn_wait_links wlinks;
+
+ assert("umka-213", txnh != NULL);
+ assert("umka-214", atomf != NULL);
+
+ if ((mode & TXN_CAPTURE_NONBLOCKING) != 0) {
+ spin_unlock_txnh(txnh);
+ spin_unlock_atom(atomf);
+
+ if (atomh) {
+ spin_unlock_atom(atomh);
+ }
+
+ return RETERR(-E_BLOCK);
+ }
+
+ /* Initialize the waiting list links. */
+ init_wlinks(&wlinks);
+
+ /* Add txnh to atomf's waitfor list, unlock atomf. */
+ list_add_tail(&wlinks._fwaitfor_link, &atomf->fwaitfor_list);
+ wlinks.waitfor_cb = wait_for_fusion;
+ atomic_inc(&atomf->refcount);
+ spin_unlock_atom(atomf);
+
+ if (atomh) {
+ /* Add txnh to atomh's waiting list, unlock atomh. */
+ list_add_tail(&wlinks._fwaiting_link, &atomh->fwaiting_list);
+ atomic_inc(&atomh->refcount);
+ spin_unlock_atom(atomh);
+ }
+
+ /* Go to sleep. */
+ spin_unlock_txnh(txnh);
+
+ ret = reiser4_prepare_to_sleep(wlinks._lock_stack);
+ if (ret == 0) {
+ reiser4_go_to_sleep(wlinks._lock_stack);
+ ret = RETERR(-E_REPEAT);
+ }
+
+ /* Remove from the waitfor list. */
+ spin_lock_atom(atomf);
+
+ list_del(&wlinks._fwaitfor_link);
+ atom_dec_and_unlock(atomf);
+
+ if (atomh) {
+ /* Remove from the waiting list. */
+ spin_lock_atom(atomh);
+ list_del(&wlinks._fwaiting_link);
+ atom_dec_and_unlock(atomh);
+ }
+ return ret;
+}
+
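+/* Take the spinlocks of two distinct atoms in address order, so that all
+   callers use the same lock ordering and ABBA deadlocks between atom locks
+   are avoided. */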
+static void lock_two_atoms(txn_atom * one, txn_atom * two)
+{
+ assert("zam-1067", one != two);
+
+ /* lock the atom with lesser address first */
+ if (one < two) {
+ spin_lock_atom(one);
+ spin_lock_atom_nested(two);
+ } else {
+ spin_lock_atom(two);
+ spin_lock_atom_nested(one);
+ }
+}
+
+/* Perform the necessary work to prepare for fusing two atoms, which involves
+ * acquiring two atom locks in the proper order.  If the node's atom is
+ * blocking fusion (i.e., it is in the CAPTURE_WAIT stage) and the handle's
+ * atom is not, then the handle's request is put to sleep.  If the node's atom
+ * is committing, then the node can be copy-on-captured.  Otherwise, fuse the
+ * atom with fewer pointers into the atom with more pointers and
+ * call capture_fuse_into.
+ */
+static int capture_init_fusion(jnode *node, txn_handle *txnh, txn_capture mode)
+{
+ txn_atom * txnh_atom = txnh->atom;
+ txn_atom * block_atom = node->atom;
+
+ atomic_inc(&txnh_atom->refcount);
+ atomic_inc(&block_atom->refcount);
+
+ spin_unlock_txnh(txnh);
+ spin_unlock_jnode(node);
+
+ lock_two_atoms(txnh_atom, block_atom);
+
+ if (txnh->atom != txnh_atom || node->atom != block_atom ) {
+ release_two_atoms(txnh_atom, block_atom);
+ return RETERR(-E_REPEAT);
+ }
+
+ atomic_dec(&txnh_atom->refcount);
+ atomic_dec(&block_atom->refcount);
+
+ assert ("zam-1066", atom_isopen(txnh_atom));
+
+ if (txnh_atom->stage >= block_atom->stage ||
+ (block_atom->stage == ASTAGE_CAPTURE_WAIT && block_atom->txnh_count == 0)) {
+ capture_fuse_into(txnh_atom, block_atom);
+ return RETERR(-E_REPEAT);
+ }
+ spin_lock_txnh(txnh);
+ return capture_fuse_wait(txnh, block_atom, txnh_atom, mode);
+}
+
+/* This function splices together two jnode lists (small and large) and sets all jnodes in
+ the small list to point to the large atom. Returns the length of the list. */
+static int
+capture_fuse_jnode_lists(txn_atom *large, struct list_head *large_head,
+ struct list_head *small_head)
+{
+ int count = 0;
+ jnode *node;
+
+ assert("umka-218", large != NULL);
+ assert("umka-219", large_head != NULL);
+ assert("umka-220", small_head != NULL);
+ /* small atom should be locked also. */
+ assert_spin_locked(&(large->alock));
+
+ /* For every jnode on small's capture list... */
+ list_for_each_entry(node, small_head, capture_link) {
+ count += 1;
+
+ /* With the jnode lock held, update atom pointer. */
+ spin_lock_jnode(node);
+ node->atom = large;
+ spin_unlock_jnode(node);
+ }
+
+ /* Splice the lists. */
+ list_splice_init(small_head, large_head->prev);
+
+ return count;
+}
+
+/* This function splices together two txnh lists (small and large) and sets all txn handles in
+ the small list to point to the large atom. Returns the length of the list. */
+static int
+capture_fuse_txnh_lists(txn_atom *large, struct list_head *large_head,
+ struct list_head *small_head)
+{
+ int count = 0;
+ txn_handle *txnh;
+
+ assert("umka-221", large != NULL);
+ assert("umka-222", large_head != NULL);
+ assert("umka-223", small_head != NULL);
+
+ /* Adjust every txnh to the new atom. */
+ list_for_each_entry(txnh, small_head, txnh_link) {
+ count += 1;
+
+ /* With the txnh lock held, update atom pointer. */
+ spin_lock_txnh(txnh);
+ txnh->atom = large;
+ spin_unlock_txnh(txnh);
+ }
+
+ /* Splice the txn_handle list. */
+ list_splice_init(small_head, large_head->prev);
+
+ return count;
+}
+
+/* This function fuses two atoms. The captured nodes and handles belonging to SMALL are
+ added to LARGE and their ->atom pointers are all updated. The associated counts are
+ updated as well, and any waiting handles belonging to either are awakened. Finally the
+ smaller atom's refcount is decremented.
+*/
+static void capture_fuse_into(txn_atom * small, txn_atom * large)
+{
+ int level;
+ unsigned zcount = 0;
+ unsigned tcount = 0;
+
+ assert("umka-224", small != NULL);
+	assert("umka-225", large != NULL);
+
+ assert_spin_locked(&(large->alock));
+ assert_spin_locked(&(small->alock));
+
+ assert("jmacd-201", atom_isopen(small));
+ assert("jmacd-202", atom_isopen(large));
+
+ /* Splice and update the per-level dirty jnode lists */
+ for (level = 0; level < REAL_MAX_ZTREE_HEIGHT + 1; level += 1) {
+ zcount +=
+ capture_fuse_jnode_lists(large,
+ ATOM_DIRTY_LIST(large, level),
+ ATOM_DIRTY_LIST(small, level));
+ }
+
+	/* Splice and update the clean, overwrite, writeback and inode jnode lists and the txnh list */
+ zcount +=
+ capture_fuse_jnode_lists(large, ATOM_CLEAN_LIST(large),
+ ATOM_CLEAN_LIST(small));
+ zcount +=
+ capture_fuse_jnode_lists(large, ATOM_OVRWR_LIST(large),
+ ATOM_OVRWR_LIST(small));
+ zcount +=
+ capture_fuse_jnode_lists(large, ATOM_WB_LIST(large),
+ ATOM_WB_LIST(small));
+ zcount +=
+ capture_fuse_jnode_lists(large, &large->inodes, &small->inodes);
+ tcount +=
+ capture_fuse_txnh_lists(large, &large->txnh_list,
+ &small->txnh_list);
+
+ /* Check our accounting. */
+ assert("jmacd-1063",
+ zcount + small->num_queued == small->capture_count);
+ assert("jmacd-1065", tcount == small->txnh_count);
+
+	/* sum the numbers of waiting threads */
+ large->nr_waiters += small->nr_waiters;
+ small->nr_waiters = 0;
+
+ /* splice flush queues */
+ reiser4_fuse_fq(large, small);
+
+	/* update the jnode counters for each of the atom's lists */
+ ON_DEBUG(large->dirty += small->dirty;
+ small->dirty = 0;
+ large->clean += small->clean;
+ small->clean = 0;
+ large->ovrwr += small->ovrwr;
+ small->ovrwr = 0;
+ large->wb += small->wb;
+ small->wb = 0;
+ large->fq += small->fq;
+ small->fq = 0;);
+
+ /* count flushers in result atom */
+ large->nr_flushers += small->nr_flushers;
+ small->nr_flushers = 0;
+
+ /* update counts of flushed nodes */
+ large->flushed += small->flushed;
+ small->flushed = 0;
+
+ /* Transfer list counts to large. */
+ large->txnh_count += small->txnh_count;
+ large->capture_count += small->capture_count;
+
+ /* Add all txnh references to large. */
+ atomic_add(small->txnh_count, &large->refcount);
+ atomic_sub(small->txnh_count, &small->refcount);
+
+ /* Reset small counts */
+ small->txnh_count = 0;
+ small->capture_count = 0;
+
+ /* Assign the oldest start_time, merge flags. */
+ large->start_time = min(large->start_time, small->start_time);
+ large->flags |= small->flags;
+
+ /* Merge delete sets. */
+ atom_dset_merge(small, large);
+
+ /* Merge allocated/deleted file counts */
+ large->nr_objects_deleted += small->nr_objects_deleted;
+ large->nr_objects_created += small->nr_objects_created;
+
+ small->nr_objects_deleted = 0;
+ small->nr_objects_created = 0;
+
+ large->nr_running_queues += small->nr_running_queues;
+ small->nr_running_queues = 0;
+
+ fuse_abi(small, large);
+
+ if (large->stage < small->stage) {
+ /* Large only needs to notify if it has changed state. */
+ reiser4_atom_set_stage(large, small->stage);
+ wakeup_atom_waiting_list(large);
+ }
+ reiser4_atom_set_stage(small, ASTAGE_INVALID);
+
+ /* Notify any waiters--small needs to unload its wait lists. Waiters
+ actually remove themselves from the list before returning from the
+ fuse_wait function. */
+ wakeup_atom_waiting_list(small);
+
+ /* Unlock atoms */
+ spin_unlock_atom(large);
+ atom_dec_and_unlock(small);
+}
+
+/* TXNMGR STUFF */
+
+/* Release a block from the atom, reversing the effects of being captured;
+   do not release the atom's reference to the jnode because spin-locks are held.
+   Currently this is only called when the atom commits.
+
+   NOTE: this function does not release a (journal) reference to the jnode
+   due to locking optimizations; you should call jput() somewhere after
+   calling reiser4_uncapture_block(). */
+void reiser4_uncapture_block(jnode *node)
+{
+ txn_atom *atom;
+
+ assert("umka-226", node != NULL);
+ atom = node->atom;
+ assert("umka-228", atom != NULL);
+
+ assert("jmacd-1021", node->atom == atom);
+ assert_spin_locked(&(node->guard));
+ assert("jmacd-1023", atom_is_protected(atom));
+
+ JF_CLR(node, JNODE_DIRTY);
+ JF_CLR(node, JNODE_RELOC);
+ JF_CLR(node, JNODE_OVRWR);
+ JF_CLR(node, JNODE_CREATED);
+ JF_CLR(node, JNODE_WRITEBACK);
+ JF_CLR(node, JNODE_REPACK);
+
+ list_del_init(&node->capture_link);
+ if (JF_ISSET(node, JNODE_FLUSH_QUEUED)) {
+ assert("zam-925", atom_isopen(atom));
+ assert("vs-1623", NODE_LIST(node) == FQ_LIST);
+ ON_DEBUG(atom->num_queued--);
+ JF_CLR(node, JNODE_FLUSH_QUEUED);
+ }
+ atom->capture_count -= 1;
+ ON_DEBUG(count_jnode(atom, node, NODE_LIST(node), NOT_CAPTURED, 1));
+ node->atom = NULL;
+
+ spin_unlock_jnode(node);
+ LOCK_CNT_DEC(t_refs);
+}
+
+/*
+ * Unconditional insert of a jnode into the atom's overwrite list.
+ * Currently used in the bitmap-based allocator code for adding
+ * modified bitmap blocks to the transaction. @atom and @node
+ * are spin-locked.
+ */
+void insert_into_atom_ovrwr_list(txn_atom *atom, jnode *node)
+{
+ assert("zam-538", atom_is_protected(atom));
+ assert_spin_locked(&(node->guard));
+ assert("zam-899", JF_ISSET(node, JNODE_OVRWR));
+ assert("zam-543", node->atom == NULL);
+ assert("vs-1433", !jnode_is_unformatted(node) && !jnode_is_znode(node));
+
+ list_add(&node->capture_link, ATOM_OVRWR_LIST(atom));
+ jref(node);
+ node->atom = atom;
+ atom->capture_count++;
+ ON_DEBUG(count_jnode(atom, node, NODE_LIST(node), OVRWR_LIST, 1));
+}
+
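+/* Like insert_into_atom_ovrwr_list(), but adds the jnode to the subvolume's
+   overwrite set (subv->ch.overwrite_set) while accounting it in @atom. */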
+void insert_into_subv_ovrwr_list(reiser4_subvol *subv, jnode *node,
+ txn_atom *atom)
+{
+ assert("edward-1775", node->subvol == subv);
+
+ list_add(&node->capture_link, &subv->ch.overwrite_set);
+ jref(node);
+ node->atom = atom;
+ atom->capture_count++;
+ ON_DEBUG(count_jnode(atom, node, NODE_LIST(node), OVRWR_LIST, 1));
+}
+
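+/* Delete-set iteration actor: accumulates the number of deleted blocks in
+   *data. A NULL @b denotes a single block; otherwise @b is the length of
+   the extent starting at @a. */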
+static int count_deleted_blocks_actor(txn_atom *atom,
+ const reiser4_block_nr *a,
+ const reiser4_block_nr *b,
+ __u32 subvol_id, void *data)
+{
+ reiser4_block_nr *counter = data;
+
+ assert("zam-995", data != NULL);
+ assert("zam-996", a != NULL);
+
+ if (b == NULL)
+ *counter += 1;
+ else
+ *counter += *b;
+ return 0;
+}
+
+reiser4_block_nr txnmgr_count_deleted_blocks(void)
+{
+ reiser4_block_nr result;
+ txn_mgr *tmgr = &get_super_private(reiser4_get_current_sb())->tmgr;
+ txn_atom *atom;
+
+ result = 0;
+
+ spin_lock_txnmgr(tmgr);
+ list_for_each_entry(atom, &tmgr->atoms_list, atom_link) {
+ spin_lock_atom(atom);
+ if (atom_isopen(atom))
+ atom_dset_deferred_apply(atom, count_deleted_blocks_actor, &result, 0);
+ spin_unlock_atom(atom);
+ }
+ spin_unlock_txnmgr(tmgr);
+
+ return result;
+}
+
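+/* The helpers below operate on the atom's delete set. Two representations
+   exist (see the union in txn_atom): a blocknr_list used when discard is
+   enabled and a blocknr_set used otherwise. The list-based variant is
+   currently selected unconditionally; the set-based branches are kept for
+   reference. */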
+void atom_dset_init(txn_atom *atom)
+{
+ if (1) {
+ blocknr_list_init(&atom->discard.delete_set);
+ } else {
+ blocknr_set_init(&atom->nodiscard.delete_set);
+ }
+}
+
+void atom_dset_destroy(txn_atom *atom)
+{
+ if (1) {
+ blocknr_list_destroy(&atom->discard.delete_set);
+ } else {
+ blocknr_set_destroy(&atom->nodiscard.delete_set);
+ }
+}
+
+void atom_dset_merge(txn_atom *from, txn_atom *to)
+{
+ if (1) {
+ blocknr_list_merge(&from->discard.delete_set, &to->discard.delete_set);
+ } else {
+ blocknr_set_merge(&from->nodiscard.delete_set, &to->nodiscard.delete_set);
+ }
+}
+
+int atom_dset_deferred_apply(txn_atom* atom,
+ blocknr_set_actor_f actor,
+ void *data,
+ int delete)
+{
+ int ret;
+
+ if (1) {
+ ret = blocknr_list_iterator(atom,
+ &atom->discard.delete_set,
+ actor,
+ data,
+ delete);
+ }
+#if 0
+ else {
+ ret = blocknr_set_iterator(atom,
+ &atom->nodiscard.delete_set,
+ actor,
+ data,
+ delete);
+ }
+#endif
+ return ret;
+}
+
+extern int atom_dset_deferred_add_extent(txn_atom *atom,
+ void **new_entry,
+ const reiser4_block_nr *start,
+ const reiser4_block_nr *len,
+ __u32 subvol_id)
+{
+ int ret;
+
+ if (1) {
+ ret = blocknr_list_add_extent(atom,
+ &atom->discard.delete_set,
+ (blocknr_list_entry**)new_entry,
+ start,
+ len,
+ subvol_id);
+ }
+#if 0
+ else {
+ ret = blocknr_set_add_extent(atom,
+ &atom->nodiscard.delete_set,
+ (blocknr_set_entry**)new_entry,
+ start,
+ len,
+ subvol_id);
+ }
+#endif
+ return ret;
+}
+
+/*
+ * Local variables:
+ * c-indentation-style: "K&R"
+ * mode-name: "LC"
+ * c-basic-offset: 8
+ * tab-width: 8
+ * fill-column: 79
+ * End:
+ */
diff -urN --no-dereference linux-5.10.2.orig/fs/reiser4/txnmgr.h linux-5.10.2/fs/reiser4/txnmgr.h
--- linux-5.10.2.orig/fs/reiser4/txnmgr.h 1970-01-01 01:00:00.000000000 +0100
+++ linux-5.10.2/fs/reiser4/txnmgr.h 2020-12-23 16:07:46.136813392 +0100
@@ -0,0 +1,809 @@
+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
+ * reiser4/README */
+
+/* data-types and function declarations for transaction manager. See txnmgr.c
+ * for details. */
+
+#ifndef __REISER4_TXNMGR_H__
+#define __REISER4_TXNMGR_H__
+
+#include "forward.h"
+#include "dformat.h"
+
+#include <linux/fs.h>
+#include <linux/mm.h>
+#include <linux/types.h>
+#include <linux/spinlock.h>
+#include <asm/atomic.h>
+#include <linux/wait.h>
+
+/* TYPE DECLARATIONS */
+
+/* This enumeration describes the possible types of a capture request (reiser4_try_capture).
+ A capture request dynamically assigns a block to the calling thread's transaction
+ handle. */
+typedef enum {
+ /* A READ_ATOMIC request indicates that a block will be read and that the caller's
+ atom should fuse in order to ensure that the block commits atomically with the
+ caller. */
+ TXN_CAPTURE_READ_ATOMIC = (1 << 0),
+
+ /* A READ_NONCOM request indicates that a block will be read and that the caller is
+ willing to read a non-committed block without causing atoms to fuse. */
+ TXN_CAPTURE_READ_NONCOM = (1 << 1),
+
+ /* A READ_MODIFY request indicates that a block will be read but that the caller
+ wishes for the block to be captured as it will be written. This capture request
+ mode is not currently used, but eventually it will be useful for preventing
+ deadlock in read-modify-write cycles. */
+ TXN_CAPTURE_READ_MODIFY = (1 << 2),
+
+ /* A WRITE capture request indicates that a block will be modified and that atoms
+ should fuse to make the commit atomic. */
+ TXN_CAPTURE_WRITE = (1 << 3),
+
+ /* CAPTURE_TYPES is a mask of the four above capture types, used to separate the
+ exclusive type designation from extra bits that may be supplied -- see
+ below. */
+ TXN_CAPTURE_TYPES = (TXN_CAPTURE_READ_ATOMIC |
+ TXN_CAPTURE_READ_NONCOM | TXN_CAPTURE_READ_MODIFY |
+ TXN_CAPTURE_WRITE),
+
+ /* A subset of CAPTURE_TYPES, CAPTURE_WTYPES is a mask of request types that
+ indicate modification will occur. */
+ TXN_CAPTURE_WTYPES = (TXN_CAPTURE_READ_MODIFY | TXN_CAPTURE_WRITE),
+
+ /* An option to reiser4_try_capture, NONBLOCKING indicates that the caller would
+ prefer not to sleep waiting for an aging atom to commit. */
+ TXN_CAPTURE_NONBLOCKING = (1 << 4),
+
+ /* An option to reiser4_try_capture to prevent atom fusion, just simple
+ capturing is allowed */
+ TXN_CAPTURE_DONT_FUSE = (1 << 5)
+
+ /* This macro selects only the exclusive capture request types, stripping out any
+ options that were supplied (i.e., NONBLOCKING). */
+#define CAPTURE_TYPE(x) ((x) & TXN_CAPTURE_TYPES)
+} txn_capture;
+
+/* There are two kinds of transaction handle: WRITE_FUSING and READ_FUSING; the only
+   difference is in the handling of read requests.  A WRITE_FUSING transaction handle
+   defaults read capture requests to TXN_CAPTURE_READ_NONCOM whereas a READ_FUSING
+   transaction handle defaults to TXN_CAPTURE_READ_ATOMIC. */
+typedef enum {
+ TXN_WRITE_FUSING = (1 << 0),
+ TXN_READ_FUSING = (1 << 1) | TXN_WRITE_FUSING, /* READ implies WRITE */
+} txn_mode;
+
+/* Every atom has a stage, which is one of these exclusive values: */
+typedef enum {
+ /* Initially an atom is free. */
+ ASTAGE_FREE = 0,
+
+ /* An atom begins by entering the CAPTURE_FUSE stage, where it proceeds to capture
+ blocks and fuse with other atoms. */
+ ASTAGE_CAPTURE_FUSE = 1,
+
+	/* We need to have an ASTAGE_CAPTURE_SLOW in which an atom fuses with one node for every X nodes it flushes to disk, where X > 1. */
+
+ /* When an atom reaches a certain age it must do all it can to commit. An atom in
+ the CAPTURE_WAIT stage refuses new transaction handles and prevents fusion from
+ atoms in the CAPTURE_FUSE stage. */
+ ASTAGE_CAPTURE_WAIT = 2,
+
+ /* Waiting for I/O before commit. Copy-on-capture (see
+ http://namesys.com/v4/v4.html). */
+ ASTAGE_PRE_COMMIT = 3,
+
+ /* Post-commit overwrite I/O. Steal-on-capture. */
+ ASTAGE_POST_COMMIT = 4,
+
+	/* Atom which waits for the removal of the last reference to it before
+	 * being deleted from memory */
+ ASTAGE_DONE = 5,
+
+ /* invalid atom. */
+ ASTAGE_INVALID = 6,
+
+} txn_stage;
+
+/* Certain flags may be set in the txn_atom->flags field. */
+typedef enum {
+ /* Indicates that the atom should commit as soon as possible. */
+ ATOM_FORCE_COMMIT = (1 << 0),
+	/* to avoid an endless loop, mark the atom (which was considered too
+	 * small) after a failed attempt to fuse it. */
+ ATOM_CANCEL_FUSION = (1 << 1)
+} txn_flags;
+
+/* Flags for controlling commit_txnh */
+typedef enum {
+	/* Wait for atom commit completion in commit_txnh */
+ TXNH_WAIT_COMMIT = 0x2,
+ /* Don't commit atom when this handle is closed */
+ TXNH_DONT_COMMIT = 0x4
+} txn_handle_flags_t;
+
+/* TYPE DEFINITIONS */
+
+/* A note on lock ordering: the handle & jnode spinlock protects reading of their ->atom
+ fields, so typically an operation on the atom through either of these objects must (1)
+ lock the object, (2) read the atom pointer, (3) lock the atom.
+
+ During atom fusion, the process holds locks on both atoms at once. Then, it iterates
+ through the list of handles and pages held by the smaller of the two atoms. For each
+ handle and page referencing the smaller atom, the fusing process must: (1) lock the
+ object, and (2) update the atom pointer.
+
+ You can see that there is a conflict of lock ordering here, so the more-complex
+ procedure should have priority, i.e., the fusing process has priority so that it is
+ guaranteed to make progress and to avoid restarts.
+
+   This decision, however, means additional complexity for acquiring the atom lock in the
+ first place.
+
+ The general original procedure followed in the code was:
+
+ TXN_OBJECT *obj = ...;
+ TXN_ATOM *atom;
+
+ spin_lock (& obj->_lock);
+
+ atom = obj->_atom;
+
+ if (! spin_trylock_atom (atom))
+ {
+ spin_unlock (& obj->_lock);
+ RESTART OPERATION, THERE WAS A RACE;
+ }
+
+ ELSE YOU HAVE BOTH ATOM AND OBJ LOCKED
+
+   It has however been found that this wastes CPU a lot in a manner that is
+   hard to profile. So, proper refcounting was added to atoms, and the new
+   standard locking sequence is as follows:
+
+ TXN_OBJECT *obj = ...;
+ TXN_ATOM *atom;
+
+ spin_lock (& obj->_lock);
+
+ atom = obj->_atom;
+
+ if (! spin_trylock_atom (atom))
+ {
+ atomic_inc (& atom->refcount);
+ spin_unlock (& obj->_lock);
+ spin_lock (&atom->_lock);
+ atomic_dec (& atom->refcount);
+ // HERE atom is locked
+ spin_unlock (&atom->_lock);
+ RESTART OPERATION, THERE WAS A RACE;
+ }
+
+ ELSE YOU HAVE BOTH ATOM AND OBJ LOCKED
+
+ (core of this is implemented in trylock_throttle() function)
+
+ See the jnode_get_atom() function for a common case.
+
+ As an additional (and important) optimization that allows restarts to be
+ avoided, it is possible to re-check the required pre-conditions at the HERE
+ point in the code above and proceed without restarting if they are still
+ satisfied (see the illustrative sketch below).
+*/
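+
+/* For illustration only (not part of the original comment): a minimal sketch
+   of that optimization, under the assumption that the pre-condition to be
+   re-checked is simply "obj is still attached to the same atom". At the HERE
+   point in the sequence above, instead of unconditionally restarting:
+
+       // HERE atom is locked
+       spin_lock (& obj->_lock);
+
+       if (obj->_atom == atom)
+         PROCEED, BOTH OBJ AND ATOM ARE LOCKED, NO RESTART IS NEEDED;
+       else
+         UNLOCK BOTH, RESTART OPERATION, THERE WAS A RACE;
+*/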
+
+struct atom_brick_info {
+ struct rb_node node;
+ u32 brick_id; /* key */
+ reiser4_block_nr nr_blocks_allocated; /* number of blocks allocated
+ during the transaction */
+ reiser4_block_nr atom_flush_reserved; /* counter of blocks reserved
+ for flush and commit, see
+ reiser4 space reservation
+ scheme at block_alloc.c */
+};
+
+/* An atomic transaction: this is the underlying system representation
+ of a transaction, not the one seen by clients.
+
+ Invariants involving this data-type:
+
+ [sb-fake-allocated]
+*/
+struct txn_atom {
+ /* The spinlock protecting the atom, held during fusion and various other state
+ changes. */
+ spinlock_t alock;
+
+ /* The atom's reference counter. Incrementing it (when duplicating an
+ existing reference, or when we are sure that some other reference
+ exists) may be done without taking the spinlock; decrementing the
+ counter requires the spinlock to be held.
+
+ Each transaction handle counts in ->refcount. All jnodes count as
+ one reference acquired in atom_begin_andlock(), released in
+ commit_current_atom().
+ */
+ atomic_t refcount;
+
+ /* The atom_id identifies the atom in persistent records such as the log. */
+ __u32 atom_id;
+
+ /* Flags holding any of the txn_flags enumerated values (e.g.,
+ ATOM_FORCE_COMMIT). */
+ __u32 flags;
+
+ /* Number of open handles. */
+ __u32 txnh_count;
+ /* The number of znodes captured by this atom. Equal to the sum of lengths of the
+ dirty_nodes[level] and clean_nodes lists. */
+ __u32 capture_count;
+
+#if REISER4_DEBUG
+ int clean;
+ int dirty;
+ int ovrwr;
+ int wb;
+ int fq;
+ struct atom_brick_info *abi;
+ struct atom_brick_info *abi_found;
+#endif
+
+ __u32 flushed;
+
+ /* Current transaction stage. */
+ txn_stage stage;
+
+ /* Start time. */
+ unsigned long start_time;
+
+ /* The atom's delete sets.
+ "simple" are blocknr_set instances and are used when discard is disabled.
+ "discard" are blocknr_list instances and are used when discard is enabled. */
+ union {
+ struct {
+ /* The atom's delete set. It collects block numbers of the nodes
+ which were deleted during the transaction. */
+ struct list_head delete_set;
+ } nodiscard;
+
+ struct {
+ /* The atom's delete set. It collects all blocks that have been
+ deallocated (both immediate and deferred) during the transaction.
+ These blocks are considered for discarding at commit time.
+ For details see discard.c */
+ struct list_head delete_set;
+ } discard;
+ };
+
+ /* The transaction's lists of dirty captured nodes, one per tree level,
+ indexed by level. dirty_nodes[0] is for the znode-above-root */
+ struct list_head dirty_nodes[REAL_MAX_ZTREE_HEIGHT + 1];
+
+ /* The transaction's list of clean captured nodes. */
+ struct list_head clean_nodes;
+
+ /* The atom's overwrite set */
+ struct list_head ovrwr_nodes;
+
+ /* nodes which are being written to disk */
+ struct list_head writeback_nodes;
+
+ /* list of inodes */
+ struct list_head inodes;
+
+ /* List of handles associated with this atom. */
+ struct list_head txnh_list;
+
+ /* Transaction list link: list of atoms in the transaction manager. */
+ struct list_head atom_link;
+
+ /* List of handles waiting FOR this atom: see 'capture_fuse_wait' comment. */
+ struct list_head fwaitfor_list;
+
+ /* List of this atom's handles that are waiting: see 'capture_fuse_wait' comment. */
+ struct list_head fwaiting_list;
+
+ /* Numbers of objects which were deleted/created in this transaction,
+ and thereby the numbers of object IDs which were released/allocated. */
+ int nr_objects_deleted;
+ int nr_objects_created;
+ /* All atom's flush queue objects are on this list */
+ struct list_head flush_queues;
+#if REISER4_DEBUG
+ /* number of flush queues for this atom. */
+ int nr_flush_queues;
+ /* Number of jnodes which were removed from atom's lists and put
+ on flush_queue */
+ int num_queued;
+#endif
+ /* number of threads who wait for this atom to complete commit */
+ int nr_waiters;
+ /* number of threads which do jnode_flush() over this atom */
+ int nr_flushers;
+ /* number of flush queues which are IN_USE and jnodes from fq->prepped
+ are submitted to disk by the reiser4_write_fq() routine. */
+ int nr_running_queues;
+
+ struct rb_root bricks_info;
+ struct atom_brick_info mabi; /* pre-allocated meta-data brick info */
+#if REISER4_DEBUG
+ void *committer;
+#endif
+ struct super_block *super;
+};
+
+#define ATOM_DIRTY_LIST(atom, level) (&(atom)->dirty_nodes[level])
+#define ATOM_CLEAN_LIST(atom) (&(atom)->clean_nodes)
+#define ATOM_OVRWR_LIST(atom) (&(atom)->ovrwr_nodes)
+#define ATOM_WB_LIST(atom) (&(atom)->writeback_nodes)
+#define ATOM_FQ_LIST(fq) (&(fq)->prepped)
+
+#define NODE_LIST(node) (node)->list
+#define ASSIGN_NODE_LIST(node, list) ON_DEBUG(NODE_LIST(node) = list)
+ON_DEBUG(void
+ count_jnode(txn_atom *, jnode *, atom_list old_list,
+ atom_list new_list, int check_lists));
+
+/* A transaction handle: the client obtains and commits this handle which is assigned by
+ the system to a txn_atom. */
+struct txn_handle {
+ /* Spinlock protecting ->atom pointer */
+ spinlock_t hlock;
+
+ /* Flags for controlling commit_txnh() behavior */
+ /* from txn_handle_flags_t */
+ txn_handle_flags_t flags;
+
+ /* Whether it is READ_FUSING or WRITE_FUSING. */
+ txn_mode mode;
+
+ /* If assigned, the atom it is part of. */
+ txn_atom *atom;
+
+ /* Transaction list link. Head is in txn_atom. */
+ struct list_head txnh_link;
+};
+
+/* The transaction manager: one is contained in the reiser4_super_info_data */
+struct txn_mgr {
+ /* A spinlock protecting the atom list, id_count, flush_control */
+ spinlock_t tmgr_lock;
+
+ /* List of atoms. */
+ struct list_head atoms_list;
+
+ /* Number of atoms. */
+ int atom_count;
+
+ /* A counter used to assign atom->atom_id values. */
+ __u32 id_count;
+
+ /* a mutex object for commit serialization */
+ struct mutex commit_mutex;
+
+ /* a list of all txnmgrs served by a particular daemon. */
+ struct list_head linkage;
+
+ /* description of daemon for this txnmgr */
+ ktxnmgrd_context *daemon;
+
+ /* parameters. Adjustable through mount options. */
+ unsigned int atom_max_size;
+ unsigned int atom_max_age;
+ unsigned int atom_min_size;
+ /* max number of concurrent flushers for one atom, 0 - unlimited. */
+ unsigned int atom_max_flushers;
+ struct dentry *debugfs_atom_count;
+ struct dentry *debugfs_id_count;
+};
+
+/* FUNCTION DECLARATIONS */
+
+/* These are the externally (within Reiser4) visible transaction functions, therefore they
+ are prefixed with "txn_". For comments, see txnmgr.c. */
+
+extern int init_txnmgr_static(void);
+extern void done_txnmgr_static(void);
+
+extern void reiser4_init_txnmgr(txn_mgr *);
+extern void reiser4_done_txnmgr(txn_mgr *);
+
+extern int reiser4_txn_reserve(int reserved);
+
+extern void reiser4_txn_begin(reiser4_context * context);
+extern int reiser4_txn_end(reiser4_context * context);
+
+extern void reiser4_txn_restart(reiser4_context * context);
+extern void reiser4_txn_restart_current(void);
+
+extern int txnmgr_force_commit_all(struct super_block *, int);
+extern int current_atom_should_commit(void);
+
+extern jnode *find_first_dirty_jnode(txn_atom *, int);
+
+extern int commit_some_atoms(txn_mgr *);
+extern int force_commit_atom(txn_handle *);
+extern int force_commit_current_atom(void);
+
+extern int flush_current_atom(int, long, long *, txn_atom **, jnode *);
+
+extern int flush_some_atom(jnode *, long *, const struct writeback_control *, int);
+
+extern void reiser4_atom_set_stage(txn_atom * atom, txn_stage stage);
+
+extern int same_slum_check(jnode * base, jnode * check, int alloc_check,
+ int alloc_value);
+extern void atom_dec_and_unlock(txn_atom * atom);
+
+extern int reiser4_create_atom(void);
+extern int reiser4_try_capture(jnode * node, znode_lock_mode mode, txn_capture flags);
+extern int try_capture_page_to_invalidate(struct page *pg);
+
+extern void reiser4_uncapture_page(struct page *pg);
+extern void reiser4_uncapture_block(jnode *node);
+extern void reiser4_uncapture_jnode(jnode *node);
+
+extern int reiser4_capture_inode(struct inode *);
+extern int reiser4_uncapture_inode(struct inode *);
+
+extern txn_atom *get_current_atom_locked_nocheck(void);
+
+extern struct atom_brick_info *alloc_atom_brick_info(void);
+extern void free_atom_brick_info(struct atom_brick_info *abi);
+static inline void init_atom_brick_info(struct atom_brick_info *abi,
+ u32 brick_id)
+{
+ memset(abi, 0, sizeof(*abi));
+ RB_CLEAR_NODE(&abi->node);
+ abi->brick_id = brick_id;
+}
+
+static inline struct atom_brick_info *atom_meta_brick_info(txn_atom *atom)
+{
+ return &atom->mabi;
+}
+
+extern struct atom_brick_info *find_atom_brick_info(const struct rb_root *root,
+ u32 brick_id);
+extern struct atom_brick_info *insert_atom_brick_info(struct rb_root *root,
+ struct atom_brick_info *abi);
+extern int __check_insert_atom_brick_info(txn_atom **atom, u32 brick_id,
+ struct atom_brick_info **abi);
+extern int check_insert_atom_brick_info(u32 brick_id,
+ struct atom_brick_info **abi);
+
+#if REISER4_DEBUG
+extern void check_atom_flush_reserved(txn_atom *atom);
+extern void __check_atom_brick_info(struct rb_root *root);
+extern void check_atom_brick_info(txn_atom *atom);
+#else
+#define check_atom_flush_reserved(atom) noop
+#define __check_atom_brick_info(root) noop
+#define check_atom_brick_info(atom) noop
+#endif
+
+#if REISER4_DEBUG
+
+/**
+ * atom_is_protected - make sure that nobody but us can do anything with atom
+ * @atom: atom to be checked
+ *
+ * This is used to assert that atom either entered commit stages or is spin
+ * locked.
+ */
+static inline int atom_is_protected(txn_atom *atom)
+{
+ if (atom->stage >= ASTAGE_PRE_COMMIT)
+ return 1;
+ assert_spin_locked(&(atom->alock));
+ return 1;
+}
+
+#endif
+
+/* Get the current atom and spin-lock it. The current atom must be present: this never returns NULL. */
+static inline txn_atom *get_current_atom_locked(void)
+{
+ txn_atom *atom;
+
+ atom = get_current_atom_locked_nocheck();
+ assert("zam-761", atom != NULL);
+
+ return atom;
+}
+
+extern txn_atom *jnode_get_atom(jnode *);
+
+extern void reiser4_atom_wait_event(txn_atom *);
+extern void reiser4_atom_send_event(txn_atom *);
+
+extern void insert_into_atom_ovrwr_list(txn_atom *atom, jnode *node);
+extern void insert_into_subv_ovrwr_list(reiser4_subvol *subv, jnode *node,
+ txn_atom *atom);
+
+extern int capture_brick_super(reiser4_subvol *subv);
+int capture_bulk(jnode **, int count);
+
+/* See the comment on the function blocknrset.c:blocknr_set_add for the
+ calling convention of these three routines. */
+extern int blocknr_set_init_static(void);
+extern void blocknr_set_done_static(void);
+extern void blocknr_set_init(struct list_head * bset);
+extern void blocknr_set_destroy(struct list_head * bset);
+extern void blocknr_set_merge(struct list_head * from, struct list_head * into);
+extern int blocknr_set_add_extent(txn_atom * atom,
+ struct list_head * bset,
+ blocknr_set_entry ** new_bsep,
+ const reiser4_block_nr * start,
+ const reiser4_block_nr * len,
+ const __u32 subvol_id);
+extern int blocknr_set_add_pair(txn_atom * atom, struct list_head * bset,
+ blocknr_set_entry ** new_bsep,
+ const reiser4_block_nr * a,
+ const reiser4_block_nr * b,
+ __u32 subvol_id);
+
+typedef int (*blocknr_set_actor_f) (txn_atom *,
+ const reiser4_block_nr *,
+ const reiser4_block_nr *,
+ __u32,
+ void *);
+
+extern int blocknr_set_iterator(txn_atom * atom, struct list_head * bset,
+ blocknr_set_actor_f actor, void *data,
+ int delete, u32 subv_id);
+
+/* This is the block list interface (see blocknrlist.c) */
+extern int blocknr_list_init_static(void);
+extern void blocknr_list_done_static(void);
+extern void blocknr_list_init(struct list_head *blist);
+extern void blocknr_list_destroy(struct list_head *blist);
+extern void blocknr_list_merge(struct list_head *from, struct list_head *to);
+extern void blocknr_list_sort_and_join(struct list_head *blist);
+/**
+ * The @atom should be locked.
+ */
+extern int blocknr_list_add_extent(txn_atom *atom,
+ struct list_head *blist,
+ blocknr_list_entry **new_entry,
+ const reiser4_block_nr *start,
+ const reiser4_block_nr *len,
+ __u32 subvol_id);
+extern int blocknr_list_iterator(txn_atom *atom,
+ struct list_head *blist,
+ blocknr_set_actor_f actor,
+ void *data,
+ int delete);
+
+/* These are wrappers for accessing and modifying atom's delete lists,
+ depending on whether discard is enabled or not.
+ If it is enabled, (less memory efficient) blocknr_list is used for delete
+ list storage. Otherwise, blocknr_set is used for this purpose. */
+extern void atom_dset_init(txn_atom *atom);
+extern void atom_dset_destroy(txn_atom *atom);
+extern void atom_dset_merge(txn_atom *from, txn_atom *to);
+extern int atom_dset_deferred_apply(txn_atom* atom,
+ blocknr_set_actor_f actor,
+ void *data,
+ int delete);
+extern int atom_dset_deferred_add_extent(txn_atom *atom,
+ void **new_entry,
+ const reiser4_block_nr *start,
+ const reiser4_block_nr *len,
+ const __u32 subvol_id);
+
+/* flush code takes care about how to fuse flush queues */
+extern void flush_init_atom(txn_atom * atom);
+extern void flush_fuse_queues(txn_atom * large, txn_atom * small);
+
+static inline void spin_lock_atom(txn_atom *atom)
+{
+ /* check that spinlocks of lower priorities are not held */
+ assert("", (LOCK_CNT_NIL(spin_locked_txnh) &&
+ LOCK_CNT_NIL(spin_locked_atom) &&
+ LOCK_CNT_NIL(spin_locked_jnode) &&
+ LOCK_CNT_NIL(spin_locked_zlock) &&
+ LOCK_CNT_NIL(rw_locked_dk) &&
+ LOCK_CNT_NIL(rw_locked_tree)));
+
+ spin_lock(&(atom->alock));
+
+ LOCK_CNT_INC(spin_locked_atom);
+ LOCK_CNT_INC(spin_locked);
+}
+
+static inline void spin_lock_atom_nested(txn_atom *atom)
+{
+ assert("", (LOCK_CNT_NIL(spin_locked_txnh) &&
+ LOCK_CNT_NIL(spin_locked_jnode) &&
+ LOCK_CNT_NIL(spin_locked_zlock) &&
+ LOCK_CNT_NIL(rw_locked_dk) &&
+ LOCK_CNT_NIL(rw_locked_tree)));
+
+ spin_lock_nested(&(atom->alock), SINGLE_DEPTH_NESTING);
+
+ LOCK_CNT_INC(spin_locked_atom);
+ LOCK_CNT_INC(spin_locked);
+}
+
+static inline int spin_trylock_atom(txn_atom *atom)
+{
+ if (spin_trylock(&(atom->alock))) {
+ LOCK_CNT_INC(spin_locked_atom);
+ LOCK_CNT_INC(spin_locked);
+ return 1;
+ }
+ return 0;
+}
+
+static inline void spin_unlock_atom(txn_atom *atom)
+{
+ assert_spin_locked(&(atom->alock));
+ assert("nikita-1375", LOCK_CNT_GTZ(spin_locked_atom));
+ assert("nikita-1376", LOCK_CNT_GTZ(spin_locked));
+
+ LOCK_CNT_DEC(spin_locked_atom);
+ LOCK_CNT_DEC(spin_locked);
+
+ spin_unlock(&(atom->alock));
+}
+
+static inline void spin_lock_txnh(txn_handle *txnh)
+{
+ /* check that spinlocks of lower priorities are not held */
+ assert("", (LOCK_CNT_NIL(rw_locked_dk) &&
+ LOCK_CNT_NIL(spin_locked_zlock) &&
+ LOCK_CNT_NIL(rw_locked_tree)));
+
+ spin_lock(&(txnh->hlock));
+
+ LOCK_CNT_INC(spin_locked_txnh);
+ LOCK_CNT_INC(spin_locked);
+}
+
+static inline int spin_trylock_txnh(txn_handle *txnh)
+{
+ if (spin_trylock(&(txnh->hlock))) {
+ LOCK_CNT_INC(spin_locked_txnh);
+ LOCK_CNT_INC(spin_locked);
+ return 1;
+ }
+ return 0;
+}
+
+static inline void spin_unlock_txnh(txn_handle *txnh)
+{
+ assert_spin_locked(&(txnh->hlock));
+ assert("nikita-1375", LOCK_CNT_GTZ(spin_locked_txnh));
+ assert("nikita-1376", LOCK_CNT_GTZ(spin_locked));
+
+ LOCK_CNT_DEC(spin_locked_txnh);
+ LOCK_CNT_DEC(spin_locked);
+
+ spin_unlock(&(txnh->hlock));
+}
+
+#define spin_ordering_pred_txnmgr(tmgr) \
+ ( LOCK_CNT_NIL(spin_locked_atom) && \
+ LOCK_CNT_NIL(spin_locked_txnh) && \
+ LOCK_CNT_NIL(spin_locked_jnode) && \
+ LOCK_CNT_NIL(rw_locked_zlock) && \
+ LOCK_CNT_NIL(rw_locked_dk) && \
+ LOCK_CNT_NIL(rw_locked_tree) )
+
+static inline void spin_lock_txnmgr(txn_mgr *mgr)
+{
+ /* check that spinlocks of lower priorities are not held */
+ assert("", (LOCK_CNT_NIL(spin_locked_atom) &&
+ LOCK_CNT_NIL(spin_locked_txnh) &&
+ LOCK_CNT_NIL(spin_locked_jnode) &&
+ LOCK_CNT_NIL(spin_locked_zlock) &&
+ LOCK_CNT_NIL(rw_locked_dk) &&
+ LOCK_CNT_NIL(rw_locked_tree)));
+
+ spin_lock(&(mgr->tmgr_lock));
+
+ LOCK_CNT_INC(spin_locked_txnmgr);
+ LOCK_CNT_INC(spin_locked);
+}
+
+static inline int spin_trylock_txnmgr(txn_mgr *mgr)
+{
+ if (spin_trylock(&(mgr->tmgr_lock))) {
+ LOCK_CNT_INC(spin_locked_txnmgr);
+ LOCK_CNT_INC(spin_locked);
+ return 1;
+ }
+ return 0;
+}
+
+static inline void spin_unlock_txnmgr(txn_mgr *mgr)
+{
+ assert_spin_locked(&(mgr->tmgr_lock));
+ assert("nikita-1375", LOCK_CNT_GTZ(spin_locked_txnmgr));
+ assert("nikita-1376", LOCK_CNT_GTZ(spin_locked));
+
+ LOCK_CNT_DEC(spin_locked_txnmgr);
+ LOCK_CNT_DEC(spin_locked);
+
+ spin_unlock(&(mgr->tmgr_lock));
+}
+
+typedef enum {
+ FQ_IN_USE = 0x1
+} flush_queue_state_t;
+
+typedef struct flush_queue flush_queue_t;
+
+/* This is an accumulator for jnodes prepared for writing to disk. A flush queue
+ is filled by the jnode_flush() routine, and written to disk under memory
+ pressure or at atom commit time. */
+/* LOCKING: fq state and fq->atom are protected by the guard spinlock; the
+ fq->nr_queued field and the fq->prepped list can be modified if the atom is
+ spin-locked and the fq object is in the "in-use" state. For read-only
+ traversal of the fq->prepped list and reading of the fq->nr_queued field it
+ is enough to keep the fq "in-use" or only have the atom spin-locked. */
+struct flush_queue {
+ /* linkage element is the first in this structure to make debugging
+ easier. See field in atom struct for description of list. */
+ struct list_head alink;
+ /* A spinlock to protect changes of fq state and fq->atom pointer */
+ spinlock_t guard;
+ /* flush_queue state: [in_use | ready] */
+ flush_queue_state_t state;
+ /* A list which contains queued nodes, queued nodes are removed from any
+ * atom's list and put on this ->prepped one. */
+ struct list_head prepped;
+ /* number of submitted i/o requests */
+ atomic_t nr_submitted;
+ /* number of i/o errors */
+ atomic_t nr_errors;
+ /* An atom this flush queue is attached to */
+ txn_atom *atom;
+ /* A wait queue head to wait on i/o completion */
+ wait_queue_head_t wait;
+#if REISER4_DEBUG
+ /* A thread which took this fq in exclusive use, NULL if fq is free,
+ * used for debugging. */
+ struct task_struct *owner;
+#endif
+};
+
+extern int reiser4_fq_by_atom(txn_atom *, flush_queue_t **);
+extern void reiser4_fq_put_nolock(flush_queue_t *);
+extern void reiser4_fq_put(flush_queue_t *);
+extern void reiser4_fuse_fq(txn_atom * to, txn_atom * from);
+extern void queue_jnode(flush_queue_t *, jnode *);
+
+extern int reiser4_write_fq(flush_queue_t *, long *, int);
+extern int current_atom_finish_all_fq(void);
+extern void init_atom_fq_parts(txn_atom *);
+
+extern reiser4_block_nr txnmgr_count_deleted_blocks(void);
+
+extern void znode_make_dirty(znode * node);
+extern void jnode_make_dirty_locked(jnode * node);
+
+extern int reiser4_sync_atom(txn_atom * atom);
+
+#if REISER4_DEBUG
+extern int atom_fq_parts_are_clean(txn_atom *);
+#endif
+
+extern void add_fq_to_bio(flush_queue_t *, struct bio *);
+extern flush_queue_t *get_fq_for_current_atom(void);
+
+void reiser4_invalidate_list(struct list_head * head);
+
+# endif /* __REISER4_TXNMGR_H__ */
+
+/* Make Linus happy.
+ Local variables:
+ c-indentation-style: "K&R"
+ mode-name: "LC"
+ c-basic-offset: 8
+ tab-width: 8
+ fill-column: 120
+ End:
+*/
diff -urN --no-dereference linux-5.10.2.orig/fs/reiser4/type_safe_hash.h linux-5.10.2/fs/reiser4/type_safe_hash.h
--- linux-5.10.2.orig/fs/reiser4/type_safe_hash.h 1970-01-01 01:00:00.000000000 +0100
+++ linux-5.10.2/fs/reiser4/type_safe_hash.h 2020-12-23 16:07:46.137813407 +0100
@@ -0,0 +1,320 @@
+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
+ * reiser4/README */
+
+/* A hash table class that uses hash chains (singly-linked) and is
+ parametrized to provide type safety. */
+
+#ifndef __REISER4_TYPE_SAFE_HASH_H__
+#define __REISER4_TYPE_SAFE_HASH_H__
+
+#include "debug.h"
+
+#include <asm/errno.h>
+/* Step 1: Use TYPE_SAFE_HASH_DECLARE() to define the TABLE and LINK objects
+ based on the object type. You need to declare the item type before
+ this definition, define it after this definition. */
+#define TYPE_SAFE_HASH_DECLARE(PREFIX,ITEM_TYPE) \
+ \
+typedef struct PREFIX##_hash_table_ PREFIX##_hash_table; \
+typedef struct PREFIX##_hash_link_ PREFIX##_hash_link; \
+ \
+struct PREFIX##_hash_table_ \
+{ \
+ ITEM_TYPE **_table; \
+ __u32 _buckets; \
+}; \
+ \
+struct PREFIX##_hash_link_ \
+{ \
+ ITEM_TYPE *_next; \
+}
+
+/* Step 2: Define the object type of the hash: give it field of type
+ PREFIX_hash_link. */
+
+/* Step 3: Use TYPE_SAFE_HASH_DEFINE to define the hash table interface using
+ the type and field name used in step 2. The arguments are:
+
+ ITEM_TYPE The item type being hashed
+ KEY_TYPE The type of key being hashed
+ KEY_NAME The name of the key field within the item
+ LINK_NAME The name of the link field within the item (which you must make of type PREFIX_hash_link)
+ HASH_FUNC The name of the hash function (or macro, takes const pointer to key)
+ EQ_FUNC The name of the equality function (or macro, takes const pointer to two keys)
+
+ It implements these functions:
+
+ prefix_hash_init Initialize the table given its size.
+ prefix_hash_insert Insert an item
+ prefix_hash_insert_index Insert an item w/ precomputed hash_index
+ prefix_hash_find Find an item by key
+ prefix_hash_find_index Find an item w/ precomputed hash_index
+ prefix_hash_remove Remove an item, returns 1 if found, 0 if not found
+ prefix_hash_remove_index Remove an item w/ precomputed hash_index
+
+ If you'd like something to be done differently, feel free to ask me
+ for modifications. Additional features that could be added but
+ have not been:
+
+ prefix_hash_remove_key Find and remove an item by key
+ prefix_hash_remove_key_index Find and remove an item by key w/ precomputed hash_index
+
+ The hash_function currently receives only the key as an argument,
+ meaning it must somehow know the number of buckets. If this is a
+ problem let me know.
+
+ This hash table uses a singly-linked hash chain. This means
+ insertion is fast but deletion requires searching the chain.
+
+ There is also the doubly-linked hash chain approach, under which
+ deletion requires no search but the code is longer and it takes two
+ pointers per item.
+
+ The circularly-linked approach has the shortest code but requires
+ two pointers per bucket, doubling the size of the bucket array (in
+ addition to two pointers per item).
+*/
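+
+/* An illustrative usage sketch (a hypothetical item type "foo" keyed by an
+   integer; this example is not taken from the reiser4 sources and assumes a
+   power-of-two bucket count for the hash function):
+
+       typedef struct foo foo;
+       TYPE_SAFE_HASH_DECLARE(foo, foo);
+
+       struct foo {
+               __u32 oid;              <- KEY_NAME
+               foo_hash_link link;     <- LINK_NAME (type from step 1)
+       };
+
+       #define foo_hashfn(hash, key) (*(key) & ((hash)->_buckets - 1))
+       #define foo_eq(key1, key2) (*(key1) == *(key2))
+
+       TYPE_SAFE_HASH_DEFINE(foo, foo, __u32, oid, link, foo_hashfn, foo_eq);
+
+   after which foo_hash_init(), foo_hash_insert(), foo_hash_find(),
+   foo_hash_remove(), etc. become available. */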
+#define TYPE_SAFE_HASH_DEFINE(PREFIX,ITEM_TYPE,KEY_TYPE,KEY_NAME,LINK_NAME,HASH_FUNC,EQ_FUNC) \
+ \
+static __inline__ void \
+PREFIX##_check_hash (PREFIX##_hash_table *table UNUSED_ARG, \
+ __u32 hash UNUSED_ARG) \
+{ \
+ assert("nikita-2780", hash < table->_buckets); \
+} \
+ \
+static __inline__ int \
+PREFIX##_hash_init (PREFIX##_hash_table *hash, \
+ __u32 buckets) \
+{ \
+ hash->_table = (ITEM_TYPE**) KMALLOC (sizeof (ITEM_TYPE*) * buckets); \
+ hash->_buckets = buckets; \
+ if (hash->_table == NULL) \
+ { \
+ return RETERR(-ENOMEM); \
+ } \
+ memset (hash->_table, 0, sizeof (ITEM_TYPE*) * buckets); \
+ ON_DEBUG(printk(#PREFIX "_hash_table: %i buckets\n", buckets)); \
+ return 0; \
+} \
+ \
+static __inline__ void \
+PREFIX##_hash_done (PREFIX##_hash_table *hash) \
+{ \
+ if (REISER4_DEBUG && hash->_table != NULL) { \
+ __u32 i; \
+ for (i = 0 ; i < hash->_buckets ; ++ i) \
+ assert("nikita-2905", hash->_table[i] == NULL); \
+ } \
+ if (hash->_table != NULL) \
+ KFREE (hash->_table, sizeof (ITEM_TYPE*) * hash->_buckets); \
+ hash->_table = NULL; \
+} \
+ \
+static __inline__ void \
+PREFIX##_hash_prefetch_next (ITEM_TYPE *item) \
+{ \
+ prefetch(item->LINK_NAME._next); \
+} \
+ \
+static __inline__ void \
+PREFIX##_hash_prefetch_bucket (PREFIX##_hash_table *hash, \
+ __u32 index) \
+{ \
+ prefetch(hash->_table[index]); \
+} \
+ \
+static __inline__ ITEM_TYPE* \
+PREFIX##_hash_find_index (PREFIX##_hash_table *hash, \
+ __u32 hash_index, \
+ KEY_TYPE const *find_key) \
+{ \
+ ITEM_TYPE *item; \
+ \
+ PREFIX##_check_hash(hash, hash_index); \
+ \
+ for (item = hash->_table[hash_index]; \
+ item != NULL; \
+ item = item->LINK_NAME._next) \
+ { \
+ prefetch(item->LINK_NAME._next); \
+ prefetch(item->LINK_NAME._next + offsetof(ITEM_TYPE, KEY_NAME)); \
+ if (EQ_FUNC (& item->KEY_NAME, find_key)) \
+ { \
+ return item; \
+ } \
+ } \
+ \
+ return NULL; \
+} \
+ \
+static __inline__ ITEM_TYPE* \
+PREFIX##_hash_find_index_lru (PREFIX##_hash_table *hash, \
+ __u32 hash_index, \
+ KEY_TYPE const *find_key) \
+{ \
+ ITEM_TYPE ** item = &hash->_table[hash_index]; \
+ \
+ PREFIX##_check_hash(hash, hash_index); \
+ \
+ while (*item != NULL) { \
+ prefetch(&(*item)->LINK_NAME._next); \
+ if (EQ_FUNC (&(*item)->KEY_NAME, find_key)) { \
+ ITEM_TYPE *found; \
+ \
+ found = *item; \
+ *item = found->LINK_NAME._next; \
+ found->LINK_NAME._next = hash->_table[hash_index]; \
+ hash->_table[hash_index] = found; \
+ return found; \
+ } \
+ item = &(*item)->LINK_NAME._next; \
+ } \
+ return NULL; \
+} \
+ \
+static __inline__ int \
+PREFIX##_hash_remove_index (PREFIX##_hash_table *hash, \
+ __u32 hash_index, \
+ ITEM_TYPE *del_item) \
+{ \
+ ITEM_TYPE ** hash_item_p = &hash->_table[hash_index]; \
+ \
+ PREFIX##_check_hash(hash, hash_index); \
+ \
+ while (*hash_item_p != NULL) { \
+ prefetch(&(*hash_item_p)->LINK_NAME._next); \
+ if (*hash_item_p == del_item) { \
+ *hash_item_p = (*hash_item_p)->LINK_NAME._next; \
+ return 1; \
+ } \
+ hash_item_p = &(*hash_item_p)->LINK_NAME._next; \
+ } \
+ return 0; \
+} \
+ \
+static __inline__ void \
+PREFIX##_hash_insert_index (PREFIX##_hash_table *hash, \
+ __u32 hash_index, \
+ ITEM_TYPE *ins_item) \
+{ \
+ PREFIX##_check_hash(hash, hash_index); \
+ \
+ ins_item->LINK_NAME._next = hash->_table[hash_index]; \
+ hash->_table[hash_index] = ins_item; \
+} \
+ \
+static __inline__ void \
+PREFIX##_hash_insert_index_rcu (PREFIX##_hash_table *hash, \
+ __u32 hash_index, \
+ ITEM_TYPE *ins_item) \
+{ \
+ PREFIX##_check_hash(hash, hash_index); \
+ \
+ ins_item->LINK_NAME._next = hash->_table[hash_index]; \
+ smp_wmb(); \
+ hash->_table[hash_index] = ins_item; \
+} \
+ \
+static __inline__ ITEM_TYPE* \
+PREFIX##_hash_find (PREFIX##_hash_table *hash, \
+ KEY_TYPE const *find_key) \
+{ \
+ return PREFIX##_hash_find_index (hash, HASH_FUNC(hash, find_key), find_key); \
+} \
+ \
+static __inline__ ITEM_TYPE* \
+PREFIX##_hash_find_lru (PREFIX##_hash_table *hash, \
+ KEY_TYPE const *find_key) \
+{ \
+ return PREFIX##_hash_find_index_lru (hash, HASH_FUNC(hash, find_key), find_key); \
+} \
+ \
+static __inline__ int \
+PREFIX##_hash_remove (PREFIX##_hash_table *hash, \
+ ITEM_TYPE *del_item) \
+{ \
+ return PREFIX##_hash_remove_index (hash, \
+ HASH_FUNC(hash, &del_item->KEY_NAME), del_item); \
+} \
+ \
+static __inline__ int \
+PREFIX##_hash_remove_rcu (PREFIX##_hash_table *hash, \
+ ITEM_TYPE *del_item) \
+{ \
+ return PREFIX##_hash_remove (hash, del_item); \
+} \
+ \
+static __inline__ void \
+PREFIX##_hash_insert (PREFIX##_hash_table *hash, \
+ ITEM_TYPE *ins_item) \
+{ \
+ return PREFIX##_hash_insert_index (hash, \
+ HASH_FUNC(hash, &ins_item->KEY_NAME), ins_item); \
+} \
+ \
+static __inline__ void \
+PREFIX##_hash_insert_rcu (PREFIX##_hash_table *hash, \
+ ITEM_TYPE *ins_item) \
+{ \
+ return PREFIX##_hash_insert_index_rcu (hash, HASH_FUNC(hash, &ins_item->KEY_NAME), \
+ ins_item); \
+} \
+ \
+static __inline__ ITEM_TYPE * \
+PREFIX##_hash_first (PREFIX##_hash_table *hash, __u32 ind) \
+{ \
+ ITEM_TYPE *first; \
+ \
+ for (first = NULL; ind < hash->_buckets; ++ ind) { \
+ first = hash->_table[ind]; \
+ if (first != NULL) \
+ break; \
+ } \
+ return first; \
+} \
+ \
+static __inline__ ITEM_TYPE * \
+PREFIX##_hash_next (PREFIX##_hash_table *hash, \
+ ITEM_TYPE *item) \
+{ \
+ ITEM_TYPE *next; \
+ \
+ if (item == NULL) \
+ return NULL; \
+ next = item->LINK_NAME._next; \
+ if (next == NULL) \
+ next = PREFIX##_hash_first (hash, HASH_FUNC(hash, &item->KEY_NAME) + 1); \
+ return next; \
+} \
+ \
+typedef struct {} PREFIX##_hash_dummy
+
+#define for_all_ht_buckets(table, head) \
+for ((head) = &(table) -> _table[ 0 ] ; \
+ (head) != &(table) -> _table[ (table) -> _buckets ] ; ++ (head))
+
+#define for_all_in_bucket(bucket, item, next, field) \
+for ((item) = *(bucket), (next) = (item) ? (item) -> field._next : NULL ; \
+ (item) != NULL ; \
+ (item) = (next), (next) = (item) ? (item) -> field._next : NULL )
+
+#define for_all_in_htable(table, prefix, item, next) \
+for ((item) = prefix ## _hash_first ((table), 0), \
+ (next) = prefix ## _hash_next ((table), (item)) ; \
+ (item) != NULL ; \
+ (item) = (next), \
+ (next) = prefix ## _hash_next ((table), (item)))
+
+/* __REISER4_TYPE_SAFE_HASH_H__ */
+#endif
+
+/* Make Linus happy.
+ Local variables:
+ c-indentation-style: "K&R"
+ mode-name: "LC"
+ c-basic-offset: 8
+ tab-width: 8
+ fill-column: 120
+ End:
+*/
diff -urN --no-dereference linux-5.10.2.orig/fs/reiser4/vfs_ops.c linux-5.10.2/fs/reiser4/vfs_ops.c
--- linux-5.10.2.orig/fs/reiser4/vfs_ops.c 1970-01-01 01:00:00.000000000 +0100
+++ linux-5.10.2/fs/reiser4/vfs_ops.c 2020-12-23 16:07:46.137813407 +0100
@@ -0,0 +1,261 @@
+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
+ * reiser4/README */
+
+/* Interface to VFS. Reiser4 {super|export|dentry}_operations are defined
+ here. */
+
+#include "forward.h"
+#include "debug.h"
+#include "dformat.h"
+#include "coord.h"
+#include "plugin/item/item.h"
+#include "plugin/file/file.h"
+#include "plugin/security/perm.h"
+#include "plugin/disk_format/disk_format.h"
+#include "plugin/plugin.h"
+#include "plugin/plugin_set.h"
+#include "plugin/object.h"
+#include "txnmgr.h"
+#include "jnode.h"
+#include "znode.h"
+#include "block_alloc.h"
+#include "tree.h"
+#include "vfs_ops.h"
+#include "inode.h"
+#include "page_cache.h"
+#include "ktxnmgrd.h"
+#include "super.h"
+#include "reiser4.h"
+#include "entd.h"
+#include "status_flags.h"
+#include "flush.h"
+#include "dscale.h"
+
+#include <linux/profile.h>
+#include <linux/types.h>
+#include <linux/mount.h>
+#include <linux/vfs.h>
+#include <linux/mm.h>
+#include <linux/buffer_head.h>
+#include <linux/dcache.h>
+#include <linux/list.h>
+#include <linux/pagemap.h>
+#include <linux/slab.h>
+#include <linux/seq_file.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/writeback.h>
+#include <linux/blkdev.h>
+#include <linux/security.h>
+#include <linux/reboot.h>
+#include <linux/rcupdate.h>
+
+/* update inode stat-data by calling plugin */
+int reiser4_update_sd(struct inode *object)
+{
+ file_plugin *fplug;
+
+ assert("nikita-2338", object != NULL);
+
+ /* check for read-only file system. */
+ if (IS_RDONLY(object))
+ return 0;
+
+ fplug = inode_file_plugin(object);
+ assert("nikita-2339", fplug != NULL);
+ return fplug->write_sd_by_inode(object, NULL);
+}
+
+/* helper function: increase inode nlink count and call plugin method to save
+ updated stat-data.
+
+ Used by link/create and during creation of dot and dotdot in mkdir
+*/
+int reiser4_add_nlink(struct inode *object, /* object to which link is added */
+ struct inode *parent, /* parent where new entry will be */
+ int write_sd_p /* true if stat-data has to be
+ * updated */ )
+{
+ file_plugin *fplug;
+ int result;
+
+ assert("nikita-1351", object != NULL);
+
+ fplug = inode_file_plugin(object);
+ assert("nikita-1445", fplug != NULL);
+
+ /* ask plugin whether it can add yet another link to this
+ object */
+ if (!fplug->can_add_link(object))
+ return RETERR(-EMLINK);
+
+ assert("nikita-2211", fplug->add_link != NULL);
+ /* call plugin to do actual addition of link */
+ result = fplug->add_link(object, parent);
+
+ /* optionally update stat data */
+ if (result == 0 && write_sd_p)
+ result = fplug->write_sd_by_inode(object, NULL);
+ return result;
+}
+
+/* helper function: decrease inode nlink count and call plugin method to save
+ updated stat-data.
+
+ Used by unlink/create
+*/
+int reiser4_del_nlink(struct inode *object /* object from which link is
+ * removed */ ,
+ struct inode *parent /* parent where entry was */ ,
+ int write_sd_p /* true if stat-data has to be
+ * updated */ )
+{
+ file_plugin *fplug;
+ int result;
+
+ assert("nikita-1349", object != NULL);
+
+ fplug = inode_file_plugin(object);
+ assert("nikita-1350", fplug != NULL);
+ assert("nikita-1446", object->i_nlink > 0);
+ assert("nikita-2210", fplug->rem_link != NULL);
+
+ /* call plugin to do actual deletion of link */
+ result = fplug->rem_link(object, parent);
+
+ /* optionally update stat data */
+ if (result == 0 && write_sd_p)
+ result = fplug->write_sd_by_inode(object, NULL);
+ return result;
+}
+
+/* Release reiser4 dentry. This is d_op->d_release() method. */
+static void reiser4_d_release(struct dentry *dentry /* dentry released */ )
+{
+ reiser4_free_dentry_fsdata(dentry);
+}
+
+/*
+ * Called by reiser4_sync_inodes(), during speculative write-back (through
+ * pdflush, or balance_dirty_pages()).
+ */
+void reiser4_writeout(struct super_block *sb, struct writeback_control *wbc)
+{
+ long written = 0;
+ int repeats = 0;
+ int result;
+
+ /*
+ * Performs early flushing, trying to free some memory. If there
+ * is nothing to flush, commits some atoms.
+ *
+ * Commit all atoms if reiser4_writepages_dispatch() is called
+ * from sys_sync() or sys_fsync()
+ */
+ if (wbc->sync_mode != WB_SYNC_NONE) {
+ txnmgr_force_commit_all(sb, 0);
+ return;
+ }
+
+ BUG_ON(reiser4_get_super_fake(sb) == NULL);
+ do {
+ long nr_submitted = 0;
+ jnode *node = NULL;
+
+ /* do not put more requests to overload write queue */
+ if (bdi_write_congested(inode_to_bdi(reiser4_get_super_fake(sb)))) {
+ //blk_flush_plug(current);
+ break;
+ }
+ repeats++;
+ BUG_ON(wbc->nr_to_write <= 0);
+
+ if (get_current_context()->entd) {
+ entd_context *ent = get_entd_context(sb);
+
+ if (ent->cur_request->node)
+ /*
+ * this is ent thread and it managed to capture
+ * requested page itself - start flush from
+ * that page
+ */
+ node = ent->cur_request->node;
+ }
+
+ result = flush_some_atom(node, &nr_submitted, wbc,
+ JNODE_FLUSH_WRITE_BLOCKS);
+ if (result != 0)
+ warning("nikita-31001", "Flush failed: %i", result);
+ if (node)
+ /* drop the reference acquired
+ in find_or_create_extent() */
+ jput(node);
+ if (!nr_submitted)
+ break;
+
+ wbc->nr_to_write -= nr_submitted;
+ written += nr_submitted;
+ } while (wbc->nr_to_write > 0);
+}
+
+/* tell VM how many pages were dirtied */
+void reiser4_throttle_write(struct inode *inode)
+{
+ reiser4_context *ctx;
+
+ ctx = get_current_context();
+ reiser4_txn_restart(ctx);
+ current->journal_info = NULL;
+ balance_dirty_pages_ratelimited(inode->i_mapping);
+ current->journal_info = ctx;
+}
+
+const int REISER4_MAGIC_OFFSET = 16 * 4096; /* offset to magic string from the
+ * beginning of device */
+
+/*
+ * Reiser4 initialization/shutdown.
+ *
+ * Code below performs global reiser4 initialization that is done either as
+ * part of kernel initialization (when reiser4 is statically built-in), or
+ * during reiser4 module load (when compiled as module).
+ */
+
+void reiser4_handle_error(void)
+{
+ struct super_block *sb = reiser4_get_current_sb();
+
+ if (!sb)
+ return;
+ reiser4_status_write(get_meta_subvol(),
+ REISER4_STATUS_DAMAGED, 0,
+ "Filesystem error occurred");
+ switch (get_super_private(sb)->onerror) {
+ case 1:
+ reiser4_panic("foobar-42", "Filesystem error occurred\n");
+ default:
+ if (sb_rdonly(sb))
+ return;
+ sb->s_flags |= SB_RDONLY;
+ break;
+ }
+}
+
+struct dentry_operations reiser4_dentry_operations = {
+ .d_revalidate = NULL,
+ .d_hash = NULL,
+ .d_compare = NULL,
+ .d_delete = NULL,
+ .d_release = reiser4_d_release,
+ .d_iput = NULL,
+};
+
+/* Make Linus happy.
+ Local variables:
+ c-indentation-style: "K&R"
+ mode-name: "LC"
+ c-basic-offset: 8
+ tab-width: 8
+ fill-column: 120
+ End:
+*/
diff -urN --no-dereference linux-5.10.2.orig/fs/reiser4/vfs_ops.h linux-5.10.2/fs/reiser4/vfs_ops.h
--- linux-5.10.2.orig/fs/reiser4/vfs_ops.h 1970-01-01 01:00:00.000000000 +0100
+++ linux-5.10.2/fs/reiser4/vfs_ops.h 2020-12-23 16:07:46.137813407 +0100
@@ -0,0 +1,60 @@
+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
+ * reiser4/README */
+
+/* vfs_ops.c's exported symbols */
+
+#if !defined( __FS_REISER4_VFS_OPS_H__ )
+#define __FS_REISER4_VFS_OPS_H__
+
+#include "forward.h"
+#include "coord.h"
+#include "seal.h"
+#include "plugin/file/file.h"
+#include "super.h"
+#include "readahead.h"
+
+#include <linux/types.h> /* for loff_t */
+#include <linux/fs.h> /* for struct address_space */
+#include <linux/dcache.h> /* for struct dentry */
+#include <linux/mm.h>
+#include <linux/backing-dev.h>
+
+/* address space operations */
+int reiser4_writepage(struct page *, struct writeback_control *);
+int reiser4_set_page_dirty(struct page *);
+void reiser4_invalidatepage(struct page *, unsigned int offset, unsigned int length);
+int reiser4_releasepage(struct page *, gfp_t);
+
+#ifdef CONFIG_MIGRATION
+int reiser4_migratepage(struct address_space *, struct page *,
+ struct page *, enum migrate_mode);
+#else
+#define reiser4_migratepage NULL
+#endif /* CONFIG_MIGRATION */
+
+extern int reiser4_update_sd(struct inode *);
+extern int reiser4_add_nlink(struct inode *, struct inode *, int);
+extern int reiser4_del_nlink(struct inode *, struct inode *, int);
+
+extern int reiser4_start_up_io(struct page *page);
+extern void reiser4_throttle_write(struct inode *);
+extern int jnode_is_releasable(jnode *);
+
+#define CAPTURE_APAGE_BURST (1024l)
+void reiser4_writeout(struct super_block *, struct writeback_control *);
+
+extern void reiser4_handle_error(void);
+
+/* __FS_REISER4_VFS_OPS_H__ */
+#endif
+
+/* Make Linus happy.
+ Local variables:
+ c-indentation-style: "K&R"
+ mode-name: "LC"
+ c-basic-offset: 8
+ tab-width: 8
+ fill-column: 120
+ scroll-step: 1
+ End:
+*/
diff -urN --no-dereference linux-5.10.2.orig/fs/reiser4/volume_ops.c linux-5.10.2/fs/reiser4/volume_ops.c
--- linux-5.10.2.orig/fs/reiser4/volume_ops.c 1970-01-01 01:00:00.000000000 +0100
+++ linux-5.10.2/fs/reiser4/volume_ops.c 2020-12-23 16:07:46.137813407 +0100
@@ -0,0 +1,639 @@
+/*
+ Copyright (c) 2017-2020 Eduard O. Shishkin
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+#include "debug.h"
+#include "super.h"
+#include "inode.h"
+#include "plugin/volume/volume.h"
+
+static int reiser4_register_brick(struct reiser4_vol_op_args *args)
+{
+ reiser4_volume *host = NULL;
+
+ return reiser4_scan_device(args->d.name, FMODE_READ,
+ get_reiser4_fs_type(), NULL, &host);
+}
+
+static int reiser4_print_volume(struct super_block *sb,
+ struct reiser4_vol_op_args *args)
+{
+ return super_vol_plug(sb)->print_volume(sb, args);
+}
+
+static int reiser4_print_brick(struct super_block *sb,
+ struct reiser4_vol_op_args *args)
+{
+ return super_vol_plug(sb)->print_brick(sb, args);
+}
+
+/**
+ * find activated brick by @name
+ */
+static reiser4_subvol *find_active_brick(struct super_block *super,
+ char *name)
+{
+ u32 subv_id;
+ reiser4_subvol *result = NULL;
+ lv_conf *conf = super_conf(super);
+
+ for_each_mslot(conf, subv_id) {
+ if (!conf_mslot_at(conf, subv_id))
+ continue;
+ if (!strcmp(conf_origin(conf, subv_id)->name, name)) {
+ result = conf_origin(conf, subv_id);
+ break;
+ }
+ }
+ return result;
+}
+
+static int reiser4_resize_brick(struct super_block *sb,
+ struct reiser4_vol_op_args *args)
+{
+ int ret;
+ reiser4_subvol *this;
+ int need_balance;
+
+ if (reiser4_volume_has_incomplete_removal(sb)) {
+ warning("edward-2166",
+ "Failed to resize brick (%s has incomplete removal)",
+ sb->s_id);
+ return -EBUSY;
+ }
+ if (args->new_capacity == 0) {
+ warning("edward-2395", "Can not resize brick to zero.");
+ return -EINVAL;
+ }
+ this = find_active_brick(sb, args->d.name);
+ if (!this) {
+ warning("edward-2148",
+ "Brick %s doesn't belong to volume %s. Can not resize.",
+ args->d.name,
+ reiser4_get_current_sb()->s_id);
+ return -EINVAL;
+ }
+ if (args->new_capacity == this->data_capacity)
+ /* nothing to do */
+ return 0;
+ ret = super_vol_plug(sb)->resize_brick(super_volume(sb),
+ this,
+ args->new_capacity - this->data_capacity,
+ &need_balance);
+ if (ret)
+ /* resize operation should be repeated in regular context */
+ return ret;
+
+ if (!(args->flags & COMPLETE_WITH_BALANCE))
+ return 0;
+
+ if (!need_balance)
+ return 0;
+
+ ret = super_vol_plug(sb)->balance_volume(sb, 0);
+ if (ret)
+ return ret;
+ /*
+ * clear unbalanced status on disk
+ */
+ reiser4_volume_clear_unbalanced(sb);
+ ret = capture_brick_super(get_meta_subvol());
+ if (ret)
+ return ret;
+ return force_commit_current_atom();
+}
+
+static int reiser4_add_brick(struct super_block *sb,
+ struct reiser4_vol_op_args *args, int add_proxy)
+{
+ int ret;
+ reiser4_volume *vol = super_volume(sb);
+ int activated_here = 0;
+ reiser4_subvol *new = NULL;
+ reiser4_volume *host_of_new = NULL;
+
+ if (reiser4_volume_has_incomplete_removal(sb)) {
+ warning("edward-2167",
+ "Failed to add brick (%s has incomplete removal)",
+ sb->s_id);
+ return -EBUSY;
+ }
+ /*
+ * register new brick
+ */
+ ret = reiser4_scan_device(args->d.name, FMODE_READ,
+ get_reiser4_fs_type(), &new, &host_of_new);
+ if (ret)
+ return ret;
+
+ assert("edward-1969", new != NULL);
+ assert("edward-1970", host_of_new != NULL);
+
+ if (host_of_new != vol) {
+ warning("edward-1971",
+ "Failed to add brick (Inappropriate volume)");
+ return -EINVAL;
+ }
+ if (!subvol_is_set(new, SUBVOL_ACTIVATED)) {
+ new->flags |= (1 << SUBVOL_IS_ORPHAN);
+
+ ret = reiser4_activate_subvol(sb, new);
+ if (ret)
+ return ret;
+ activated_here = 1;
+ }
+ if (add_proxy) {
+ if (brick_belongs_volume(vol, new) && is_proxy_brick(new)) {
+ warning("edward-2435",
+ "Can't add second proxy brick to the volume");
+ return -EINVAL;
+ }
+ assert("edward-2449",
+ ergo(!is_meta_brick(new),
+ subvol_is_set(new, SUBVOL_HAS_DATA_ROOM)));
+
+ new->flags |= (1 << SUBVOL_IS_PROXY);
+ }
+ ret = vol->vol_plug->add_brick(vol, new);
+ if (ret) {
+ /*
+ * operation of adding a brick should be repeated
+ * in regular context
+ */
+ if (activated_here) {
+ reiser4_deactivate_subvol(sb, new);
+ reiser4_unregister_subvol(new);
+ }
+ return ret;
+ }
+ /*
+ * new volume configuration has been written to disk,
+ * so release all volinfo jnodes - they are not needed
+ * any more
+ */
+ release_volinfo_nodes(&vol->volinfo[CUR_VOL_CONF], 0);
+ clear_bit(SUBVOL_IS_ORPHAN, &new->flags);
+
+ if (!(args->flags & COMPLETE_WITH_BALANCE))
+ return 0;
+
+ ret = vol->vol_plug->balance_volume(sb, 0);
+ if (ret)
+ return ret;
+ /* clear unbalanced status on disk */
+
+ reiser4_volume_clear_unbalanced(sb);
+ ret = capture_brick_super(get_meta_subvol());
+ if (ret)
+ return ret;
+ return force_commit_current_atom();
+}
+
+static void reiser4_detach_brick(reiser4_subvol *victim)
+{
+ struct ctx_brick_info *cbi;
+ reiser4_context *ctx = get_current_context();
+ struct rb_root *root = &ctx->bricks_info;
+ reiser4_super_info_data *sbinfo = get_current_super_private();
+
+ cbi = find_context_brick_info(ctx, victim->id);
+
+ assert("edward-2257", cbi != NULL);
+
+ __grabbed2free(cbi, sbinfo, cbi->grabbed_blocks, victim);
+
+ rb_erase(&cbi->node, root);
+ RB_CLEAR_NODE(&cbi->node);
+ free_context_brick_info(cbi);
+
+ victim->id = INVALID_SUBVOL_ID;
+ victim->flags |= (1 << SUBVOL_IS_ORPHAN);
+ reiser4_deactivate_subvol(victim->super, victim);
+ reiser4_unregister_subvol(victim);
+}
+
+static int reiser4_finish_removal(struct super_block *sb, reiser4_volume *vol);
+
+static int reiser4_remove_brick(struct super_block *sb,
+ struct reiser4_vol_op_args *args)
+{
+ int ret;
+ reiser4_volume *vol = super_volume(sb);
+ reiser4_subvol *victim;
+
+ if (reiser4_volume_has_incomplete_removal(sb)) {
+ warning("edward-2168",
+ "Failed to remove brick (%s has incomplete removal)",
+ sb->s_id);
+ return -EBUSY;
+ }
+ victim = find_active_brick(sb, args->d.name);
+ if (!victim) {
+ warning("edward-2149",
+ "Brick %s doesn't belong to volume %s. Can not remove.",
+ args->d.name,
+ reiser4_get_current_sb()->s_id);
+ return -EINVAL;
+ }
+ ret = vol->vol_plug->remove_brick(vol, victim);
+ if (ret)
+ return ret;
+ printk("reiser4 (%s): Brick %s scheduled for removal.\n",
+ sb->s_id, victim->name);
+
+ release_volinfo_nodes(&vol->volinfo[CUR_VOL_CONF], 0);
+
+ return reiser4_finish_removal(sb, vol);
+}
+
+static int reiser4_scale_volume(struct super_block *sb,
+ struct reiser4_vol_op_args *args)
+{
+ int ret;
+ reiser4_volume *vol = super_volume(sb);
+
+ if (reiser4_volume_has_incomplete_removal(sb)) {
+ warning("edward-2168",
+ "Failed to scale volume (%s has incomplete removal)",
+ sb->s_id);
+ return -EBUSY;
+ }
+ if (args->s.val == 0)
+ return 0;
+ ret = super_volume(sb)->vol_plug->scale_volume(sb, args->s.val);
+ if (ret)
+ return ret;
+ ret = capture_brick_super(get_meta_subvol());
+ if (ret)
+ return ret;
+ /*
+ * write unbalanced status to disk
+ */
+ ret = force_commit_current_atom();
+ if (ret)
+ return ret;
+ /*
+ * new volume configuration has been written to disk,
+ * so release all volinfo jnodes - they are not needed
+ * any more
+ */
+ release_volinfo_nodes(&vol->volinfo[CUR_VOL_CONF], 0);
+
+ printk("reiser4 (%s): Volume has been scaled by a factor of %u.\n",
+ sb->s_id, 1 << args->s.val);
+
+ if (!(args->flags & COMPLETE_WITH_BALANCE))
+ return 0;
+
+ ret = vol->vol_plug->balance_volume(sb, 0);
+ if (ret)
+ return ret;
+ /* clear unbalanced status on disk */
+
+ reiser4_volume_clear_unbalanced(sb);
+ ret = capture_brick_super(get_meta_subvol());
+ if (ret)
+ return ret;
+ return force_commit_current_atom();
+}
+
+/**
+ * We allow more than one balancing thread on the same volume. Note, however,
+ * that this is inefficient: the others will always follow one leader without
+ * doing useful work.
+ * Pre-condition: volume is read locked
+ */
+static int reiser4_balance_volume(struct super_block *sb, u32 flags)
+{
+ reiser4_volume *vol = super_volume(sb);
+ int ret;
+
+ ret = vol->vol_plug->balance_volume(sb, flags);
+ if (ret)
+ return ret;
+ reiser4_volume_clear_unbalanced(sb);
+ ret = capture_brick_super(get_meta_subvol());
+ if (ret)
+ return ret;
+ return force_commit_current_atom();
+}
+
+/**
+ * Pre-condition: exclusive access to the volume should be held
+ */
+static int reiser4_finish_removal(struct super_block *sb, reiser4_volume *vol)
+{
+ int ret;
+ reiser4_subvol *victim;
+
+ if (!reiser4_volume_has_incomplete_removal(sb))
+ return 0;
+
+ victim = vol->victim;
+ if (!victim)
+ goto cleanup;
+ if (reiser4_volume_is_unbalanced(sb)) {
+ /*
+ * Move out all data blocks from @victim to the
+ * remaining bricks. After balancing completion
+ * the @victim shoudn't contain busy data blocks,
+ * so we have to ignore immobile ststus of files
+ */
+ ret = vol->vol_plug->balance_volume(sb,
+ VBF_MIGRATE_ALL | VBF_CLR_IMMOBILE);
+ if (ret)
+ goto error;
+ reiser4_volume_clear_unbalanced(sb);
+ }
+ /*
+ * at this point volume must have two distribution configs:
+ * old and new ones
+ */
+ assert("edward-2258", vol->new_conf != NULL);
+
+ ret = vol->vol_plug->remove_brick_tail(vol, victim);
+ if (ret)
+ goto error;
+ assert("edward-2471", vol->new_conf == NULL);
+ cleanup:
+ assert("edward-2259", vol->victim == NULL);
+
+ if (victim && !is_meta_brick(victim))
+ /* Goodbye! */
+ reiser4_detach_brick(victim);
+
+ reiser4_volume_clear_incomplete_removal(sb);
+ ret = capture_brick_super(get_meta_subvol());
+ if (ret)
+ goto error;
+ ret = force_commit_current_atom();
+ if (ret)
+ goto error;
+ printk("reiser4 (%s): Removal completed.\n", sb->s_id);
+ return 0;
+ error:
+ reiser4_volume_set_incomplete_removal(sb);
+ warning("", "Failed to complete brick removal on %s.", sb->s_id);
+ return ret;
+}
+
+static int inode_set_immobile(struct inode *inode)
+{
+ if (reiser4_inode_get_flag(inode, REISER4_FILE_IMMOBILE))
+ return 0;
+ if (reserve_update_sd_common(inode))
+ return RETERR(-ENOSPC);
+
+ reiser4_inode_set_flag(inode, REISER4_FILE_IMMOBILE);
+ return reiser4_update_sd(inode);
+}
+
+int inode_clr_immobile(struct inode *inode)
+{
+ if (!reiser4_inode_get_flag(inode, REISER4_FILE_IMMOBILE))
+ return 0;
+ if (reserve_update_sd_common(inode))
+ return RETERR(-ENOSPC);
+
+ reiser4_inode_clr_flag(inode, REISER4_FILE_IMMOBILE);
+ return reiser4_update_sd(inode);
+}
+
+/**
+ * Pre-condition: brick_removal_sem should be down for read
+ */
+static int reiser4_migrate_file(struct file *file, u64 dst_idx)
+{
+ int ret;
+ struct inode *inode = file_inode(file);
+ struct super_block *sb = inode->i_sb;
+
+ /*
+ * We allow file migration on volumes with an incompletely removed brick
+ */
+ ret = super_vol_plug(sb)->migrate_file(inode, dst_idx);
+
+ if (ret == 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
+ reiser4_txn_restart_current();
+ grab_space_enable();
+ ret = reiser4_sync_file_common(file, 0, LONG_MAX,
+ 0 /* data and stat data */);
+ if (ret)
+ warning("edward-2463", "failed to sync file %llu",
+ (unsigned long long)get_inode_oid(inode));
+ }
+ return ret;
+}
+
+/**
+ * Reiser4 off-line volume operations (no FS is mounted).
+ * These don't spawn transactions and do not execute in a reiser4 context.
+ */
+int reiser4_offline_op(struct reiser4_vol_op_args *args)
+{
+ int ret;
+
+ switch(args->opcode) {
+ case REISER4_REGISTER_BRICK:
+ ret = reiser4_register_brick(args);
+ break;
+ case REISER4_UNREGISTER_BRICK:
+ ret = reiser4_unregister_brick(args);
+ break;
+ case REISER4_VOLUME_HEADER:
+ ret = reiser4_volume_header(args);
+ break;
+ case REISER4_BRICK_HEADER:
+ ret = reiser4_brick_header(args);
+ break;
+ default:
+ warning("", "Unsupported off-line volume operation %d",
+ args->opcode);
+ ret = -ENOTTY;
+ break;
+ }
+ return ret;
+}
+
+/**
+ * Reiser4 on-line volume operations (on mounted volumes).
+ * These spawn transactions and are performed in a reiser4 context.
+ */
+int reiser4_volume_op_dir(struct file *file, struct reiser4_vol_op_args *args)
+{
+ int ret;
+ struct super_block *sb = file_inode(file)->i_sb;
+ reiser4_volume *vol = super_volume(sb);
+
+ switch(args->opcode) {
+ case REISER4_PRINT_VOLUME:
+ if (!down_read_trylock(&vol->volume_sem))
+ goto busy;
+ ret = reiser4_print_volume(sb, args);
+ up_read(&vol->volume_sem);
+ break;
+ case REISER4_PRINT_BRICK:
+ if (!down_read_trylock(&vol->volume_sem))
+ goto busy;
+ ret = reiser4_print_brick(sb, args);
+ up_read(&vol->volume_sem);
+ break;
+ case REISER4_RESIZE_BRICK:
+ if (!down_write_trylock(&vol->volume_sem))
+ goto busy;
+ ret = reiser4_resize_brick(sb, args);
+ up_write(&vol->volume_sem);
+ break;
+ case REISER4_ADD_BRICK:
+ if (!down_write_trylock(&vol->volume_sem))
+ goto busy;
+ ret = reiser4_add_brick(sb, args, 0);
+ up_write(&vol->volume_sem);
+ break;
+ case REISER4_ADD_PROXY:
+ if (!down_write_trylock(&vol->volume_sem))
+ goto busy;
+ ret = reiser4_add_brick(sb, args, 1);
+ up_write(&vol->volume_sem);
+ break;
+ case REISER4_REMOVE_BRICK:
+ if (!down_write_trylock(&vol->volume_sem))
+ goto busy;
+ if (!down_write_trylock(&vol->brick_removal_sem)) {
+ up_write(&vol->volume_sem);
+ goto busy;
+ }
+ ret = reiser4_remove_brick(sb, args);
+ up_write(&vol->brick_removal_sem);
+ up_write(&vol->volume_sem);
+ break;
+ case REISER4_FINISH_REMOVAL:
+ down_write(&vol->volume_sem);
+ down_write(&vol->brick_removal_sem);
+ ret = reiser4_finish_removal(sb, vol);
+ up_write(&vol->brick_removal_sem);
+ up_write(&vol->volume_sem);
+ break;
+ case REISER4_SCALE_VOLUME:
+ if (!down_write_trylock(&vol->volume_sem))
+ goto busy;
+ ret = reiser4_scale_volume(sb, args);
+ up_write(&vol->volume_sem);
+ break;
+ case REISER4_BALANCE_VOLUME:
+ if (!down_read_trylock(&vol->volume_sem))
+ goto busy;
+ ret = reiser4_balance_volume(sb, 0);
+ up_read(&vol->volume_sem);
+ break;
+ case REISER4_RESTORE_REGULAR_DST:
+ if (!down_read_trylock(&vol->volume_sem))
+ goto busy;
+ ret = reiser4_balance_volume(sb,
+ VBF_MIGRATE_ALL | VBF_CLR_IMMOBILE);
+ up_read(&vol->volume_sem);
+ break;
+ default:
+ warning("edward-1950",
+ "%s: volume operation %d is unsupported by directories",
+ sb->s_id, args->opcode);
+ ret = RETERR(-ENOTTY);
+ break;
+ }
+ return ret;
+ busy:
+ warning("", "Operation %d failed: volume %s is busy",
+ args->opcode, sb->s_id);
+ return RETERR(-EBUSY);
+}
+
+int reiser4_volume_op_file(struct file *file, struct reiser4_vol_op_args *args)
+{
+ int ret;
+ struct super_block *sb = file_inode(file)->i_sb;
+ reiser4_volume *vol = super_volume(sb);
+
+ switch(args->opcode) {
+ case REISER4_MIGRATE_FILE:
+ /*
+ * make sure that bricks won't be evicted during file migration
+ */
+ down_read(&vol->brick_removal_sem);
+ ret = reiser4_migrate_file(file, args->s.brick_idx);
+ up_read(&vol->brick_removal_sem);
+ break;
+ case REISER4_SET_FILE_IMMOBILE:
+ ret = inode_set_immobile(file_inode(file));
+ break;
+ case REISER4_CLR_FILE_IMMOBILE:
+ ret = inode_clr_immobile(file_inode(file));
+ break;
+ default:
+ warning("edward-1952",
+ "%s: volume operation %d is unsupported by regular files",
+ sb->s_id, args->opcode);
+ ret = RETERR(-ENOTTY);
+ break;
+ }
+ return ret;
+}
+
+long reiser4_ioctl_volume(struct file *file,
+ unsigned int cmd, unsigned long arg,
+ int (*volume_op)(struct file *file,
+ struct reiser4_vol_op_args *args))
+{
+ int ret;
+ reiser4_context *ctx;
+
+ ctx = reiser4_init_context(file_inode(file)->i_sb);
+ if (IS_ERR(ctx))
+ return PTR_ERR(ctx);
+
+ switch (cmd) {
+ case REISER4_IOC_VOLUME: {
+ struct reiser4_vol_op_args *op_args;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return RETERR(-EPERM);
+
+ op_args = memdup_user((void __user *)arg, sizeof(*op_args));
+ if (IS_ERR(op_args))
+ return PTR_ERR(op_args);
+
+ ret = volume_op(file, op_args);
+ if (ret) {
+ warning("edward-1899",
+ "On-line volume operation failed (%d)", ret);
+ kfree(op_args);
+ break;
+ }
+ if (copy_to_user((struct reiser4_vol_op_args __user *)arg,
+ op_args, sizeof(*op_args)))
+ ret = RETERR(-EFAULT);
+ kfree(op_args);
+ break;
+ }
+ default:
+ ret = RETERR(-ENOTTY);
+ break;
+ }
+ reiser4_exit_context(ctx);
+ return ret;
+}
+
+/*
+ * Local variables:
+ * c-indentation-style: "K&R"
+ * mode-name: "LC"
+ * c-basic-offset: 8
+ * tab-width: 8
+ * fill-column: 80
+ * scroll-step: 1
+ * End:
+ */
diff -urN --no-dereference linux-5.10.2.orig/fs/reiser4/wander.c linux-5.10.2/fs/reiser4/wander.c
--- linux-5.10.2.orig/fs/reiser4/wander.c 1970-01-01 01:00:00.000000000 +0100
+++ linux-5.10.2/fs/reiser4/wander.c 2020-12-23 16:07:46.137813407 +0100
@@ -0,0 +1,2210 @@
+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
+ * reiser4/README */
+
+/* Reiser4 Wandering Log */
+
+/*
+ * Modified by Edward Shishkin to support Heterogeneous Logical Volumes
+ */
+
+/* You should read http://www.namesys.com/txn-doc.html
+
+ That describes how filesystem operations are performed as atomic
+ transactions, and how we try to arrange it so that we can write most of the
+ data only once while performing the operation atomically.
+
+ For the purposes of this code, it is enough to understand that it has been
+ told a given block should be written either once or twice (if twice, then
+ once to the wandered location and once to the real location).
+
+ This code guarantees that those blocks that are defined to be part of an
+ atom either all take effect or none of them take effect.
+
+ The "relocate set" of nodes are submitted to write by the jnode_flush()
+ routine, and the "overwrite set" is submitted by reiser4_write_log().
+ This is because with the overwrite set we seek to optimize writes, and
+ with the relocate set we seek to cause disk order to correlate with the
+ "parent first order" (preorder).
+
+ reiser4_write_log() allocates and writes wandered blocks and maintains
+ additional on-disk structures of the atom as wander records (each wander
+ record occupies one block) for storing of the "wandered map" (a table which
+ contains a relation between wandered and real block numbers) and other
+ information which might be needed at transaction recovery time.
+
+ The wander records are unidirectionally linked into a circle: each wander
+ record contains a block number of the next wander record, the last wander
+ record points to the first one.
+
+ One wander record (named "tx head" in this file) has a format which is
+ different from the other wander records. The "tx head" has a reference to the
+ "tx head" block of the previously committed atom. Also, "tx head" contains
+ fs information (the free blocks counter, and the oid allocator state) which
+ is logged in a special way.
+
+ There are two journal control blocks, named journal header and journal
+ footer which have fixed on-disk locations. The journal header has a
+ reference to the "tx head" block of the last committed atom. The journal
+ footer points to the "tx head" of the last flushed atom. The atom is
+ "played" when all blocks from its overwrite set are written to disk the
+ second time (i.e. written to their real locations).
+
+ NOTE: People who know reiserfs internals and its journal structure might be
+ confused with these terms journal footer and journal header. There is a table
+ with terms of similar semantics in reiserfs (reiser3) and reiser4:
+
+ REISER3 TERM | REISER4 TERM | DESCRIPTION
+ --------------------+-----------------------+----------------------------
+ commit record | journal header | atomic write of this record
+ | | ends transaction commit
+ --------------------+-----------------------+----------------------------
+ journal header | journal footer | atomic write of this record
+ | | ends post-commit writes.
+                      |                       | After this record is
+                      |                       | written successfully, the
+                      |                       | journal blocks (in reiser3)
+                      |                       | or wandered blocks/records
+                      |                       | (in reiser4) are free for
+                      |                       | re-use.
+ --------------------+-----------------------+----------------------------
+
+ The atom commit process is the following:
+
+ 1. The overwrite set is taken from atom's clean list, and its size is
+ counted.
+
+ 2. The number of necessary wander records (including tx head) is calculated,
+ and the wander record blocks are allocated.
+
+   3. Allocate wandered blocks and populate the wander records with the
+   wandered map.
+
+   4. Submit write requests for the wander records and the wandered blocks.
+
+   5. Wait until the submitted write requests complete.
+
+   6. Update the journal header: change the pointer to the block number of
+   the just written tx head, submit an i/o for the modified journal header
+   block and wait for i/o completion.
+
+ NOTE: The special logging for bitmap blocks and some reiser4 super block
+   fields makes the processes of atom commit, flush and recovery a bit more
+ complex (see comments in the source code for details).
+
+ The atom playing process is the following:
+
+ 1. Write atom's overwrite set in-place.
+
+ 2. Wait on i/o.
+
+   3. Update journal footer: change the pointer to the block number of the tx
+   head block of the atom we are currently flushing, submit an i/o, wait on
+   i/o completion.
+
+ 4. Free disk space which was used for wandered blocks and wander records.
+
+   After the freeing of wandered blocks and wander records the journal footer
+   points to an on-disk structure which might be overwritten soon. Neither the
+   log writer nor the journal recovery procedure uses that pointer for
+   accessing the data. When the journal recovery procedure looks for the
+   oldest transaction it compares the journal footer pointer value with the
+   "prev_tx" pointer value in each tx head; if the values are equal, the
+   oldest not yet flushed transaction has been found.
+
+   NOTE on disk space leakage: the information about which blocks and how many
+   of them are allocated for wandered blocks and wander records is not written
+   to the disk, because of the special logging for bitmaps and some super
+   block counters. After a system crash reiser4 does not remember those
+   allocations, thus there is no disk space leakage of this kind.
+*/
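+
+/*
+ * A rough sketch (not a literal on-disk layout) of the relations described
+ * above, for a transaction consisting of a tx head and two ordinary wander
+ * records:
+ *
+ *   journal header ---> tx head ---> wander record 1 ---> wander record 2
+ *                        |  ^                                    |
+ *                        |  +------------(next_block)------------+
+ *                        +--(prev_tx)--> tx head of the previously
+ *                                        committed atom
+ *
+ *   journal footer ---> tx head of the last flushed (played) atom
+ */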
+
+/* Special logging of reiser4 super block fields. */
+
+/* Some reiser4 super block fields (the free block count and the OID allocator
+   state (number of files and next free OID)) are logged separately from the
+   super block to avoid unnecessary atom fusion.
+
+   So, the reiser4 super block need not be captured by a transaction which
+   allocates/deallocates disk blocks or creates/deletes file objects. Moreover,
+ the reiser4 on-disk super block is not touched when such a transaction is
+ committed and flushed. Those "counters logged specially" are logged in "tx
+ head" blocks and in the journal footer block.
+
+ A step-by-step description of special logging:
+
+ 0. The per-atom information about deleted or created files and allocated or
+ freed blocks is collected during the transaction. The atom's
+ ->nr_objects_created and ->nr_objects_deleted are for object
+ deletion/creation tracking, the numbers of allocated and freed blocks are
+ calculated using atom's delete set and atom's capture list -- all new and
+ relocated nodes should be on atom's clean list and should have JNODE_RELOC
+ bit set.
+
+ 1. The "logged specially" reiser4 super block fields have their "committed"
+ versions in the reiser4 in-memory super block. They get modified only at
+   atom commit time. The atom's commit thread has exclusive access to those
+   "committed" fields because the log writer implementation supports only one
+   atom commit at a time (there is a per-fs "commit" mutex). At
+ that time "committed" counters are modified using per-atom information
+ collected during the transaction. These counters are stored on disk as a
+ part of tx head block when atom is committed.
+
+ 2. When the atom is flushed the value of the free block counter and the OID
+ allocator state get written to the journal footer block. A special journal
+ procedure (journal_recover_sb_data()) takes those values from the journal
+ footer and updates the reiser4 in-memory super block.
+
+ NOTE: That means free block count and OID allocator state are logged
+ separately from the reiser4 super block regardless of the fact that the
+ reiser4 super block has fields to store both the free block counter and the
+ OID allocator.
+
+   Writing the whole super block at commit time would require knowing the true
+   values of all its fields without the changes made by not yet committed
+   transactions. That is possible by keeping a "committed" version of the super
+   block, just as the reiser4 bitmap blocks have "committed" and "working"
+   versions. However, another scheme was implemented which stores the specially
+   logged values in the unused free space inside the transaction head block. In
+   my opinion it has the advantage of not writing the whole super block when
+   only part of it was modified. */
+
+#include "debug.h"
+#include "dformat.h"
+#include "txnmgr.h"
+#include "jnode.h"
+#include "znode.h"
+#include "block_alloc.h"
+#include "page_cache.h"
+#include "wander.h"
+#include "reiser4.h"
+#include "super.h"
+#include "vfs_ops.h"
+#include "writeout.h"
+#include "inode.h"
+#include "entd.h"
+#include "plugin/volume/volume.h"
+
+#include <linux/types.h>
+#include <linux/fs.h> /* for struct super_block */
+#include <linux/mm.h> /* for struct page */
+#include <linux/pagemap.h>
+#include <linux/bio.h> /* for struct bio */
+#include <linux/blkdev.h>
+
+static int write_jnodes_contig(jnode *, int, const reiser4_block_nr *,
+ flush_queue_t *, int, reiser4_subvol *);
+/*
+ * Per-logical-volume commit_handle.
+ * This contains infrastructure needed at atom commit time.
+ * See also the definition of the per-subvolume commit handle (commit_handle_subvol)
+ */
+struct commit_handle {
+ __u64 nr_files;
+ __u64 next_oid;
+ __u32 total_tx_size; /* total number of wander records */
+ __u32 total_overwrite_set_size;
+ reiser4_block_nr total_nr_bitmap;
+ txn_atom *atom; /* the atom which is being committed */
+ struct super_block *super; /* current super block */
+};
+
+static void init_ch_sub(reiser4_subvol *subv)
+{
+ struct commit_handle_subvol *ch_sub = &subv->ch;
+
+ assert("edward-1700", list_empty(&ch_sub->overwrite_set));
+ assert("edward-1701", list_empty(&ch_sub->tx_list));
+ assert("edward-1702", list_empty(&ch_sub->wander_map));
+
+ __init_ch_sub(ch_sub);
+ ch_sub->free_blocks = subv->blocks_free_committed;
+}
+
+static void init_commit_handle(struct commit_handle *ch, txn_atom *atom,
+ reiser4_subvol *subv)
+{
+ memset(ch, 0, sizeof(struct commit_handle));
+ ch->atom = atom;
+ ch->super = reiser4_get_current_sb();
+ ch->nr_files = get_current_super_private()->nr_files_committed;
+ ch->next_oid = oid_next(ch->super);
+ if (subv)
+ /*
+ * init ch of specified subvolume
+ */
+ init_ch_sub(subv);
+ else {
+ /*
+ * init ch of all subvolumes
+ */
+ struct rb_node *node;
+
+ for (node = rb_first(&atom->bricks_info);
+ node;
+ node = rb_next(node)) {
+
+ struct atom_brick_info *abi;
+ abi = rb_entry(node, struct atom_brick_info, node);
+
+ init_ch_sub(super_origin(ch->super, abi->brick_id));
+ }
+ }
+}
+
+#if REISER4_DEBUG
+static void done_ch_sub(reiser4_subvol *subv)
+{
+ struct commit_handle_subvol *ch_sub = &subv->ch;
+
+ assert("edward-1703", list_empty(&ch_sub->overwrite_set));
+ assert("edward-1704", list_empty(&ch_sub->tx_list));
+ assert("edward-1705", list_empty(&ch_sub->wander_map));
+}
+#endif
+
+static void done_commit_handle(struct commit_handle *ch, reiser4_subvol *subv)
+{
+#if REISER4_DEBUG
+ if (subv)
+ done_ch_sub(subv);
+ else {
+ struct rb_node *node;
+ txn_atom *atom = ch->atom;
+
+ for (node = rb_first(&atom->bricks_info);
+ node;
+ node = rb_next(node)) {
+ struct atom_brick_info *abi;
+ abi = rb_entry(node, struct atom_brick_info, node);
+
+ done_ch_sub(super_origin(ch->super, abi->brick_id));
+ }
+ }
+#endif
+}
+
+/* fill journal header block data */
+static void format_journal_header(struct commit_handle *ch,
+ unsigned subv_id)
+{
+ reiser4_subvol *subv;
+ struct journal_header *header;
+ jnode *txhead;
+
+ subv = super_origin(ch->super, subv_id);
+ assert("zam-480", subv->journal_header != NULL);
+
+ txhead = list_entry(subv->ch.tx_list.next, jnode, capture_link);
+
+ jload(subv->journal_header);
+
+ header = (struct journal_header *)jdata(subv->journal_header);
+ assert("zam-484", header != NULL);
+
+ put_unaligned(cpu_to_le64(*jnode_get_block(txhead)),
+ &header->last_committed_tx);
+
+ jrelse(subv->journal_header);
+}
+
+/* fill journal footer block data */
+static void format_journal_footer(struct commit_handle *ch,
+ reiser4_subvol *subv)
+{
+ struct journal_footer *footer;
+ jnode *tx_head;
+ struct commit_handle_subvol *ch_sub;
+
+ ch_sub = &subv->ch;
+
+ tx_head = list_entry(ch_sub->tx_list.next, jnode, capture_link);
+
+ assert("zam-494", subv->journal_header != NULL);
+
+ check_me("zam-691", jload(subv->journal_footer) == 0);
+
+ footer = (struct journal_footer *)jdata(subv->journal_footer);
+ assert("zam-495", footer != NULL);
+
+ put_unaligned(cpu_to_le64(*jnode_get_block(tx_head)),
+ &footer->last_flushed_tx);
+ put_unaligned(cpu_to_le64(ch_sub->free_blocks), &footer->free_blocks);
+
+ put_unaligned(cpu_to_le64(ch->nr_files), &footer->nr_files);
+ put_unaligned(cpu_to_le64(ch->next_oid), &footer->next_oid);
+
+ jrelse(subv->journal_footer);
+}
+
+/* wander record capacity depends on current block size */
+static int wander_record_capacity(const struct super_block *super)
+{
+ return (super->s_blocksize -
+ sizeof(struct wander_record_header)) /
+ sizeof(struct wander_entry);
+}
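+
+/*
+ * A worked example (illustrative only, assuming a 4096-byte block size and
+ * the struct layouts declared in wander.h with 8-byte d64 and 4-byte d32
+ * fields and no compiler padding):
+ *
+ *   sizeof(struct wander_record_header) = 8 + 8 + 4 + 4 + 8 = 32 bytes
+ *   sizeof(struct wander_entry)         = 8 + 8             = 16 bytes
+ *   capacity = (4096 - 32) / 16 = 254 wandered map entries per record
+ */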
+
+/*
+ * Fill the first wander record (tx head) in accordance with the supplied data
+ */
+static void format_tx_head(struct commit_handle *ch, unsigned subv_id)
+{
+ jnode *tx_head;
+ jnode *next;
+ struct tx_header *header;
+ struct commit_handle_subvol *ch_sub;
+ reiser4_subvol *subv;
+
+ subv = super_origin(ch->super, subv_id);
+ ch_sub = &subv->ch;
+
+ tx_head = list_entry(ch_sub->tx_list.next, jnode, capture_link);
+ assert("zam-692", &ch_sub->tx_list != &tx_head->capture_link);
+
+ next = list_entry(tx_head->capture_link.next, jnode, capture_link);
+ if (&ch_sub->tx_list == &next->capture_link)
+ next = tx_head;
+
+ header = (struct tx_header *)jdata(tx_head);
+
+ assert("zam-460", header != NULL);
+ assert("zam-462", ch->super->s_blocksize >= sizeof(struct tx_header));
+
+ memset(jdata(tx_head), 0, (size_t) ch->super->s_blocksize);
+ memcpy(jdata(tx_head), TX_HEADER_MAGIC, TX_HEADER_MAGIC_SIZE);
+
+ put_unaligned(cpu_to_le32(ch_sub->tx_size), &header->total);
+ put_unaligned(cpu_to_le64(subv->last_committed_tx), &header->prev_tx);
+ put_unaligned(cpu_to_le64(*jnode_get_block(next)), &header->next_block);
+ put_unaligned(cpu_to_le64(ch_sub->free_blocks), &header->free_blocks);
+ put_unaligned(cpu_to_le64(ch->nr_files), &header->nr_files);
+ put_unaligned(cpu_to_le64(ch->next_oid), &header->next_oid);
+}
+
+/*
+ * prepare ordinary wander record block (fill all service fields)
+ */
+static void format_wander_record(struct commit_handle *ch, unsigned subv_id,
+ jnode *node, __u32 serial)
+{
+ jnode *next;
+ struct wander_record_header *LRH;
+ struct commit_handle_subvol *ch_sub;
+
+ assert("zam-464", node != NULL);
+
+ ch_sub = &super_origin(ch->super, subv_id)->ch;
+
+ LRH = (struct wander_record_header *)jdata(node);
+ next = list_entry(node->capture_link.next, jnode, capture_link);
+
+ if (&ch_sub->tx_list == &next->capture_link)
+ next = list_entry(ch_sub->tx_list.next, jnode, capture_link);
+
+ assert("zam-465", LRH != NULL);
+ assert("zam-463",
+ ch->super->s_blocksize > sizeof(struct wander_record_header));
+
+ memset(jdata(node), 0, (size_t) ch->super->s_blocksize);
+ memcpy(jdata(node), WANDER_RECORD_MAGIC, WANDER_RECORD_MAGIC_SIZE);
+
+ put_unaligned(cpu_to_le32(ch_sub->tx_size), &LRH->total);
+ put_unaligned(cpu_to_le32(serial), &LRH->serial);
+ put_unaligned(cpu_to_le64(*jnode_get_block(next)), &LRH->next_block);
+}
+
+/**
+ * add one wandered map entry to formatted wander record
+ */
+static void store_entry(jnode *node, int index,
+ const reiser4_block_nr *a, const reiser4_block_nr *b)
+{
+ char *data;
+ struct wander_entry *pairs;
+
+ data = jdata(node);
+ assert("zam-451", data != NULL);
+
+ pairs =
+ (struct wander_entry *)(data + sizeof(struct wander_record_header));
+
+ put_unaligned(cpu_to_le64(*a), &pairs[index].original);
+ put_unaligned(cpu_to_le64(*b), &pairs[index].wandered);
+}
+
+/*
+ * Currently a wander record contains only the wandered map,
+ * whose size depends on the overwrite set size
+ */
+static void get_tx_size(struct commit_handle *ch)
+{
+ struct rb_node *node;
+ txn_atom *atom = ch->atom;
+
+ for (node = rb_first(&atom->bricks_info);
+ node;
+ node = rb_next(node)) {
+
+ struct atom_brick_info *abi;
+ struct commit_handle_subvol *ch_sub;
+
+ abi = rb_entry(node, struct atom_brick_info, node);
+ ch_sub = &super_origin(ch->super, abi->brick_id)->ch;
+
+ assert("zam-695", ch_sub->tx_size == 0);
+
+ if (ch_sub->overwrite_set_size == 0)
+ continue;
+ /*
+ * count all ordinary wander records
+ * (<overwrite_set_size> - 1) / <wander_record_capacity> + 1
+ * and add one for tx head block
+ */
+ ch_sub->tx_size =
+ (ch_sub->overwrite_set_size - 1)/
+ wander_record_capacity(ch->super) + 2;
+ ch->total_tx_size += ch_sub->tx_size;
+ }
+}
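+
+/*
+ * A purely illustrative example of the per-subvolume tx_size computation
+ * above: with an overwrite set of 1000 blocks and a wander record capacity
+ * of 254 entries (see the example after wander_record_capacity()),
+ *
+ *   tx_size = (1000 - 1) / 254 + 2 = 3 + 2 = 5,
+ *
+ * i.e. one tx head plus four ordinary wander records (4 * 254 = 1016 >= 1000).
+ */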
+
+/*
+ * A special structure for use in store_wmap_actor()
+ * to save its state between calls
+ */
+struct store_wmap_params {
+ jnode *cur; /* jnode of current wander record to fill */
+ int idx; /* free element index in wander record */
+	int capacity;	/* capacity of a single wander record, in entries */
+#if REISER4_DEBUG
+ struct list_head *tx_list;
+#endif
+};
+
+/*
+ * An actor for use in the blocknr_set_iterator() routine;
+ * it populates the list of pre-formatted wander
+ * records with wandered map info
+ */
+static int store_wmap_actor(txn_atom *atom UNUSED_ARG,
+ const reiser4_block_nr *a,
+ const reiser4_block_nr *b,
+ __u32 subv_id, void *data)
+{
+ struct store_wmap_params *params = data;
+
+ if (params->idx >= params->capacity) {
+ /*
+ * a new wander record should be taken from the tx_list
+ */
+ params->cur = list_entry(params->cur->capture_link.next,
+ jnode, capture_link);
+ assert("zam-454",
+ params->tx_list != &params->cur->capture_link);
+
+ params->idx = 0;
+ }
+ store_entry(params->cur, params->idx, a, b);
+ params->idx++;
+
+ return 0;
+}
+
+/**
+ * This function is called after the Relocate set has been written to disk, the
+ * Overwrite set has been written to the wandered locations and all wander
+ * records have been written as well. The updated journal header block contains
+ * a pointer (block number) to the first wander record of the just written
+ * transaction
+ */
+static int update_journal_header(struct commit_handle *ch, u32 subv_id)
+{
+ int ret;
+ reiser4_subvol *subv = super_origin(ch->super, subv_id);
+ jnode *jh = subv->journal_header;
+ jnode *head = list_entry(subv->ch.tx_list.next, jnode, capture_link);
+
+ format_journal_header(ch, subv_id);
+
+ ret = write_jnodes_contig(jh, 1, jnode_get_block(jh), NULL,
+ WRITEOUT_FLUSH_FUA, subv);
+ if (ret)
+ return ret;
+
+ ret = jwait_io(jh, WRITE);
+ if (ret)
+ return ret;
+
+ subv->last_committed_tx = *jnode_get_block(head);
+ return 0;
+}
+
+/**
+ * This function is called after write-back is finished. We update the journal
+ * footer block and free the blocks which were occupied by wandered blocks and
+ * by the transaction's wander records
+ */
+static int update_journal_footer(struct commit_handle *ch, reiser4_subvol *subv)
+{
+ int ret;
+ jnode *jf = subv->journal_footer;
+
+ format_journal_footer(ch, subv);
+
+ ret = write_jnodes_contig(jf, 1, jnode_get_block(jf), NULL,
+ WRITEOUT_FLUSH_FUA, subv);
+ if (ret)
+ return ret;
+
+ ret = jwait_io(jf, WRITE);
+ if (ret)
+ return ret;
+ return 0;
+}
+
+/*
+ * free blocks occupied by wander records of a transaction already written in place
+ */
+static void dealloc_tx_list(struct commit_handle *ch)
+{
+ struct rb_node *node;
+ txn_atom *atom = ch->atom;
+
+ for (node = rb_first(&atom->bricks_info);
+ node;
+ node = rb_next(node)) {
+
+ reiser4_subvol *subv;
+ struct atom_brick_info *abi;
+ struct commit_handle_subvol *ch_sub;
+
+ abi = rb_entry(node, struct atom_brick_info, node);
+ subv = current_origin(abi->brick_id);
+ ch_sub = &subv->ch;
+
+ while (!list_empty(&ch_sub->tx_list)) {
+ jnode *cur = list_entry(ch_sub->tx_list.next,
+ jnode,
+ capture_link);
+
+ list_del(&cur->capture_link);
+ ON_DEBUG(INIT_LIST_HEAD(&cur->capture_link));
+ reiser4_dealloc_block(jnode_get_block(cur), 0,
+ BA_DEFER | BA_FORMATTED, subv);
+ unpin_jnode_data(cur);
+ reiser4_drop_io_head(cur);
+ }
+ }
+}
+
+/*
+ * An actor for use in the blocknr_set_iterator() routine; it frees wandered
+ * blocks of the atom's overwrite set
+ */
+static int dealloc_wmap_actor(txn_atom *atom UNUSED_ARG,
+ const reiser4_block_nr *a UNUSED_ARG,
+ const reiser4_block_nr *b, __u32 subv_id,
+ void *data UNUSED_ARG)
+{
+ assert("zam-499", b != NULL);
+ assert("zam-500", *b != 0);
+ assert("zam-501", !reiser4_blocknr_is_fake(b));
+
+ reiser4_dealloc_block(b, 0, BA_DEFER | BA_FORMATTED,
+ current_origin(subv_id));
+ return 0;
+}
+
+/**
+ * Free wandered block locations.
+ * Pre-condition: Transaction has been played (that is, all blocks
+ * from the OVERWRITE set were overwritten successfully).
+ */
+static void dealloc_wmap(struct commit_handle *ch)
+{
+ struct rb_node *node;
+ txn_atom *atom = ch->atom;
+
+ assert("zam-696", ch->atom != NULL);
+
+ for (node = rb_first(&atom->bricks_info);
+ node;
+ node = rb_next(node)) {
+
+ struct atom_brick_info *abi;
+ struct commit_handle_subvol *ch_sub;
+
+ abi = rb_entry(node, struct atom_brick_info, node);
+ ch_sub = &super_origin(ch->super, abi->brick_id)->ch;
+
+ blocknr_set_iterator(atom,
+ &ch_sub->wander_map,
+ dealloc_wmap_actor, NULL, 1,
+ abi->brick_id);
+ }
+}
+
+static int alloc_wander_blocks(int count, reiser4_block_nr *start, int *len,
+ reiser4_subvol *subv)
+{
+ int ret;
+ reiser4_blocknr_hint hint;
+ reiser4_block_nr wide_len = count;
+
+ /* FIXME-ZAM: A special policy needed for allocation of wandered blocks
+ ZAM-FIXME-HANS: yes, what happened to our discussion of using a fixed
+ reserved allocation area so as to get the best qualities of fixed
+ journals? */
+ reiser4_blocknr_hint_init(&hint);
+ hint.block_stage = BLOCK_GRABBED;
+
+ ret = reiser4_alloc_blocks(&hint, start, &wide_len,
+ BA_FORMATTED | BA_USE_DEFAULT_SEARCH_START,
+ subv);
+ *len = (int)wide_len;
+ return ret;
+}
+
+/*
+ * roll back changes made before issuing BIO in the case of IO error.
+ */
+static void undo_bio(struct bio *bio)
+{
+ struct bio_vec *bvec;
+ struct bvec_iter_all iter_all;
+
+ bio_for_each_segment_all(bvec, bio, iter_all) {
+ struct page *pg;
+ jnode *node;
+
+ pg = bvec->bv_page;
+ end_page_writeback(pg);
+ node = jprivate(pg);
+ spin_lock_jnode(node);
+ JF_CLR(node, JNODE_WRITEBACK);
+ JF_SET(node, JNODE_DIRTY);
+ spin_unlock_jnode(node);
+ }
+ bio_put(bio);
+}
+
+/**
+ * release resources acquired in get_overwrite_set()
+ */
+static void put_overwrite_set(struct commit_handle *ch)
+{
+ struct rb_node *node;
+ txn_atom *atom = ch->atom;
+
+ for (node = rb_first(&atom->bricks_info);
+ node;
+ node = rb_next(node)) {
+
+ jnode *cur;
+ struct atom_brick_info *abi;
+ struct commit_handle_subvol *ch_sub;
+
+ abi = rb_entry(node, struct atom_brick_info, node);
+ ch_sub = &super_origin(ch->super, abi->brick_id)->ch;
+
+ list_for_each_entry(cur, &ch_sub->overwrite_set, capture_link)
+ jrelse_tail(cur);
+ reiser4_invalidate_list(&ch_sub->overwrite_set);
+ }
+}
+
+void check_overwrite_set_subv(reiser4_subvol *subv)
+{
+ jnode *cur;
+
+ list_for_each_entry(cur, &subv->ch.overwrite_set, capture_link)
+ assert("edward-1706", cur->subvol == subv);
+}
+
+void check_overwrite_set(txn_atom *atom)
+{
+ struct rb_node *node;
+
+ for (node = rb_first(&atom->bricks_info);
+ node;
+ node = rb_next(node)) {
+
+ struct atom_brick_info *abi;
+ abi = rb_entry(node, struct atom_brick_info, node);
+
+ check_overwrite_set_subv(current_origin(abi->brick_id));
+ }
+}
+
+/*
+ * Scan atom's overwrite set and do the following:
+ * . move every jnode to the overwrite set of the respective subvolume;
+ * . count the total number of nodes in all overwrite sets;
+ * . grab disk space for wandered block allocation;
+ * . count bitmap and other non-leaf nodes whose wandered block
+ * allocation we have to grab space for.
+ */
+int get_overwrite_set(struct commit_handle *ch)
+{
+ int ret;
+ jnode *cur;
+ struct list_head *overw_set;
+ s64 rest_flush_reserved;
+#if REISER4_DEBUG
+ u64 nr_formatted_leaves = 0;
+ u64 nr_unformatted_leaves = 0;
+#endif
+ overw_set = ATOM_OVRWR_LIST(ch->atom);
+ cur = list_entry(overw_set->next, jnode, capture_link);
+
+ while (!list_empty(overw_set)) {
+ jnode *next;
+ struct reiser4_subvol *subv;
+ struct commit_handle_subvol *ch_sub;
+ struct list_head *subv_overw_set;
+
+ next = list_entry(cur->capture_link.next, jnode, capture_link);
+ subv = cur->subvol;
+ ch_sub = &subv->ch;
+ subv_overw_set = &ch_sub->overwrite_set;
+ /*
+		 * Count bitmap blocks to get correct statistics on how many
+		 * blocks were cleared by the transaction commit
+ */
+ if (jnode_get_type(cur) == JNODE_BITMAP) {
+ ch_sub->nr_bitmap++;
+ ch->total_nr_bitmap++;
+ }
+ assert("zam-939", JF_ISSET(cur, JNODE_OVRWR) ||
+ jnode_get_type(cur) == JNODE_BITMAP);
+
+ if (jnode_is_znode(cur) && znode_above_root(JZNODE(cur))) {
+ /*
+			 * This is a super-block captured in rare events (like
+			 * the final commit at the end of a mount session, see
+			 * release_format40()->capture_brick_super(); also
+			 * see comments at reiser4_journal_recover_sb_data()).
+			 *
+			 * We replace the fake znode with another (real) znode
+			 * suggested by the disk_layout plugin
+ */
+ struct super_block *s = reiser4_get_current_sb();
+
+ if (subv->df_plug->log_super) {
+ jnode *sj;
+
+ sj = subv->df_plug->log_super(s, subv);
+ assert("zam-593", sj != NULL);
+
+ if (IS_ERR(sj))
+ return PTR_ERR(sj);
+
+ spin_lock_jnode(sj);
+ JF_SET(sj, JNODE_OVRWR);
+ /*
+ * put the new jnode right to overwrite
+ * set of respective subvolume
+ */
+ insert_into_subv_ovrwr_list(subv, sj, ch->atom);
+ spin_unlock_jnode(sj);
+ jload_gfp(sj, reiser4_ctx_gfp_mask_get(), 0);
+
+ ch_sub->overwrite_set_size++;
+ ch->total_overwrite_set_size++;
+ }
+ spin_lock_jnode(cur);
+ reiser4_uncapture_block(cur);
+ jput(cur);
+
+ } else {
+ int ret;
+ ch_sub->overwrite_set_size++;
+ ch->total_overwrite_set_size++;
+ /*
+ * move jnode to the overwrite list of
+ * respective subvolume
+ */
+ list_move(&cur->capture_link, subv_overw_set);
+ ret = jload_gfp(cur, reiser4_ctx_gfp_mask_get(), 0);
+ if (ret)
+ reiser4_panic("zam-783",
+ "cannot load jnode (ret = %d)\n",
+ ret);
+ }
+ /*
+		 * Count non-leaf nodes here because we have to grab disk space
+		 * for their wandered blocks. They were not counted as "flush
+		 * reserved". Counting should be done _after_ nodes are pinned
+		 * into memory by jload().
+ */
+ if (!jnode_is_leaf(cur)) {
+ /*
+			 * Grab space for writing (wandered blocks)
+			 * of non-leaf nodes found in the overwrite set
+ */
+ ret = reiser4_grab_space_force(1, BA_RESERVED,
+ jnode_get_subvol(cur));
+ if (ret)
+ return ret;
+ }
+ else {
+#if REISER4_DEBUG
+ if (jnode_is_znode(cur))
+ nr_formatted_leaves++;
+ else
+ nr_unformatted_leaves++;
+#endif
+ JF_CLR(cur, JNODE_FLUSH_RESERVED);
+ }
+ cur = next;
+ }
+ /*
+ * All needed disk space reserved for allocation of wandered
+ * blocks of leaf nodes ("flush reserved") has changed its status
+ * to "used". Return the rest to "grabbed" to be released later.
+ */
+ rest_flush_reserved = all_flush_reserved2grabbed(ch->atom);
+ if (rest_flush_reserved < 0)
+ return rest_flush_reserved;
+
+ assert("zam-940",
+ nr_formatted_leaves + nr_unformatted_leaves <=
+ rest_flush_reserved);
+
+ check_overwrite_set(ch->atom);
+ return ch->total_overwrite_set_size;
+}
+
+/**
+ * write_jnodes_contig - submit write request.
+ * @first: first jnode of the list
+ * @nr: number of jnodes on the list
+ * @block_p: starting disk block number of the region to write to
+ * @fq: flush queue to attach the bios to, or NULL
+ * @flags: used to decide whether page is to get PG_reclaim flag
+ * @subv: subvolume (original or replica) to write to; NULL means the
+ *        subvolume of @first
+ *
+ * Submits a write request for @nr jnodes beginning from the @first, other
+ * jnodes are after the @first on the double-linked "capture" list. All jnodes
+ * will be written to the disk region of @nr blocks starting with @block_p block
+ * number. If @fq is not NULL it means that waiting for i/o completion will be
+ * done more efficiently by using flush_queue_t objects.
+ * This function is the one which writes a list of jnodes in batch mode. It does
+ * all the low-level work such as bio construction and page state manipulation.
+ *
+ * ZAM-FIXME-HANS: brief me on why this function exists, and why bios are
+ * aggregated in this function instead of being left to the layers below
+ *
+ * FIXME: ZAM->HANS: What layer are you talking about? Can you point me to that?
+ * Why that layer needed? Why BIOs cannot be constructed here?
+ */
+static int write_jnodes_contig(jnode *first, int nr,
+ const reiser4_block_nr *block_p,
+ flush_queue_t *fq, int flags,
+ reiser4_subvol *subv)
+{
+ struct super_block *super = reiser4_get_current_sb();
+ int op_flags = (flags & WRITEOUT_FLUSH_FUA) ? REQ_PREFLUSH | REQ_FUA : 0;
+ jnode *cur = first;
+ reiser4_block_nr block;
+
+ assert("zam-571", first != NULL);
+ assert("zam-572", block_p != NULL);
+ assert("zam-570", nr > 0);
+
+ if (subv == NULL)
+ subv = first->subvol;
+ block = *block_p;
+
+ while (nr > 0) {
+ struct bio *bio;
+ int nr_blocks = min(nr, BIO_MAX_PAGES);
+ int i;
+ int nr_used;
+
+ bio = bio_alloc(GFP_NOIO, nr_blocks);
+ if (!bio)
+ return RETERR(-ENOMEM);
+
+ bio_set_dev(bio, subv->bdev);
+ bio->bi_iter.bi_sector = block * (super->s_blocksize >> 9);
+ for (nr_used = 0, i = 0; i < nr_blocks; i++) {
+ struct page *pg;
+
+ pg = jnode_page(cur);
+ assert("zam-573", pg != NULL);
+
+ get_page(pg);
+
+ lock_and_wait_page_writeback(pg);
+
+ if (!bio_add_page(bio, pg, super->s_blocksize, 0)) {
+ /*
+				 * underlying device is saturated. Stop adding
+ * pages to the bio.
+ */
+ unlock_page(pg);
+ put_page(pg);
+ break;
+ }
+
+ spin_lock_jnode(cur);
+ assert("nikita-3166",
+ pg->mapping == jnode_get_mapping(cur));
+ assert("zam-912", !JF_ISSET(cur, JNODE_WRITEBACK));
+#if REISER4_DEBUG
+ spin_lock(&cur->load);
+ assert("nikita-3165",
+ ergo(is_origin(subv), !jnode_is_releasable(cur)));
+ spin_unlock(&cur->load);
+#endif
+ JF_SET(cur, JNODE_WRITEBACK);
+ JF_CLR(cur, JNODE_DIRTY);
+ ON_DEBUG(cur->written++);
+
+ assert("edward-1647",
+ ergo(jnode_is_znode(cur), JF_ISSET(cur, JNODE_PARSED)));
+ spin_unlock_jnode(cur);
+ /*
+ * update checksum
+ */
+ if (jnode_is_znode(cur) && is_origin(subv)) {
+ zload(JZNODE(cur));
+ if (node_plugin_by_node(JZNODE(cur))->csum)
+ node_plugin_by_node(JZNODE(cur))->csum(JZNODE(cur), 0);
+ zrelse(JZNODE(cur));
+ }
+ ClearPageError(pg);
+ set_page_writeback(pg);
+
+ if (get_current_context()->entd) {
+ /* this is ent thread */
+ entd_context *ent = get_entd_context(super);
+ struct wbq *rq, *next;
+
+ spin_lock(&ent->guard);
+
+ if (pg == ent->cur_request->page) {
+ /*
+ * entd is called for this page. This
+				 * request is not in the todo list
+ */
+ ent->cur_request->written = 1;
+ } else {
+ /*
+				 * if we have written a page for which writepage
+				 * was called, move the request to another list.
+ */
+ list_for_each_entry_safe(rq, next, &ent->todo_list, link) {
+ assert("", rq->magic == WBQ_MAGIC);
+ if (pg == rq->page) {
+ /*
+ * remove request from
+ * entd's queue, but do
+ * not wake up a thread
+ * which put this
+ * request
+ */
+ list_del_init(&rq->link);
+ ent->nr_todo_reqs --;
+ list_add_tail(&rq->link, &ent->done_list);
+ ent->nr_done_reqs ++;
+ rq->written = 1;
+ break;
+ }
+ }
+ }
+ spin_unlock(&ent->guard);
+ }
+
+ clear_page_dirty_for_io(pg);
+
+ unlock_page(pg);
+
+ cur = list_entry(cur->capture_link.next, jnode, capture_link);
+ nr_used++;
+ }
+ if (nr_used > 0) {
+ assert("nikita-3453",
+ bio->bi_iter.bi_size == super->s_blocksize * nr_used);
+
+ /* Check if we are allowed to write at all */
+ if (sb_rdonly(super))
+ undo_bio(bio);
+ else {
+ add_fq_to_bio(fq, bio);
+ bio_get(bio);
+ bio_set_op_attrs(bio, WRITE, op_flags);
+ submit_bio(bio);
+ bio_put(bio);
+ }
+
+ block += nr_used - 1;
+ if (is_origin(subv))
+ update_blocknr_hint_default(super, subv, &block);
+ block += 1;
+ } else {
+ bio_put(bio);
+ }
+ nr -= nr_used;
+ }
+
+ return 0;
+}
+
+/**
+ * Submit a list of jnodes against the specified subvolume @subv (it can be
+ * an original subvolume, or a replica).
+ * This procedure recovers extents (contiguous sequences of disk block
+ * numbers) in a given list of jnodes and submits write requests on this
+ * per-extent basis.
+ *
+ * @head: the list of jnodes to submit
+ */
+int write_jnode_list_subv(struct list_head *head, flush_queue_t *fq,
+ long *nr_submitted, int flags, reiser4_subvol *subv)
+{
+ int ret;
+ struct list_head *beg = head->next;
+
+ while (head != beg) {
+ int nr = 1;
+ struct list_head *cur = beg->next;
+
+ while (head != cur) {
+ assert("edward-1707",
+ jnode_by_link(beg)->subvol ==
+ jnode_by_link(cur)->subvol);
+
+ if (*jnode_get_block(jnode_by_link(cur)) !=
+ *jnode_get_block(jnode_by_link(beg)) + nr)
+ break;
+ ++nr;
+ cur = cur->next;
+ }
+ ret = write_jnodes_contig(jnode_by_link(beg), nr,
+ jnode_get_block(jnode_by_link(beg)),
+ fq, flags, subv);
+ if (ret)
+ return ret;
+ if (nr_submitted)
+ *nr_submitted += nr;
+ beg = cur;
+ }
+ return 0;
+}
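+
+/*
+ * A hypothetical example of the extent recovery performed by
+ * write_jnode_list_subv() above: for a list of jnodes whose block numbers are
+ * 100, 101, 102, 200, 201 it issues two calls to write_jnodes_contig(), one
+ * for the extent starting at block 100 with 3 jnodes and one for the extent
+ * starting at block 200 with 2 jnodes, so that each contiguous run of block
+ * numbers is submitted as a single request.
+ */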
+
+/**
+ * Submit a list of jnodes.
+ * Every jnode is submitted against its original subvolume and all its
+ * replicas.
+ * This procedure recovers extents (contiguous sequences of disk block
+ * numbers) in a given list of jnodes and submits write requests on this
+ * per-extent basis.
+ *
+ * @head: list of jnodes to submit.
+ */
+int write_jnode_list(struct list_head *head, flush_queue_t *fq,
+ long *nr_submitted, int flags)
+{
+ int ret;
+ struct list_head *beg = head->next;
+
+ while (head != beg) {
+ int nr = 1;
+ u32 mirr_id;
+ struct list_head *cur = beg->next;
+ reiser4_subvol *subv = jnode_get_subvol(jnode_by_link(beg));
+
+ while (head != cur) {
+ if (jnode_get_subvol(jnode_by_link(cur)) !=
+ jnode_get_subvol(jnode_by_link(beg)))
+ break;
+ if (*jnode_get_block(jnode_by_link(cur)) !=
+ *jnode_get_block(jnode_by_link(beg)) + nr)
+ break;
+ ++nr;
+ cur = cur->next;
+ }
+ /*
+ * submit recovered extent against original subvolume
+ * and all its replicas
+ */
+ for_each_mirror(subv->id, mirr_id) {
+ reiser4_subvol *mirror;
+
+ mirror = current_mirror(subv->id, mirr_id);
+
+ ret = write_jnodes_contig(jnode_by_link(beg), nr,
+ jnode_get_block(jnode_by_link(beg)),
+ fq, flags, mirror);
+ if (ret)
+ return ret;
+ }
+#if 0
+ notice("edward-1875",
+ "subvol %llu: written extent (%llu, %llu)",
+ (unsigned long long)subv->id,
+ (unsigned long long)*jnode_get_block(jnode_by_link(beg)),
+ (unsigned long long)nr);
+#endif
+ if (nr_submitted)
+ *nr_submitted += nr;
+ beg = cur;
+ }
+ return 0;
+}
+
+/*
+ * add given wandered mapping to atom's wandered map
+ */
+static int add_region_to_wmap(jnode *cur, int len,
+ const reiser4_block_nr *block_p,
+ reiser4_subvol *subv)
+{
+ int ret;
+ blocknr_set_entry *new_bsep = NULL;
+ reiser4_block_nr block;
+
+ txn_atom *atom;
+
+ assert("zam-568", block_p != NULL);
+ block = *block_p;
+ assert("zam-569", len > 0);
+
+ while ((len--) > 0) {
+ do {
+ atom = get_current_atom_locked();
+ assert("zam-536",
+ !reiser4_blocknr_is_fake(jnode_get_block(cur)));
+
+ ret = blocknr_set_add_pair(atom,
+ &subv->ch.wander_map,
+ &new_bsep,
+ jnode_get_block(cur),
+ &block, subv->id);
+ } while (ret == -E_REPEAT);
+
+ if (ret) {
+ /*
+ * deallocate blocks which were not added
+ * to wandered map
+ */
+ reiser4_block_nr wide_len = len;
+
+ reiser4_dealloc_blocks(&block, &wide_len,
+ BLOCK_NOT_COUNTED,
+ BA_FORMATTED, /* formatted,
+ without defer */
+ subv);
+ return ret;
+ }
+ spin_unlock_atom(atom);
+
+ cur = list_entry(cur->capture_link.next, jnode, capture_link);
+ ++block;
+ }
+ return 0;
+}
+
+/**
+ * Allocate temporary ("wandering") disk addresses for the specified OVERWRITE
+ * set and immediately submit IOs for them.
+ * We assume that the current atom is in a stage where atom fusion is
+ * impossible, so it is safe to work with the atom unlocked.
+ */
+static int alloc_submit_wander_blocks(struct commit_handle *ch,
+ unsigned subv_id, flush_queue_t *fq)
+{
+ reiser4_block_nr block;
+ int rest;
+ int len;
+ int ret;
+ jnode *cur;
+ reiser4_subvol *subv = super_origin(ch->super, subv_id);
+ struct list_head *overw_set = &subv->ch.overwrite_set;
+
+ rest = subv->ch.overwrite_set_size;
+
+ assert("zam-534", rest > 0);
+
+ cur = list_entry(overw_set->next, jnode, capture_link);
+
+ while (overw_set != &cur->capture_link) {
+ assert("zam-567", JF_ISSET(cur, JNODE_OVRWR));
+
+ ret = alloc_wander_blocks(rest, &block, &len, subv);
+ if (ret)
+ return ret;
+
+ rest -= len;
+
+ ret = add_region_to_wmap(cur, len, &block, subv);
+ if (ret)
+ return ret;
+
+ ret = write_jnodes_contig(cur, len, &block, fq, 0, subv);
+ if (ret)
+ return ret;
+
+ while ((len--) > 0) {
+ assert("zam-604", overw_set != &cur->capture_link);
+ cur = list_entry(cur->capture_link.next,
+ jnode, capture_link);
+ }
+ }
+ return 0;
+}
+
+/*
+ * Allocate the given number of wander record blocks over the journal area,
+ * link their jnodes into the subvolume's tx_list, then format and submit them
+ */
+static int alloc_submit_wander_records(struct commit_handle *ch,
+ unsigned subv_id, flush_queue_t *fq)
+{
+ reiser4_blocknr_hint hint;
+ reiser4_block_nr allocated = 0;
+ reiser4_block_nr first, len;
+ jnode *cur;
+ jnode *txhead;
+ int ret;
+ reiser4_context *ctx;
+ reiser4_subvol *subv = super_origin(ch->super, subv_id);
+ struct commit_handle_subvol *ch_sub = &subv->ch;
+ struct list_head *tx_list = &ch_sub->tx_list;
+ int tx_size = ch_sub->tx_size;
+
+ assert("zam-698", tx_size > 0);
+ assert("zam-699", list_empty_careful(tx_list));
+
+ ctx = get_current_context();
+
+ while (allocated < (unsigned)tx_size) {
+ len = tx_size - allocated;
+
+ reiser4_blocknr_hint_init(&hint);
+
+ hint.block_stage = BLOCK_GRABBED;
+
+ /* FIXME: there should be some block allocation policy for
+ nodes which contain wander records */
+
+ /* We assume that disk space for wandered record blocks can be
+ * taken from reserved area. */
+ ret = reiser4_alloc_blocks(&hint, &first, &len,
+ BA_FORMATTED | BA_RESERVED |
+ BA_USE_DEFAULT_SEARCH_START,
+ subv);
+ reiser4_blocknr_hint_done(&hint);
+ if (ret)
+ return ret;
+
+ allocated += len;
+
+ /* create jnodes for all wander records */
+ while (len--) {
+ cur = reiser4_alloc_io_head(&first, subv);
+
+ if (cur == NULL) {
+ ret = RETERR(-ENOMEM);
+ goto free_not_assigned;
+ }
+
+ ret = jinit_new(cur, reiser4_ctx_gfp_mask_get());
+
+ if (ret != 0) {
+ jfree(cur);
+ goto free_not_assigned;
+ }
+
+ pin_jnode_data(cur);
+
+ list_add_tail(&cur->capture_link, tx_list);
+
+ first++;
+ }
+ }
+
+	{ /* format an on-disk linked list of wander records */
+ int serial = 1;
+
+ txhead = list_entry(tx_list->next, jnode, capture_link);
+ format_tx_head(ch, subv_id);
+
+ cur = list_entry(txhead->capture_link.next, jnode, capture_link);
+ while (tx_list != &cur->capture_link) {
+ format_wander_record(ch, subv_id, cur, serial++);
+ cur = list_entry(cur->capture_link.next, jnode, capture_link);
+ }
+ }
+
+ { /* Fill wander records with Wandered Set */
+ struct store_wmap_params params;
+ txn_atom *atom;
+
+ params.cur = list_entry(txhead->capture_link.next, jnode, capture_link);
+
+ params.idx = 0;
+ params.capacity =
+ wander_record_capacity(reiser4_get_current_sb());
+
+ atom = get_current_atom_locked();
+ blocknr_set_iterator(atom,
+ &ch_sub->wander_map,
+ &store_wmap_actor, &params, 0, subv_id);
+ spin_unlock_atom(atom);
+ }
+
+	{ /* jrelse all jnodes from tx_list */
+ cur = list_entry(tx_list->next, jnode, capture_link);
+ while (tx_list != &cur->capture_link) {
+ jrelse(cur);
+ cur = list_entry(cur->capture_link.next, jnode, capture_link);
+ }
+ }
+ /*
+ * submit wander records
+ */
+ ret = write_jnode_list_subv(tx_list, fq, NULL, 0, subv);
+
+ return ret;
+
+ free_not_assigned:
+ /*
+ * We deallocate blocks not yet assigned to jnodes on tx_list.
+	 * The caller takes care of invalidating the tx_list
+ */
+ reiser4_dealloc_blocks(&first, &len,
+ BLOCK_NOT_COUNTED, BA_FORMATTED, subv);
+ return ret;
+}
+
+static int commit_tx_subv(struct commit_handle *ch, u32 subv_id)
+{
+ int ret;
+ flush_queue_t *fq;
+ reiser4_subvol *subv = super_origin(ch->super, subv_id);
+ struct commit_handle_subvol *ch_sub = &subv->ch;
+ /*
+ * Grab more space for wandered records
+ */
+ ret = reiser4_grab_space_force((__u64)(ch_sub->tx_size),
+ BA_RESERVED, subv);
+ if (ret)
+ return ret;
+
+ fq = get_fq_for_current_atom();
+ if (IS_ERR(fq))
+ return PTR_ERR(fq);
+
+ spin_unlock_atom(fq->atom);
+
+ ret = alloc_submit_wander_blocks(ch, subv_id, fq);
+ if (ret)
+ goto exit;
+ ret = alloc_submit_wander_records(ch, subv_id, fq);
+ exit:
+ reiser4_fq_put(fq);
+ return ret;
+}
+
+static int commit_tx(struct commit_handle *ch)
+{
+ int ret;
+
+ txn_atom *atom = ch->atom;
+ struct rb_node *node;
+
+ for (node = rb_first(&atom->bricks_info);
+ node;
+ node = rb_next(node)) {
+
+ struct atom_brick_info *abi;
+ struct commit_handle_subvol *ch_sub;
+
+ abi = rb_entry(node, struct atom_brick_info, node);
+ ch_sub = &current_origin(abi->brick_id)->ch;
+
+ if (ch_sub->overwrite_set_size == 0)
+ continue;
+
+ ret = commit_tx_subv(ch, abi->brick_id);
+ if (ret)
+ return ret;
+ }
+ ret = current_atom_finish_all_fq();
+ if (ret)
+ return ret;
+
+ for (node = rb_first(&atom->bricks_info);
+ node;
+ node = rb_next(node)) {
+
+ struct atom_brick_info *abi;
+ struct commit_handle_subvol *ch_sub;
+
+ abi = rb_entry(node, struct atom_brick_info, node);
+ ch_sub = &current_origin(abi->brick_id)->ch;
+
+ if (ch_sub->overwrite_set_size == 0)
+ continue;
+
+ ret = update_journal_header(ch, abi->brick_id);
+ if (ret)
+ return ret;
+ }
+ return 0;
+}
+
+/**
+ * Play (checkpoint) a transaction on a single component of a compound volume.
+ * @mirror can be an original subvolume, or a replica.
+ */
+static int play_tx_mirror(struct commit_handle *ch, reiser4_subvol *mirror)
+{
+ int ret;
+ flush_queue_t *fq;
+ struct commit_handle_subvol *ch_sub;
+ /*
+ * replicas don't have their own commit handle,
+	 * so borrow it from the original subvolume
+ */
+ ch_sub = &super_origin(ch->super, mirror->id)->ch;
+ fq = get_fq_for_current_atom();
+ if (IS_ERR(fq))
+ return PTR_ERR(fq);
+ spin_unlock_atom(fq->atom);
+
+ ret = write_jnode_list_subv(&ch_sub->overwrite_set, fq,
+ NULL, WRITEOUT_FOR_PAGE_RECLAIM, mirror);
+ reiser4_fq_put(fq);
+ return ret;
+}
+
+/**
+ * Play (checkpoint) transaction on a logical (compound) volume.
+ */
+static int play_tx(struct commit_handle *ch)
+{
+ int ret;
+ struct rb_node *node;
+ txn_atom *atom = ch->atom;
+
+ /*
+ * First of all,
+ * we issue per-component portions of IO requests in parallel.
+ */
+ for (node = rb_first(&atom->bricks_info);
+ node;
+ node = rb_next(node)) {
+
+ u32 mirr_id;
+ struct atom_brick_info *abi;
+ struct commit_handle_subvol *ch_sub;
+
+ abi = rb_entry(node, struct atom_brick_info, node);
+ ch_sub = &current_origin(abi->brick_id)->ch;
+
+ if (ch_sub->overwrite_set_size == 0)
+ continue;
+
+ for_each_mirror(abi->brick_id, mirr_id) {
+
+ reiser4_subvol *mirror;
+ mirror = current_mirror(abi->brick_id, mirr_id);
+ ret = play_tx_mirror(ch, mirror);
+ if (ret)
+ return ret;
+ }
+ }
+ /*
+ * comply with write barriers
+ */
+ ret = current_atom_finish_all_fq();
+ if (ret)
+ return ret;
+
+ for (node = rb_first(&atom->bricks_info);
+ node;
+ node = rb_next(node)) {
+
+ struct atom_brick_info *abi;
+ reiser4_subvol *subv;
+ struct commit_handle_subvol *ch_sub;
+
+ abi = rb_entry(node, struct atom_brick_info, node);
+ subv = current_origin(abi->brick_id);
+ ch_sub = &subv->ch;
+
+ if (ch_sub->overwrite_set_size == 0)
+ continue;
+
+ ret = update_journal_footer(ch, subv);
+ if (ret)
+ return ret;
+ }
+ return 0;
+}
+
+/**
+ * We assume that at this moment all captured blocks are marked as RELOC or
+ * WANDER (belong to the Relocate or Overwrite set), and all nodes from the
+ * Relocate set have already been submitted for write.
+ */
+int reiser4_write_logs(long *nr_submitted)
+{
+ txn_atom *atom;
+ struct super_block *super = reiser4_get_current_sb();
+ reiser4_super_info_data *sbinfo = get_super_private(super);
+ struct commit_handle ch;
+ int ret;
+
+ writeout_mode_enable();
+ /*
+ * block allocator may add jnodes to the clean_list
+ */
+ ret = reiser4_pre_commit_hook();
+ if (ret)
+ return ret;
+ /*
+ * No locks are required if we take atom
+ * whose stage >= ASTAGE_PRE_COMMIT
+ */
+ atom = get_current_context()->trans->atom;
+ assert("zam-965", atom != NULL);
+ /*
+ * relocate set is on the atom->clean_nodes list after
+ * current_atom_complete_writes() finishes. It can be safely
+ * uncaptured after commit_mutex is locked, because any atom that
+	 * captures these nodes is guaranteed to commit after the current one.
+ *
+ * This can only be done after reiser4_pre_commit_hook(), because
+ * it is where early flushed jnodes with CREATED bit are transferred
+ * to the overwrite list
+ */
+ reiser4_invalidate_list(ATOM_CLEAN_LIST(atom));
+ spin_lock_atom(atom);
+ /* There might be waiters for the relocate nodes which we have
+ * released, wake them up. */
+ reiser4_atom_send_event(atom);
+ spin_unlock_atom(atom);
+
+ if (REISER4_DEBUG) {
+ int level;
+
+ for (level = 0; level < REAL_MAX_ZTREE_HEIGHT + 1; ++level)
+ assert("nikita-3352",
+ list_empty_careful(ATOM_DIRTY_LIST(atom,
+ level)));
+ }
+
+ sbinfo->nr_files_committed += (unsigned)atom->nr_objects_created;
+ sbinfo->nr_files_committed -= (unsigned)atom->nr_objects_deleted;
+
+ init_commit_handle(&ch, atom, NULL);
+ /*
+ * count overwrite set and distribute it among subvolumes
+ */
+ ret = get_overwrite_set(&ch);
+
+ if (ret <= 0) {
+ /*
+ * It is possible that overwrite set is empty here,
+ * which means all captured nodes are clean
+ */
+ goto up_and_ret;
+ }
+ /*
+	 * Inform the caller how many dirty pages
+ * will be submitted to disk
+ */
+ *nr_submitted += ch.total_overwrite_set_size - ch.total_nr_bitmap;
+ /*
+	 * count all records needed for storing the wandered set
+ */
+ get_tx_size(&ch);
+
+ ret = commit_tx(&ch);
+ if (ret)
+ goto up_and_ret;
+
+ spin_lock_atom(atom);
+ reiser4_atom_set_stage(atom, ASTAGE_POST_COMMIT);
+ spin_unlock_atom(atom);
+ reiser4_post_commit_hook();
+
+ ret = play_tx(&ch);
+ up_and_ret:
+ if (ret) {
+ /*
+ * there could be fq attached to current atom;
+ * the only way to remove them is:
+ */
+ current_atom_finish_all_fq();
+ }
+ /*
+ * free blocks of flushed transaction
+ */
+ dealloc_tx_list(&ch);
+ dealloc_wmap(&ch);
+
+ reiser4_post_write_back_hook();
+
+ put_overwrite_set(&ch);
+
+ done_commit_handle(&ch, NULL);
+
+ writeout_mode_disable();
+
+ return ret;
+}
+
+/**
+ * consistency checks for journal data/control blocks: header, footer, log
+ * records, transaction head blocks. All functions return zero on success
+ */
+static int check_journal_header(const jnode * node UNUSED_ARG)
+{
+ /* FIXME: journal header has no magic field yet. */
+ return 0;
+}
+
+/**
+ * wait for write completion for all jnodes from given list
+ */
+static int wait_on_jnode_list(struct list_head *head)
+{
+ jnode *scan;
+ int ret = 0;
+
+ list_for_each_entry(scan, head, capture_link) {
+ struct page *pg = jnode_page(scan);
+
+ if (pg) {
+ if (PageWriteback(pg))
+ wait_on_page_writeback(pg);
+
+ if (PageError(pg))
+ ret++;
+ }
+ }
+ return ret;
+}
+
+static int check_journal_footer(const jnode * node UNUSED_ARG)
+{
+ /* FIXME: journal footer has no magic field yet. */
+ return 0;
+}
+
+static int check_tx_head(const jnode * node)
+{
+ struct tx_header *header = (struct tx_header *)jdata(node);
+
+ if (memcmp(&header->magic, TX_HEADER_MAGIC, TX_HEADER_MAGIC_SIZE) != 0) {
+ warning("zam-627", "tx head at block %s corrupted\n",
+ sprint_address(jnode_get_block(node)));
+ return RETERR(-EIO);
+ }
+ return 0;
+}
+
+static int check_wander_record(const jnode * node)
+{
+ struct wander_record_header *RH =
+ (struct wander_record_header *)jdata(node);
+
+ if (memcmp(&RH->magic, WANDER_RECORD_MAGIC, WANDER_RECORD_MAGIC_SIZE) !=
+ 0) {
+ warning("zam-628", "wander record at block %s corrupted\n",
+ sprint_address(jnode_get_block(node)));
+ return RETERR(-EIO);
+ }
+ return 0;
+}
+
+/**
+ * Fill the commit_handle structure with everything that is
+ * needed to update the journal footer of the specified subvolume
+ */
+static int restore_commit_handle(struct commit_handle *ch,
+ reiser4_subvol *subv, jnode *tx_head)
+{
+ struct commit_handle_subvol *ch_sub = &subv->ch;
+ struct tx_header *TXH;
+ int ret;
+
+ ret = jload(tx_head);
+ if (ret)
+ return ret;
+
+ TXH = (struct tx_header *)jdata(tx_head);
+
+ ch_sub->free_blocks = le64_to_cpu(get_unaligned(&TXH->free_blocks));
+ ch->nr_files = le64_to_cpu(get_unaligned(&TXH->nr_files));
+ ch->next_oid = le64_to_cpu(get_unaligned(&TXH->next_oid));
+
+ jrelse(tx_head);
+
+ list_add(&tx_head->capture_link, &ch_sub->tx_list);
+
+ return 0;
+}
+
+/**
+ * Overwrite blocks at their permanent locations with the wandered set
+ * and synchronize it with all replicas (if any).
+ * Pre-condition: all replicas of @subv should be already activated.
+ */
+static int replay_tx_subv(reiser4_subvol *subv)
+{
+ int ret;
+ u32 repl_id;
+ const u32 orig_id = subv->id;
+ struct commit_handle_subvol *ch_sub = &subv->ch;
+
+ assert("edward-1708", is_origin(subv));
+ /*
+ * first replay on the original subvolume
+ */
+ write_jnode_list_subv(&ch_sub->overwrite_set,
+ NULL, NULL, 0, subv);
+ ret = wait_on_jnode_list(&ch_sub->overwrite_set);
+ if (ret)
+ goto error;
+ /*
+ * then replay on its replicas, if any
+ */
+ __for_each_replica(subv, repl_id) {
+ reiser4_subvol *repl = super_mirror(subv->super,
+ orig_id, repl_id);
+ write_jnode_list_subv(&ch_sub->overwrite_set,
+ NULL, NULL, 0, repl);
+ ret = wait_on_jnode_list(&ch_sub->overwrite_set);
+ if (ret)
+ goto error;
+ }
+ return 0;
+ error:
+ warning("edward-1712",
+ "transaction replay failed on %s (%d)", subv->name, ret);
+ return RETERR(-EIO);
+}
+
+/**
+ * This is an "offline" version of play_tx(). Called at mount time.
+ * Replay one transaction: restore and write overwrite set in place
+ */
+static int replay_tx(jnode *tx_head,
+ const reiser4_block_nr *log_rec_block_p,
+ const reiser4_block_nr *end_block,
+ unsigned int nr_wander_records,
+ reiser4_subvol *subv)
+{
+ int ret;
+ jnode *log;
+ struct commit_handle ch;
+ struct commit_handle_subvol *ch_sub = &subv->ch;
+ reiser4_block_nr log_rec_block = *log_rec_block_p;
+
+ assert("edward-1713", !is_replica(subv));
+
+ init_commit_handle(&ch, NULL, subv);
+ restore_commit_handle(&ch, subv, tx_head);
+
+ while (log_rec_block != *end_block) {
+ struct wander_record_header *header;
+ struct wander_entry *entry;
+
+ int i;
+
+ if (nr_wander_records == 0) {
+ warning("zam-631",
+ "number of wander records in the linked list"
+				" is greater than the number stored in tx head.\n");
+ ret = RETERR(-EIO);
+ goto free_ow_set;
+ }
+
+ log = reiser4_alloc_io_head(&log_rec_block, subv);
+ if (log == NULL)
+ return RETERR(-ENOMEM);
+
+ ret = jload(log);
+ if (ret < 0) {
+ reiser4_drop_io_head(log);
+ return ret;
+ }
+
+ ret = check_wander_record(log);
+ if (ret) {
+ jrelse(log);
+ reiser4_drop_io_head(log);
+ return ret;
+ }
+
+ header = (struct wander_record_header *)jdata(log);
+ log_rec_block = le64_to_cpu(get_unaligned(&header->next_block));
+
+ entry = (struct wander_entry *)(header + 1);
+ /*
+ * restore overwrite set from wander record content
+ */
+ for (i = 0; i < wander_record_capacity(subv->super); i++) {
+ reiser4_block_nr block;
+ jnode *node;
+
+ block = le64_to_cpu(get_unaligned(&entry->wandered));
+ if (block == 0)
+ break;
+
+ node = reiser4_alloc_io_head(&block, subv);
+ if (node == NULL) {
+ ret = RETERR(-ENOMEM);
+ /*
+ * FIXME-VS:???
+ */
+ jrelse(log);
+ reiser4_drop_io_head(log);
+ goto free_ow_set;
+ }
+
+ ret = jload(node);
+
+ if (ret < 0) {
+ reiser4_drop_io_head(node);
+ /*
+ * FIXME-VS:???
+ */
+ jrelse(log);
+ reiser4_drop_io_head(log);
+ goto free_ow_set;
+ }
+
+ block = le64_to_cpu(get_unaligned(&entry->original));
+
+ assert("zam-603", block != 0);
+
+ jnode_set_block(node, &block);
+
+ list_add_tail(&node->capture_link,
+ &ch_sub->overwrite_set);
+
+ ++entry;
+ }
+
+ jrelse(log);
+ reiser4_drop_io_head(log);
+
+ --nr_wander_records;
+ }
+
+ if (nr_wander_records != 0) {
+ warning("zam-632",
+ "number of wander records in the linked list "
+ "is less than number stored in tx head.\n");
+ ret = RETERR(-EIO);
+ goto free_ow_set;
+ }
+	ret = replay_tx_subv(subv);
+	if (ret)
+		goto free_ow_set;
+	ret = update_journal_footer(&ch, subv);
+
+ free_ow_set:
+
+ while (!list_empty(&ch_sub->overwrite_set)) {
+ jnode *cur = list_entry(ch_sub->overwrite_set.next,
+ jnode, capture_link);
+ list_del_init(&cur->capture_link);
+ jrelse(cur);
+ reiser4_drop_io_head(cur);
+ }
+
+ list_del_init(&tx_head->capture_link);
+
+ done_commit_handle(&ch, subv);
+
+ return ret;
+}
+
+/**
+ * Find the oldest committed but not yet played transaction and play it. The
+ * transaction was committed and the journal header block was updated, but
+ * writing the atom's overwrite set in place and updating the journal footer
+ * block were not completed. This function completes the process by recovering
+ * the atom's overwrite set from the wandered locations, writing it in place
+ * and updating the journal footer.
+ */
+static int replay_oldest_transaction(reiser4_subvol *subv)
+{
+ jnode *jf = subv->journal_footer;
+ unsigned int total;
+ struct journal_footer *F;
+ struct tx_header *T;
+
+ reiser4_block_nr prev_tx;
+ reiser4_block_nr last_flushed_tx;
+ reiser4_block_nr log_rec_block = 0;
+
+ jnode *tx_head;
+
+ int ret;
+
+ if ((ret = jload(jf)) < 0)
+ return ret;
+
+ F = (struct journal_footer *)jdata(jf);
+
+ last_flushed_tx = le64_to_cpu(get_unaligned(&F->last_flushed_tx));
+
+ jrelse(jf);
+
+ if (subv->last_committed_tx == last_flushed_tx) {
+ /* all transactions are replayed */
+ return 0;
+ }
+
+ prev_tx = subv->last_committed_tx;
+ /*
+ * searching for oldest not flushed transaction
+ */
+ while (1) {
+ tx_head = reiser4_alloc_io_head(&prev_tx, subv);
+ if (!tx_head)
+ return RETERR(-ENOMEM);
+
+ ret = jload(tx_head);
+ if (ret < 0) {
+ reiser4_drop_io_head(tx_head);
+ return ret;
+ }
+
+ ret = check_tx_head(tx_head);
+ if (ret) {
+ jrelse(tx_head);
+ reiser4_drop_io_head(tx_head);
+ return ret;
+ }
+
+ T = (struct tx_header *)jdata(tx_head);
+
+ prev_tx = le64_to_cpu(get_unaligned(&T->prev_tx));
+
+ if (prev_tx == last_flushed_tx)
+ break;
+
+ jrelse(tx_head);
+ reiser4_drop_io_head(tx_head);
+ }
+
+ total = le32_to_cpu(get_unaligned(&T->total));
+ log_rec_block = le64_to_cpu(get_unaligned(&T->next_block));
+
+ pin_jnode_data(tx_head);
+ jrelse(tx_head);
+
+ ret = replay_tx(tx_head, &log_rec_block,
+ jnode_get_block(tx_head), total - 1, subv);
+
+ unpin_jnode_data(tx_head);
+ reiser4_drop_io_head(tx_head);
+
+ if (ret)
+ return ret;
+ return -E_REPEAT;
+}
+
+/**
+ * The current reiser4 journal implementation is optimized not to capture the
+ * super block when only certain super block fields are modified. Currently,
+ * that set is (<free block count>, <OID allocator>). These fields are logged
+ * in a special way which includes storing them in each transaction head block
+ * at atom commit time and writing that information to the journal footer block
+ * at atom flush time. For getting the info from the journal footer block into
+ * the in-memory super block there is a special function,
+ * reiser4_journal_recover_sb_data(), which should be called after the disk
+ * format plugin re-reads the super block after journal replay.
+ *
+ * Get the information from journal footer to in-memory super block
+ */
+int reiser4_journal_recover_sb_data(struct super_block *s, reiser4_subvol *subv)
+{
+ struct journal_footer *jf;
+ int ret;
+
+ assert("zam-673", subv->journal_footer != NULL);
+
+ ret = jload(subv->journal_footer);
+ if (ret != 0)
+ return ret;
+
+ ret = check_journal_footer(subv->journal_footer);
+ if (ret != 0)
+ goto out;
+
+ jf = (struct journal_footer *)jdata(subv->journal_footer);
+ /*
+ * was there at least one flushed transaction?
+ */
+ if (jf->last_flushed_tx) {
+ /*
+ * restore free block counter logged in this transaction
+ */
+ reiser4_subvol_set_free_blocks(subv,
+ le64_to_cpu(get_unaligned(&jf->free_blocks)));
+ if (is_meta_brick_id(subv->id))
+ /*
+ * restore oid allocator state
+ */
+ oid_init_allocator(s,
+ le64_to_cpu(get_unaligned(&jf->nr_files)),
+ le64_to_cpu(get_unaligned(&jf->next_oid)));
+ }
+ out:
+ jrelse(subv->journal_footer);
+ return ret;
+}
+
+/**
+ * reiser4 journal replay procedure
+ */
+int reiser4_journal_replay(reiser4_subvol *subv)
+{
+ jnode *jh, *jf;
+ struct journal_header *header;
+ int nr_tx_replayed = 0;
+ int ret;
+
+ assert("edward-1714", subv != NULL);
+
+ jh = subv->journal_header;
+ jf = subv->journal_footer;
+
+ if (!jh || !jf) {
+ /*
+ * It is possible that disk layout does not
+		 * support journal structures; we just warn about this
+ */
+ warning("zam-583",
+ "Journal control blocks were not loaded on %s. "
+ "Journal replay is not possible.\n", subv->name);
+ return 0;
+ }
+ /*
+ * Take free block count from journal footer block. The free block
+	 * counter value corresponds to the last flushed transaction state
+ */
+ ret = jload(jf);
+ if (ret < 0)
+ return ret;
+
+ ret = check_journal_footer(jf);
+ if (ret) {
+ jrelse(jf);
+ return ret;
+ }
+ jrelse(jf);
+ /*
+ * store last committed transaction info in
+ * reiser4 in-memory superblock
+ */
+ ret = jload(jh);
+ if (ret < 0)
+ return ret;
+
+ ret = check_journal_header(jh);
+ if (ret) {
+ jrelse(jh);
+ return ret;
+ }
+ header = (struct journal_header *)jdata(jh);
+ subv->last_committed_tx =
+ le64_to_cpu(get_unaligned(&header->last_committed_tx));
+
+ jrelse(jh);
+
+ /* replay committed transactions */
+ while ((ret = replay_oldest_transaction(subv)) == -E_REPEAT)
+ nr_tx_replayed++;
+
+ return ret;
+}
+
+/**
+ * Load journal control block (either journal header or journal footer block)
+ */
+static int load_journal_control_block(jnode **node,
+ const reiser4_block_nr *block,
+ reiser4_subvol *subv)
+{
+ int ret;
+
+ *node = reiser4_alloc_io_head(block, subv);
+ if (!(*node))
+ return RETERR(-ENOMEM);
+
+ ret = jload(*node);
+
+ if (ret) {
+ reiser4_drop_io_head(*node);
+ *node = NULL;
+ return ret;
+ }
+
+ pin_jnode_data(*node);
+ jrelse(*node);
+
+ return 0;
+}
+
+/**
+ * Unload journal header or footer and free jnode
+ */
+static void unload_journal_control_block(jnode ** node)
+{
+ if (*node) {
+ unpin_jnode_data(*node);
+ reiser4_drop_io_head(*node);
+ *node = NULL;
+ }
+}
+
+/**
+ * Release journal control blocks
+ */
+void reiser4_done_journal_info(reiser4_subvol *subv)
+{
+ unload_journal_control_block(&subv->journal_header);
+ unload_journal_control_block(&subv->journal_footer);
+ rcu_barrier();
+}
+
+/**
+ * Load journal control blocks.
+ * Pre-condition: @subv contains valid journal location
+ */
+int reiser4_init_journal_info(reiser4_subvol *subv)
+{
+ int ret;
+ journal_location *loc = &subv->jloc;
+
+ assert("zam-652", loc->header != 0);
+ assert("zam-653", loc->footer != 0);
+
+ ret = load_journal_control_block(&subv->journal_header,
+ &loc->header, subv);
+ if (ret)
+ return ret;
+
+ ret = load_journal_control_block(&subv->journal_footer,
+ &loc->footer, subv);
+ if (ret)
+ unload_journal_control_block(&subv->journal_header);
+ return ret;
+}
+
+/* Make Linus happy.
+ Local variables:
+ c-indentation-style: "K&R"
+ mode-name: "LC"
+ c-basic-offset: 8
+ tab-width: 8
+ fill-column: 80
+ End:
+*/
diff -urN --no-dereference linux-5.10.2.orig/fs/reiser4/wander.h linux-5.10.2/fs/reiser4/wander.h
--- linux-5.10.2.orig/fs/reiser4/wander.h 1970-01-01 01:00:00.000000000 +0100
+++ linux-5.10.2/fs/reiser4/wander.h 2020-12-23 16:07:46.137813407 +0100
@@ -0,0 +1,138 @@
+/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
+
+#if !defined (__FS_REISER4_WANDER_H__)
+#define __FS_REISER4_WANDER_H__
+
+#include "dformat.h"
+
+#include <linux/fs.h> /* for struct super_block */
+
+/* REISER4 JOURNAL ON-DISK DATA STRUCTURES */
+
+#define TX_HEADER_MAGIC "TxMagic4"
+#define WANDER_RECORD_MAGIC "LogMagc4"
+
+#define TX_HEADER_MAGIC_SIZE (8)
+#define WANDER_RECORD_MAGIC_SIZE (8)
+
+/* journal header block format */
+struct journal_header {
+ /* last written transaction head location */
+ d64 last_committed_tx;
+};
+
+typedef struct journal_location {
+ reiser4_block_nr footer;
+ reiser4_block_nr header;
+} journal_location;
+
+/* The wander.c head comment describes usage and semantics of all these structures */
+/* journal footer block format */
+struct journal_footer {
+ /* last flushed transaction location. */
+ /* This block number is no longer valid after the transaction it points
+ to gets flushed; it is used only at journal replay time to detect
+ the end of the on-disk list of committed transactions which were
+ not flushed completely */
+ d64 last_flushed_tx;
+
+ /* the free block counter is written into the journal footer at
+ transaction flush time, not into the super block, because it is
+ logged differently than the super block fields (the root pointer,
+ for example). */
+ d64 free_blocks;
+
+ /* number of used OIDs and maximal used OID are logged separately from
+ super block */
+ d64 nr_files;
+ d64 next_oid;
+};
+
+/* Each wander record (except the first one) has a unified format: a wander
+ record header followed by an array of log entries */
+struct wander_record_header {
+ /* since there is no predefined location for wander records, this magic
+ string should help reiser4fsck. */
+ char magic[WANDER_RECORD_MAGIC_SIZE];
+
+ /* transaction id */
+ d64 id;
+
+ /* total number of wander records in current transaction */
+ d32 total;
+
+ /* serial number of this block within the transaction */
+ d32 serial;
+
+ /* location of the next wander record in this transaction */
+ d64 next_block;
+};
+
+/* The first wander record (transaction head) of a written transaction has a
+ special format */
+struct tx_header {
+ /* magic string makes the first block in a transaction different from
+ other logged blocks; it should help fsck. */
+ char magic[TX_HEADER_MAGIC_SIZE];
+
+ /* transaction id */
+ d64 id;
+
+ /* total number of records (including this first tx head) in the
+ transaction */
+ d32 total;
+
+ /* align next field to 8-byte boundary; this field is always zero */
+ d32 padding;
+
+ /* block number of previous transaction head */
+ d64 prev_tx;
+
+ /* next wander record location */
+ d64 next_block;
+
+ /* committed versions of free blocks counter */
+ d64 free_blocks;
+
+ /* number of used OIDs (nr_files) and maximal used OID are logged
+ separately from super block */
+ d64 nr_files;
+ d64 next_oid;
+};
+
+/* A transaction gets written to disk as a set of wander records (each wander
+ record is one fs block in size) */
+
+/* As noted above, the rest of a wander record is filled with these log
+ entries; unused space is filled with zeroes */
+struct wander_entry {
+ d64 original; /* block original location */
+ d64 wandered; /* block wandered location */
+};
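+
+/* Capacity illustration (a rough sketch, not compiled, and not part of the
+ on-disk format itself): assuming a hypothetical 4096-byte block and the
+ field sizes implied by the d32/d64 types above, one wander record can hold
+ about
+
+     (4096 - sizeof(struct wander_record_header)) / sizeof(struct wander_entry)
+     = (4096 - 32) / 16 = 254
+
+ original/wandered pairs; the remaining bytes are zero-filled, as described
+ above. */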
+
+/* REISER4 JOURNAL WRITER FUNCTIONS */
+
+extern int reiser4_write_logs(long *);
+extern int reiser4_journal_replay(reiser4_subvol *);
+extern int reiser4_journal_recover_sb_data(struct super_block *,
+ reiser4_subvol *);
+
+extern int reiser4_init_journal_info(reiser4_subvol *subv);
+extern void reiser4_done_journal_info(reiser4_subvol *subv);
+
+extern int write_jnode_list_subv(struct list_head *, flush_queue_t *,
+ long *, int, reiser4_subvol *);
+extern int write_jnode_list(struct list_head *, flush_queue_t *, long *, int);
+
+#endif /* __FS_REISER4_WANDER_H__ */
+
+/* Make Linus happy.
+ Local variables:
+ c-indentation-style: "K&R"
+ mode-name: "LC"
+ c-basic-offset: 8
+ tab-width: 8
+ fill-column: 80
+ scroll-step: 1
+ End:
+*/
diff -urN --no-dereference linux-5.10.2.orig/fs/reiser4/writeout.h linux-5.10.2/fs/reiser4/writeout.h
--- linux-5.10.2.orig/fs/reiser4/writeout.h 1970-01-01 01:00:00.000000000 +0100
+++ linux-5.10.2/fs/reiser4/writeout.h 2020-12-23 16:07:46.137813407 +0100
@@ -0,0 +1,22 @@
+/* Copyright 2002, 2003, 2004 by Hans Reiser, licensing governed by reiser4/README */
+
+#if !defined (__FS_REISER4_WRITEOUT_H__)
+#define __FS_REISER4_WRITEOUT_H__
+
+#define WRITEOUT_SINGLE_STREAM (0x1)
+#define WRITEOUT_FOR_PAGE_RECLAIM (0x2)
+#define WRITEOUT_FLUSH_FUA (0x4)
+
+extern int reiser4_get_writeout_flags(void);
+
+#endif /* __FS_REISER4_WRITEOUT_H__ */
+
+/* Make Linus happy.
+ Local variables:
+ c-indentation-style: "K&R"
+ mode-name: "LC"
+ c-basic-offset: 8
+ tab-width: 8
+ fill-column: 80
+ End:
+*/
diff -urN --no-dereference linux-5.10.2.orig/fs/reiser4/znode.c linux-5.10.2/fs/reiser4/znode.c
--- linux-5.10.2.orig/fs/reiser4/znode.c 1970-01-01 01:00:00.000000000 +0100
+++ linux-5.10.2/fs/reiser4/znode.c 2020-12-23 16:07:46.138813421 +0100
@@ -0,0 +1,1046 @@
+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
+ * reiser4/README */
+/* Znode manipulation functions. */
+/* Znode is the in-memory header for a tree node. It is stored
+ separately from the node itself so that it does not get written to
+ disk. In this respect a znode is like a buffer head or a page head. We
+ also use znodes for additional reiser4-specific purposes:
+
+ . they are organized into a tree structure which is a part of the whole
+ reiser4 tree.
+ . they are used to implement node-grained locking
+ . they are used to keep additional state associated with a
+ node
+ . they contain links to lists used by the transaction manager
+
+ A znode is attached to some variable "block number" which is an instance of
+ the fs/reiser4/tree.h:reiser4_block_nr type. A znode can exist without the
+ corresponding node being actually loaded in memory. Existence of the znode
+ itself is regulated by a reference count (->x_count) in it. Each time a
+ thread acquires a reference to a znode through a call to zget(), ->x_count
+ is incremented; it is decremented on a call to zput(). Data (the content of
+ the node) are brought into memory through a call to zload(), which also
+ increments the ->d_count reference counter. zload() can block waiting on
+ IO. A call to zrelse() decreases this counter. Also, ->c_count keeps track
+ of the number of child znodes and prevents the parent znode from being
+ recycled until all of its children are. ->c_count is decremented whenever a
+ child goes out of existence (being actually recycled in zdestroy()), which
+ can be some time after the last reference to this child dies if we support
+ some form of LRU cache for znodes.
+
+*/
+/* EVERY ZNODE'S STORY
+
+ 1. His infancy.
+
+ Once upon a time, the znode was born deep inside of zget() by a call to
+ zalloc(). On return from zget() the znode had:
+
+ . reference counter (x_count) of 1
+ . assigned block number, marked as used in bitmap
+ . pointer to parent znode. The root znode's parent pointer points
+ to its father: the "fake" znode, which in turn has a NULL parent pointer.
+ . hash table linkage
+ . no data loaded from disk
+ . no node plugin
+ . no sibling linkage
+
+ 2. His childhood
+
+ Each node is either brought into memory as a result of tree traversal, or
+ created afresh, creation of the root being a special case of the latter. In
+ either case it's inserted into sibling list. This will typically require
+ some ancillary tree traversing, but ultimately both sibling pointers will
+ exist and JNODE_LEFT_CONNECTED and JNODE_RIGHT_CONNECTED will be true in
+ zjnode.state.
+
+ 3. His youth.
+
+ If the znode is bound to an already existing node in the tree, its content
+ is read from disk by a call to zload(). At that moment, the JNODE_LOADED
+ bit is set in zjnode.state and the zdata() function starts to return
+ non-NULL for this znode. zload() further calls zparse(), which determines
+ the node layout this node is rendered in, and sets ->nplug on success.
+
+ If the znode is for a newly created node, memory for it is allocated and
+ the zinit_new() function is called to initialise the data according to the
+ selected node layout.
+
+ 4. His maturity.
+
+ After this point, znode lingers in memory for some time. Threads can
+ acquire references to znode either by blocknr through call to zget(), or by
+ following a pointer to unallocated znode from internal item. Each time
+ reference to znode is obtained, x_count is increased. Thread can read/write
+ lock znode. Znode data can be loaded through calls to zload(), d_count will
+ be increased appropriately. If all references to znode are released
+ (x_count drops to 0), znode is not recycled immediately. Rather, it is
+ still cached in the hash table in the hope that it will be accessed
+ shortly.
+
+ There are two ways in which znode existence can be terminated:
+
+ . sudden death: node bound to this znode is removed from the tree
+ . overpopulation: znode is purged out of memory due to memory pressure
+
+ 5. His death.
+
+ Death is a complex process.
+
+ When we irrevocably commit ourselves to decision to remove node from the
+ tree, JNODE_HEARD_BANSHEE bit is set in zjnode.state of corresponding
+ znode. This is done either in ->kill_hook() of internal item or in
+ reiser4_kill_root() function when tree root is removed.
+
+ At this moment znode still has:
+
+ . locks held on it, necessarily write ones
+ . references to it
+ . disk block assigned to it
+ . data loaded from the disk
+ . pending requests for lock
+
+ But once the JNODE_HEARD_BANSHEE bit is set, the last call to unlock_znode()
+ does node deletion. Node deletion includes two phases. First, all ways to
+ get references to that znode (sibling and parent links and hash lookup using
+ the block number stored in the parent node) should be deleted -- this is
+ done through sibling_list_remove(); also we assume that nobody uses the down
+ link from the parent node due to its nonexistence or proper parent node
+ locking, and nobody uses parent pointers from children due to their absence.
+ Second, we invalidate all pending lock requests which are still on the
+ znode's lock request queue; this is done by reiser4_invalidate_lock().
+ Another znode status bit, JNODE_IS_DYING, is used to invalidate pending lock
+ requests. Once it is set, all requesters are forced to return -EINVAL from
+ longterm_lock_znode(). Future locking attempts are not possible because all
+ ways to get references to that znode have already been removed. Last, the
+ node is uncaptured from the transaction.
+
+ When the last reference to the dying znode is just about to be released,
+ the block number for this znode is released and the znode is removed
+ from the hash table.
+
+ Now znode can be recycled.
+
+ [it's possible to free the bitmap block and remove the znode from the hash
+ table when the last lock is released. This will result in having a
+ referenced but completely orphaned znode]
+
+ 6. Limbo
+
+ As has been mentioned above, znodes with a reference counter of 0 are
+ still cached in the hash table. Once memory pressure increases they are
+ purged out of there [this requires something like LRU list for
+ efficient implementation. LRU list would also greatly simplify
+ implementation of coord cache that would in this case morph to just
+ scanning some initial segment of LRU list]. Data loaded into
+ unreferenced znode are flushed back to the durable storage if
+ necessary and memory is freed. Znodes themselves can be recycled at
+ this point too.
+
+*/
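+
+/* Typical reference discipline (an illustrative, non-compiled sketch; error
+ handling and long-term locking are omitted):
+
+     znode *node = zget(subv, &blocknr, parent, level, GFP_KERNEL);
+     if (!IS_ERR(node)) {
+             if (zload(node) == 0) {     (increments ->d_count, may do IO)
+                     ...use zdata(node)...
+                     zrelse(node);       (drops ->d_count)
+             }
+             zput(node);                 (drops the ->x_count taken by zget())
+     }
+*/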
+
+#include "debug.h"
+#include "dformat.h"
+#include "key.h"
+#include "coord.h"
+#include "plugin/plugin_header.h"
+#include "plugin/node/node.h"
+#include "plugin/plugin.h"
+#include "txnmgr.h"
+#include "jnode.h"
+#include "znode.h"
+#include "block_alloc.h"
+#include "tree.h"
+#include "tree_walk.h"
+#include "super.h"
+#include "reiser4.h"
+
+#include <linux/pagemap.h>
+#include <linux/spinlock.h>
+#include <linux/slab.h>
+#include <linux/err.h>
+
+static z_hash_table *get_htable(reiser4_tree *,
+ const reiser4_block_nr * const blocknr);
+static z_hash_table *znode_get_htable(const znode *);
+static void zdrop(znode *);
+
+/* hash table support */
+
+/* compare two block numbers for equality. Used by hash-table macros */
+static inline int
+blknreq(const reiser4_block_nr * b1, const reiser4_block_nr * b2)
+{
+ assert("nikita-534", b1 != NULL);
+ assert("nikita-535", b2 != NULL);
+
+ return *b1 == *b2;
+}
+
+/* Hash znode by block number. Used by hash-table macros */
+/* Audited by: umka (2002.06.11) */
+static inline __u32
+blknrhashfn(z_hash_table * table, const reiser4_block_nr * b)
+{
+ assert("nikita-536", b != NULL);
+
+ return *b & (REISER4_ZNODE_HASH_TABLE_SIZE - 1);
+}
+
+/* The hash table definition */
+#define KMALLOC(size) reiser4_vmalloc(size)
+#define KFREE(ptr, size) vfree(ptr)
+TYPE_SAFE_HASH_DEFINE(z, znode, reiser4_block_nr, zjnode.key.z, zjnode.link.z,
+ blknrhashfn, blknreq);
+#undef KFREE
+#undef KMALLOC
+
+/* slab for znodes */
+static struct kmem_cache *znode_cache;
+
+int znode_shift_order;
+
+/**
+ * init_znodes - create znode cache
+ *
+ * Initializes slab cache of znodes. It is part of reiser4 module initialization.
+ */
+int init_znodes(void)
+{
+ znode_cache = kmem_cache_create("znode", sizeof(znode), 0,
+ SLAB_HWCACHE_ALIGN |
+ SLAB_RECLAIM_ACCOUNT, NULL);
+ if (znode_cache == NULL)
+ return RETERR(-ENOMEM);
+
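+ /*
+ * The loop below leaves znode_shift_order as the largest order such
+ * that (1 << znode_shift_order) < sizeof(znode).
+ */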
+ for (znode_shift_order = 0; (1 << znode_shift_order) < sizeof(znode);
+ ++znode_shift_order);
+ --znode_shift_order;
+ return 0;
+}
+
+/**
+ * done_znodes - delete znode cache
+ *
+ * This is called on reiser4 module unloading or system shutdown.
+ */
+void done_znodes(void)
+{
+ destroy_reiser4_cache(&znode_cache);
+}
+
+/* call this to initialise tree of znodes */
+int znodes_tree_init(reiser4_tree * tree /* tree to initialise znodes for */ )
+{
+ int result;
+ assert("umka-050", tree != NULL);
+
+ rwlock_init(&tree->dk_lock);
+
+ result = z_hash_init(&tree->zhash_table, REISER4_ZNODE_HASH_TABLE_SIZE);
+ if (result != 0)
+ return result;
+ result = z_hash_init(&tree->zfake_table, REISER4_ZNODE_HASH_TABLE_SIZE);
+ return result;
+}
+
+/* free this znode */
+void zfree(znode * node /* znode to free */ )
+{
+ assert("nikita-465", node != NULL);
+ assert("nikita-2120", znode_page(node) == NULL);
+ assert("nikita-2301", list_empty_careful(&node->lock.owners));
+ assert("nikita-2302", list_empty_careful(&node->lock.requestors));
+ assert("nikita-2663", (list_empty_careful(&ZJNODE(node)->capture_link) &&
+ NODE_LIST(ZJNODE(node)) == NOT_CAPTURED));
+ assert("nikita-3220", list_empty(&ZJNODE(node)->jnodes));
+ assert("nikita-3293", !znode_is_right_connected(node));
+ assert("nikita-3294", !znode_is_left_connected(node));
+ assert("nikita-3295", node->left == NULL);
+ assert("nikita-3296", node->right == NULL);
+
+ /* not yet phash_jnode_destroy(ZJNODE(node)); */
+
+ kmem_cache_free(znode_cache, node);
+}
+
+/* call this to free tree of znodes */
+void znodes_tree_done(reiser4_tree * tree /* tree to finish with znodes of */ )
+{
+ znode *node;
+ znode *next;
+ z_hash_table *ztable;
+
+ /* scan znode hash-tables and kill all znodes, then free hash tables
+ * themselves. */
+
+ assert("nikita-795", tree != NULL);
+
+ ztable = &tree->zhash_table;
+
+ if (ztable->_table != NULL) {
+ for_all_in_htable(ztable, z, node, next) {
+ node->c_count = 0;
+ node->in_parent.node = NULL;
+ assert("nikita-2179", atomic_read(&ZJNODE(node)->x_count) == 0);
+ zdrop(node);
+ }
+
+ z_hash_done(&tree->zhash_table);
+ }
+
+ ztable = &tree->zfake_table;
+
+ if (ztable->_table != NULL) {
+ for_all_in_htable(ztable, z, node, next) {
+ node->c_count = 0;
+ node->in_parent.node = NULL;
+ assert("nikita-2179", atomic_read(&ZJNODE(node)->x_count) == 0);
+ zdrop(node);
+ }
+
+ z_hash_done(&tree->zfake_table);
+ }
+}
+
+/* ZNODE STRUCTURES */
+
+/* allocate fresh znode */
+znode *zalloc(gfp_t gfp_flag /* allocation flag */ )
+{
+ znode *node;
+
+ node = kmem_cache_alloc(znode_cache, gfp_flag);
+ return node;
+}
+
+/*
+ * Initialize fields of znode
+ * @node: znode to initialize;
+ * @parent: parent znode;
+ * @subvol: subvolume we are in
+ */
+void zinit(znode *node, const znode *parent, struct reiser4_subvol *subvol)
+{
+ assert("nikita-466", node != NULL);
+ assert("edward-1801", subvol != NULL);
+
+ memset(node, 0, sizeof *node);
+ jnode_init(&node->zjnode, subvol, JNODE_FORMATTED_BLOCK);
+ reiser4_init_lock(&node->lock);
+ init_parent_coord(&node->in_parent, parent);
+}
+
+/*
+ * remove znode from indices. This is called by jput() when the last
+ * reference to the znode is released.
+ */
+void znode_remove(znode *node)
+{
+#if REISER4_DEBUG
+ reiser4_super_info_data *sbinfo;
+
+ assert("nikita-2108", node != NULL);
+ assert("nikita-470", node->c_count == 0);
+
+ sbinfo = get_super_private(znode_get_super(node));
+ assert_rw_write_locked(&(sbinfo->tree_lock));
+#endif
+ /* remove reference to this znode from cbk cache */
+ cbk_cache_invalidate(node, znode_get_tree(node));
+
+ /* update c_count of parent */
+ if (znode_parent(node) != NULL) {
+ assert("nikita-472", znode_parent(node)->c_count > 0);
+ /* father, onto your hands I forward my spirit... */
+ znode_parent(node)->c_count--;
+ node->in_parent.node = NULL;
+ } else {
+ /* orphaned znode?! Root? */
+ }
+
+ /* remove znode from hash-table */
+ z_hash_remove_rcu(znode_get_htable(node), node);
+}
+
+/* zdrop() -- Remove znode from the tree.
+
+ This is called when znode is removed from the memory. */
+static void zdrop(znode * node /* znode to finish with */ )
+{
+ jdrop(ZJNODE(node));
+}
+
+/*
+ * put znode into right place in the hash table. This is called by relocate
+ * code.
+ */
+int znode_rehash(znode * node /* node to rehash */ ,
+ const reiser4_block_nr * new_block_nr /* new block number */ )
+{
+ z_hash_table *oldtable;
+ z_hash_table *newtable;
+ reiser4_tree *tree;
+
+ assert("nikita-2018", node != NULL);
+
+ tree = znode_get_tree(node);
+ oldtable = znode_get_htable(node);
+ newtable = get_htable(tree, new_block_nr);
+
+ write_lock_tree();
+ /* remove znode from hash-table */
+ z_hash_remove_rcu(oldtable, node);
+
+ /* assertion no longer valid due to RCU */
+ /* assert("nikita-2019", z_hash_find(newtable, new_block_nr) == NULL); */
+
+ /* update blocknr */
+ znode_set_block(node, new_block_nr);
+ node->zjnode.key.z = *new_block_nr;
+
+ /* insert it into hash */
+ z_hash_insert_rcu(newtable, node);
+ write_unlock_tree();
+ return 0;
+}
+
+/* ZNODE LOOKUP, GET, PUT */
+
+/* zlook() - get znode with given block_nr in a hash table or return NULL
+
+ If the result is non-NULL then the znode's x_count is incremented. The
+ internal hash lookup accepts a pre-computed hash index. The hash table is
+ accessed under rcu_read_lock().
+*/
+znode *zlook(reiser4_tree *tree, const reiser4_block_nr *const blocknr)
+{
+ znode *result;
+ __u32 hash;
+ z_hash_table *htable;
+
+ assert("jmacd-506", tree != NULL);
+ assert("jmacd-507", blocknr != NULL);
+
+ htable = get_htable(tree, blocknr);
+ hash = blknrhashfn(htable, blocknr);
+
+ rcu_read_lock();
+ result = z_hash_find_index(htable, hash, blocknr);
+
+ if (result != NULL) {
+ add_x_ref(ZJNODE(result));
+ result = znode_rip_check(result);
+ }
+ rcu_read_unlock();
+
+ return result;
+}
+
+/* return hash table where znode with block @blocknr is (or should be)
+ * stored */
+static z_hash_table *get_htable(reiser4_tree * tree,
+ const reiser4_block_nr * const blocknr)
+{
+ z_hash_table *table;
+ if (is_disk_addr_unallocated(blocknr))
+ table = &tree->zfake_table;
+ else
+ table = &tree->zhash_table;
+ return table;
+}
+
+/* return hash table where znode @node is (or should be) stored */
+static z_hash_table *znode_get_htable(const znode * node)
+{
+ return get_htable(znode_get_tree(node), znode_get_block(node));
+}
+
+/* zget() - get znode from hash table, allocating it if necessary.
+
+ First a call to zlook, locating an x-referenced znode if one
+ exists. If the znode is not found, allocate a new one and return it. The
+ result is returned with its x_count reference increased.
+
+ LOCKS TAKEN: TREE_LOCK, ZNODE_LOCK
+ LOCK ORDERING: NONE
+*/
+znode *zget(struct reiser4_subvol *subv,
+ const reiser4_block_nr * const blocknr,
+ znode * parent, tree_level level, gfp_t gfp_flag)
+{
+ znode *result;
+ __u32 hashi;
+ reiser4_tree *tree = &subv->tree;
+ z_hash_table *zth;
+
+ assert("jmacd-513", blocknr != NULL);
+ assert("jmacd-514", level < REISER4_MAX_ZTREE_HEIGHT);
+
+ zth = get_htable(tree, blocknr);
+ hashi = blknrhashfn(zth, blocknr);
+
+ /* NOTE-NIKITA address-as-unallocated-blocknr still is not
+ implemented. */
+
+ z_hash_prefetch_bucket(zth, hashi);
+
+ rcu_read_lock();
+ /* Find a matching BLOCKNR in the hash table. If the znode is found,
+ we obtain a reference (x_count) but the znode remains unlocked.
+ Have to worry about race conditions later. */
+ result = z_hash_find_index(zth, hashi, blocknr);
+ /* According to the current design, the hash table lock protects new
+ znode references. */
+ if (result != NULL) {
+ add_x_ref(ZJNODE(result));
+ /* NOTE-NIKITA it should be so, but special case during
+ creation of new root makes such assertion highly
+ complicated. */
+ assert("nikita-2131", 1 || znode_parent(result) == parent ||
+ (ZF_ISSET(result, JNODE_ORPHAN)
+ && (znode_parent(result) == NULL)));
+ result = znode_rip_check(result);
+ }
+
+ rcu_read_unlock();
+
+ if (!result) {
+ znode *shadow;
+
+ result = zalloc(gfp_flag);
+ if (!result) {
+ return ERR_PTR(RETERR(-ENOMEM));
+ }
+
+ zinit(result, parent, subv);
+ ZJNODE(result)->blocknr = *blocknr;
+ ZJNODE(result)->key.z = *blocknr;
+ result->level = level;
+
+ write_lock_tree();
+
+ shadow = z_hash_find_index(zth, hashi, blocknr);
+ if (unlikely(shadow != NULL && !ZF_ISSET(shadow, JNODE_RIP))) {
+ jnode_list_remove(ZJNODE(result));
+ zfree(result);
+ result = shadow;
+ } else {
+ result->version = znode_build_version(tree);
+ z_hash_insert_index_rcu(zth, hashi, result);
+
+ if (parent != NULL)
+ ++parent->c_count;
+ }
+
+ add_x_ref(ZJNODE(result));
+
+ write_unlock_tree();
+ }
+
+ assert("intelfx-6",
+ ergo(!reiser4_blocknr_is_fake(blocknr) && *blocknr != 0,
+ reiser4_check_block(blocknr, 1, subv)));
+
+ /* Check for invalid tree level, return -EIO */
+ if (unlikely(znode_get_level(result) != level)) {
+ warning("jmacd-504",
+ "Wrong level for cached block %llu: %i expecting %i",
+ (unsigned long long)(*blocknr), znode_get_level(result),
+ level);
+ zput(result);
+ return ERR_PTR(RETERR(-EIO));
+ }
+
+ assert("nikita-1227", znode_invariant(result));
+
+ return result;
+}
+
+/* ZNODE PLUGINS/DATA */
+
+/**
+ * Guess the plugin for a node loaded from disk.
+ * The id of the node plugin is stored at a fixed offset
+ * from the beginning of the node
+ *
+ * @node: znode to guess plugin of
+ */
+static node_plugin *znode_guess_plugin(const znode *node)
+{
+ reiser4_subvol *subv;
+
+ assert("nikita-1053", node != NULL);
+ assert("nikita-1055", zdata(node) != NULL);
+
+ subv = znode_get_subvol(node);
+ assert("edward-1802", subv != NULL);
+
+ if (subvol_is_set(subv, SUBVOL_ONE_NODE_PLUGIN)) {
+ return subv->tree.nplug;
+ } else {
+ return node_plugin_by_disk_id
+ (&((common_node_header *) zdata(node))->plugin_id);
+#ifdef GUESS_EXISTS
+ reiser4_plugin *plugin;
+
+ /* NOTE-NIKITA add locking here when dynamic plugins will be
+ * implemented */
+ for_all_plugins(REISER4_NODE_PLUGIN_TYPE, plugin) {
+ if ((plugin->u.node.guess != NULL)
+ && plugin->u.node.guess(node))
+ return plugin;
+ }
+ warning("nikita-1057", "Cannot guess node plugin");
+ print_znode("node", node);
+ return NULL;
+#endif
+ }
+}
+
+/* parse node header and install ->node_plugin */
+int zparse(znode * node /* znode to parse */ )
+{
+ int result;
+
+ assert("nikita-1233", node != NULL);
+ assert("nikita-2370", zdata(node) != NULL);
+
+ if (node->nplug == NULL) {
+ node_plugin *nplug;
+
+ nplug = znode_guess_plugin(node);
+ if (likely(nplug != NULL)) {
+ result = nplug->parse(node);
+ if (likely(result == 0))
+ node->nplug = nplug;
+ } else {
+ result = RETERR(-EIO);
+ }
+ } else
+ result = 0;
+ return result;
+}
+
+/* zload with readahead */
+int zload_ra(znode * node /* znode to load */ , ra_info_t * info)
+{
+ int result;
+
+ assert("nikita-484", node != NULL);
+ assert("nikita-1377", znode_invariant(node));
+ assert("jmacd-7771", !znode_above_root(node));
+ assert("nikita-2125", atomic_read(&ZJNODE(node)->x_count) > 0);
+ assert("nikita-3016", reiser4_schedulable());
+
+ if (info)
+ formatted_readahead(node, info);
+
+ result = jload(ZJNODE(node));
+ assert("nikita-1378", znode_invariant(node));
+ return result;
+}
+
+/* load content of node into memory */
+int zload(znode *node)
+{
+#if REISER4_DEBUG
+ int ret;
+ ret = zload_ra(node, NULL);
+ assert("edward-2101", ergo(ret != 0, ret < 0));
+ return ret;
+#else
+ return zload_ra(node, NULL);
+#endif
+}
+
+/* call node plugin to initialise newly allocated node. */
+int zinit_new(znode * node /* znode to initialise */ , gfp_t gfp_flags)
+{
+ return jinit_new(ZJNODE(node), gfp_flags);
+}
+
+/* drop reference to node data. When last reference is dropped, data are
+ unloaded. */
+void zrelse(znode * node /* znode to release references to */ )
+{
+ assert("nikita-1381", znode_invariant(node));
+ jrelse(ZJNODE(node));
+}
+
+/* returns free space in node */
+unsigned znode_free_space(znode * node /* znode to query */ )
+{
+ assert("nikita-852", node != NULL);
+ return node_plugin_by_node(node)->free_space(node);
+}
+
+/* left delimiting key of znode */
+reiser4_key *znode_get_rd_key(znode * node /* znode to query */ )
+{
+ assert("nikita-958", node != NULL);
+ assert_rw_locked(&(znode_get_tree(node)->dk_lock));
+ assert("nikita-3067", LOCK_CNT_GTZ(rw_locked_dk));
+ assert("nikita-30671", node->rd_key_version != 0);
+ return &node->rd_key;
+}
+
+/* right delimiting key of znode */
+reiser4_key *znode_get_ld_key(znode * node /* znode to query */ )
+{
+ assert("nikita-974", node != NULL);
+ assert_rw_locked(&(znode_get_tree(node)->dk_lock));
+ assert("nikita-3068", LOCK_CNT_GTZ(rw_locked_dk));
+ assert("nikita-30681", node->ld_key_version != 0);
+ return &node->ld_key;
+}
+
+ON_DEBUG(atomic_t delim_key_version = ATOMIC_INIT(0);
+ )
+
+/* update right-delimiting key of @node */
+reiser4_key *znode_set_rd_key(znode * node, const reiser4_key * key)
+{
+ assert("nikita-2937", node != NULL);
+ assert("nikita-2939", key != NULL);
+ assert_rw_write_locked(&(znode_get_tree(node)->dk_lock));
+ assert("nikita-3069", LOCK_CNT_GTZ(write_locked_dk));
+ assert("nikita-2944",
+ znode_is_any_locked(node) ||
+ znode_get_level(node) != LEAF_LEVEL ||
+ keyge(key, &node->rd_key) ||
+ keyeq(&node->rd_key, reiser4_min_key()) ||
+ ZF_ISSET(node, JNODE_HEARD_BANSHEE));
+
+ node->rd_key = *key;
+ ON_DEBUG(node->rd_key_version = atomic_inc_return(&delim_key_version));
+ return &node->rd_key;
+}
+
+/* update left-delimiting key of @node */
+reiser4_key *znode_set_ld_key(znode * node, const reiser4_key * key)
+{
+ assert("nikita-2940", node != NULL);
+ assert("nikita-2941", key != NULL);
+ assert_rw_write_locked(&(znode_get_tree(node)->dk_lock));
+ assert("nikita-3070", LOCK_CNT_GTZ(write_locked_dk));
+ assert("nikita-2943",
+ znode_is_any_locked(node) || keyeq(&node->ld_key,
+ reiser4_min_key()));
+
+ node->ld_key = *key;
+ ON_DEBUG(node->ld_key_version = atomic_inc_return(&delim_key_version));
+ return &node->ld_key;
+}
+
+/* true if @key is inside key range for @node */
+int znode_contains_key(znode * node /* znode to look in */ ,
+ const reiser4_key * key /* key to look for */ )
+{
+ assert("nikita-1237", node != NULL);
+ assert("nikita-1238", key != NULL);
+
+ /* left_delimiting_key <= key <= right_delimiting_key */
+ return keyle(znode_get_ld_key(node), key)
+ && keyle(key, znode_get_rd_key(node));
+}
+
+/* same as znode_contains_key(), but lock dk lock */
+int znode_contains_key_lock(znode * node /* znode to look in */ ,
+ const reiser4_key * key /* key to look for */ )
+{
+ int result;
+
+ assert("umka-056", node != NULL);
+ assert("umka-057", key != NULL);
+
+ read_lock_dk(znode_get_tree(node));
+ result = znode_contains_key(node, key);
+ read_unlock_dk(znode_get_tree(node));
+ return result;
+}
+
+/* get parent pointer, assuming tree is not locked */
+znode *znode_parent_nolock(const znode * node /* child znode */ )
+{
+ assert("nikita-1444", node != NULL);
+ return node->in_parent.node;
+}
+
+/* get parent pointer of znode */
+znode *znode_parent(const znode * node /* child znode */ )
+{
+ assert("nikita-1226", node != NULL);
+ assert("nikita-1406", LOCK_CNT_GTZ(rw_locked_tree));
+ return znode_parent_nolock(node);
+}
+
+/* detect uber znode used to protect in-superblock tree root pointer */
+int znode_above_root(const znode * node /* znode to query */ )
+{
+ assert("umka-059", node != NULL);
+
+ return disk_addr_eq(&ZJNODE(node)->blocknr, &UBER_TREE_ADDR);
+}
+
+/* check that @node is root---that its block number is recorded in the tree as
+ that of the root node */
+#if REISER4_DEBUG
+static int znode_is_true_root(const znode *node)
+{
+ assert("umka-060", node != NULL);
+ assert("edward-1803", znode_get_subvol(node) != NULL);
+
+ return disk_addr_eq(znode_get_block(node),
+ &znode_get_tree(node)->root_block);
+}
+#endif
+
+/* check that @node is root */
+int znode_is_root(const znode *node)
+{
+ assert("edward-1804", znode_get_subvol(node) != NULL);
+
+ return znode_get_level(node) == znode_get_tree(node)->height;
+}
+
+/* Returns true if @node was just created by zget() and wasn't ever loaded
+ into memory. */
+/* NIKITA-HANS: yes */
+int znode_just_created(const znode * node)
+{
+ assert("nikita-2188", node != NULL);
+ return (znode_page(node) == NULL);
+}
+
+/* obtain updated ->znode_epoch. See seal.c for description. */
+__u64 znode_build_version(reiser4_tree * tree)
+{
+ __u64 result;
+
+ spin_lock(&tree->epoch_lock);
+ result = ++tree->znode_epoch;
+ spin_unlock(&tree->epoch_lock);
+ return result;
+}
+
+void init_load_count(load_count * dh)
+{
+ assert("nikita-2105", dh != NULL);
+ memset(dh, 0, sizeof *dh);
+}
+
+void done_load_count(load_count * dh)
+{
+ assert("nikita-2106", dh != NULL);
+ if (dh->node != NULL) {
+ for (; dh->d_ref > 0; --dh->d_ref)
+ zrelse(dh->node);
+ dh->node = NULL;
+ }
+}
+
+static int incr_load_count(load_count * dh)
+{
+ int result;
+
+ assert("nikita-2110", dh != NULL);
+ assert("nikita-2111", dh->node != NULL);
+
+ result = zload(dh->node);
+ if (result == 0)
+ ++dh->d_ref;
+ return result;
+}
+
+int incr_load_count_znode(load_count * dh, znode * node)
+{
+ assert("nikita-2107", dh != NULL);
+ assert("nikita-2158", node != NULL);
+ assert("nikita-2109",
+ ergo(dh->node != NULL, (dh->node == node) || (dh->d_ref == 0)));
+
+ dh->node = node;
+ return incr_load_count(dh);
+}
+
+int incr_load_count_jnode(load_count * dh, jnode * node)
+{
+ if (jnode_is_znode(node)) {
+ return incr_load_count_znode(dh, JZNODE(node));
+ }
+ return 0;
+}
+
+void copy_load_count(load_count * new, load_count * old)
+{
+ int ret = 0;
+ done_load_count(new);
+ new->node = old->node;
+ new->d_ref = 0;
+
+ while ((new->d_ref < old->d_ref) && (ret = incr_load_count(new)) == 0) {
+ }
+
+ assert("jmacd-87589", ret == 0);
+}
+
+void move_load_count(load_count * new, load_count * old)
+{
+ done_load_count(new);
+ new->node = old->node;
+ new->d_ref = old->d_ref;
+ old->node = NULL;
+ old->d_ref = 0;
+}
+
+/* convert parent pointer into coord */
+void parent_coord_to_coord(const parent_coord_t * pcoord, coord_t * coord)
+{
+ assert("nikita-3204", pcoord != NULL);
+ assert("nikita-3205", coord != NULL);
+
+ coord_init_first_unit_nocheck(coord, pcoord->node);
+ coord_set_item_pos(coord, pcoord->item_pos);
+ coord->between = AT_UNIT;
+}
+
+/* pack coord into parent_coord_t */
+void coord_to_parent_coord(const coord_t * coord, parent_coord_t * pcoord)
+{
+ assert("nikita-3206", pcoord != NULL);
+ assert("nikita-3207", coord != NULL);
+
+ pcoord->node = coord->node;
+ pcoord->item_pos = coord->item_pos;
+}
+
+/* Initialize a parent hint pointer. (parent hint pointer is a field in znode,
+ look for comments there) */
+void init_parent_coord(parent_coord_t * pcoord, const znode * node)
+{
+ pcoord->node = (znode *) node;
+ pcoord->item_pos = (unsigned short)~0;
+}
+
+#if REISER4_DEBUG
+
+/* debugging aid: znode invariant */
+static int znode_invariant_f(const znode * node /* znode to check */ ,
+ char const **msg /* where to store error
+ * message, if any */ )
+{
+#define _ergo(ant, con) \
+ ((*msg) = "{" #ant "} ergo {" #con "}", ergo((ant), (con)))
+
+#define _equi(e1, e2) \
+ ((*msg) = "{" #e1 "} <=> {" #e2 "}", equi((e1), (e2)))
+
+#define _check(exp) ((*msg) = #exp, (exp))
+
+ return jnode_invariant_f(ZJNODE(node), msg) &&
+ /* [znode-fake] invariant */
+ /* fake znode doesn't have a parent, and */
+ _ergo(znode_get_level(node) == 0, znode_parent(node) == NULL) &&
+ /* there is another way to express this very check, and */
+ _ergo(znode_above_root(node), znode_parent(node) == NULL) &&
+ /* it has special block number, and */
+ _ergo(znode_get_level(node) == 0,
+ disk_addr_eq(znode_get_block(node), &UBER_TREE_ADDR)) &&
+ /* it is the only znode with such block number, and */
+ _ergo(!znode_above_root(node) && znode_is_loaded(node),
+ !disk_addr_eq(znode_get_block(node), &UBER_TREE_ADDR)) &&
+ /* it is parent of the tree root node */
+ _ergo(znode_is_true_root(node),
+ znode_above_root(znode_parent(node))) &&
+ /* [znode-level] invariant */
+ /* level of parent znode is one larger than that of child,
+ except for the fake znode, and */
+ _ergo(znode_parent(node) && !znode_above_root(znode_parent(node)),
+ znode_get_level(znode_parent(node)) ==
+ znode_get_level(node) + 1) &&
+ /* left neighbor is at the same level, and */
+ _ergo(znode_is_left_connected(node) && node->left != NULL,
+ znode_get_level(node) == znode_get_level(node->left)) &&
+ /* right neighbor is at the same level */
+ _ergo(znode_is_right_connected(node) && node->right != NULL,
+ znode_get_level(node) == znode_get_level(node->right)) &&
+ /* [znode-connected] invariant */
+ _ergo(node->left != NULL, znode_is_left_connected(node)) &&
+ _ergo(node->right != NULL, znode_is_right_connected(node)) &&
+ _ergo(!znode_is_root(node) && node->left != NULL,
+ znode_is_right_connected(node->left) &&
+ node->left->right == node) &&
+ _ergo(!znode_is_root(node) && node->right != NULL,
+ znode_is_left_connected(node->right) &&
+ node->right->left == node) &&
+ /* [znode-c_count] invariant */
+ /* for any znode, c_count of its parent is greater than 0 */
+ _ergo(znode_parent(node) != NULL &&
+ !znode_above_root(znode_parent(node)),
+ znode_parent(node)->c_count > 0) &&
+ /* leaves don't have children */
+ _ergo(znode_get_level(node) == LEAF_LEVEL,
+ node->c_count == 0) &&
+ _check(node->zjnode.jnodes.prev != NULL) &&
+ _check(node->zjnode.jnodes.next != NULL) &&
+ /* orphan doesn't have a parent */
+ _ergo(ZF_ISSET(node, JNODE_ORPHAN), znode_parent(node) == 0) &&
+ /* [znode-modify] invariant */
+ /* if znode is not write-locked, its checksum remains
+ * invariant */
+ /* unfortunately, zlock is unordered w.r.t. jnode_lock, so we
+ * cannot check this. */
+ /* [znode-refs] invariant */
+ /* only referenced znode can be long-term locked */
+ _ergo(znode_is_locked(node),
+ atomic_read(&ZJNODE(node)->x_count) != 0);
+}
+
+/*
+ * debugging aid: check znode invariant and panic if it doesn't hold
+ * @node: znode to check
+ */
+int znode_invariant(znode *node)
+{
+ char const *failed_msg;
+ int result;
+
+ assert("umka-063", node != NULL);
+ assert("edward-1805", znode_get_subvol(node) != NULL);
+
+ spin_lock_znode(node);
+ read_lock_tree();
+ result = znode_invariant_f(node, &failed_msg);
+ if (!result) {
+ /* print_znode("corrupted node", node); */
+ warning("jmacd-555", "Condition %s failed", failed_msg);
+ }
+ read_unlock_tree();
+ spin_unlock_znode(node);
+ return result;
+}
+
+/* return non-0 iff data are loaded into znode */
+int znode_is_loaded(const znode * node /* znode to query */ )
+{
+ assert("nikita-497", node != NULL);
+ return jnode_is_loaded(ZJNODE(node));
+}
+
+unsigned long znode_times_locked(const znode * z)
+{
+ return z->times_locked;
+}
+
+#endif /* REISER4_DEBUG */
+
+/* Make Linus happy.
+ Local variables:
+ c-indentation-style: "K&R"
+ mode-name: "LC"
+ c-basic-offset: 8
+ tab-width: 8
+ fill-column: 120
+ End:
+*/
diff -urN --no-dereference linux-5.10.2.orig/fs/reiser4/znode.h linux-5.10.2/fs/reiser4/znode.h
--- linux-5.10.2.orig/fs/reiser4/znode.h 1970-01-01 01:00:00.000000000 +0100
+++ linux-5.10.2/fs/reiser4/znode.h 2020-12-23 16:07:46.138813421 +0100
@@ -0,0 +1,441 @@
+/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by
+ * reiser4/README */
+
+/* Declaration of znode (Zam's node). See znode.c for more details. */
+
+#ifndef __ZNODE_H__
+#define __ZNODE_H__
+
+#include "forward.h"
+#include "debug.h"
+#include "dformat.h"
+#include "key.h"
+#include "coord.h"
+#include "plugin/node/node.h"
+#include "jnode.h"
+#include "lock.h"
+#include "readahead.h"
+
+#include <linux/types.h>
+#include <linux/spinlock.h>
+#include <linux/pagemap.h> /* for PAGE_SIZE */
+#include <asm/atomic.h>
+
+/* znode tracks its position within parent (internal item in a parent node,
+ * that contains znode's block number). */
+typedef struct parent_coord {
+ znode *node;
+ pos_in_node_t item_pos;
+} parent_coord_t;
+
+/* &znode - node in a reiser4 tree.
+
+ NOTE-NIKITA fields in this struct have to be rearranged (later) to reduce
+ cacheline pressure.
+
+ Locking:
+
+ Long term: data in a disk node attached to this znode are protected
+ by long term, deadlock aware lock ->lock;
+
+ Spin lock: the following fields are protected by the spin lock:
+
+ ->lock
+
+ Following fields are protected by the global tree lock:
+
+ ->left
+ ->right
+ ->in_parent
+ ->c_count
+
+ Following fields are protected by the global delimiting key lock (dk_lock):
+
+ ->ld_key (to update ->ld_key long-term lock on the node is also required)
+ ->rd_key
+
+ Following fields are protected by the long term lock:
+
+ ->nr_items
+
+ ->node_plugin is never changed once set. This means that after code made
+ itself sure that field is valid it can be accessed without any additional
+ locking.
+
+ ->level is immutable.
+
+ Invariants involving this data-type:
+
+ [znode-fake]
+ [znode-level]
+ [znode-connected]
+ [znode-c_count]
+ [znode-refs]
+ [jnode-refs]
+ [jnode-queued]
+ [znode-modify]
+
+ For this to be made into a clustering or NUMA filesystem, we would want to eliminate all of the global locks.
+ Suggestions for how to do that are desired.*/
+struct znode {
+ /* Embedded jnode. */
+ jnode zjnode;
+
+ /* contains two subfields, node and item_pos.
+
+ item_pos is only a hint that is cached to speed up lookups during
+ balancing. It is not required to be up to date. Synched in
+ find_child_ptr().
+
+ This value allows us to avoid expensive binary searches.
+
+ in_parent->node points to the parent of this node, and is NOT a
+ hint.
+ */
+ parent_coord_t in_parent;
+
+ /*
+ * sibling list pointers
+ */
+
+ /* left-neighbor */
+ znode *left;
+ /* right-neighbor */
+ znode *right;
+
+ /* long term lock on node content. This lock supports deadlock
+ detection. See lock.c
+ */
+ zlock lock;
+
+ /* You cannot remove from memory a node that has children in
+ memory. This is because we rely on the fact that parent of given
+ node can always be reached without blocking for io. When reading a
+ node into memory you must increase the c_count of its parent, when
+ removing it from memory you must decrease the c_count. This makes
+ the code simpler, and the cases where it is suboptimal are truly
+ obscure.
+ */
+ int c_count;
+
+ /* plugin of node attached to this znode. NULL if znode is not
+ loaded. */
+ node_plugin *nplug;
+
+ /* version of znode data. This is increased on each modification. This
+ * is necessary to implement seals (see seal.[ch]) efficiently. */
+ __u64 version;
+
+ /* left delimiting key. Necessary to efficiently perform
+ balancing with node-level locking. Kept in memory only. */
+ reiser4_key ld_key;
+ /* right delimiting key. */
+ reiser4_key rd_key;
+
+ /* znode's tree level */
+ __u16 level;
+ /* number of items in this node. This field is modified by node
+ * plugin. */
+ __u16 nr_items;
+
+#if REISER4_DEBUG
+ void *creator;
+ reiser4_key first_key;
+ unsigned long times_locked;
+ int left_version; /* when node->left was updated */
+ int right_version; /* when node->right was updated */
+ int ld_key_version; /* when node->ld_key was updated */
+ int rd_key_version; /* when node->rd_key was updated */
+#endif
+
+} __attribute__ ((aligned(16)));
+
+ON_DEBUG(extern atomic_t delim_key_version;
+ )
+
+/* In general I think these macros should not be exposed. */
+#define znode_is_locked(node) (lock_is_locked(&node->lock))
+#define znode_is_rlocked(node) (lock_is_rlocked(&node->lock))
+#define znode_is_wlocked(node) (lock_is_wlocked(&node->lock))
+#define znode_is_wlocked_once(node) (lock_is_wlocked_once(&node->lock))
+#define znode_can_be_rlocked(node) (lock_can_be_rlocked(&node->lock))
+#define is_lock_compatible(node, mode) (lock_mode_compatible(&node->lock, mode))
+/* Macros for accessing the znode state. */
+#define ZF_CLR(p,f) JF_CLR (ZJNODE(p), (f))
+#define ZF_ISSET(p,f) JF_ISSET(ZJNODE(p), (f))
+#define ZF_SET(p,f) JF_SET (ZJNODE(p), (f))
+extern znode *zget(struct reiser4_subvol *subvol,
+ const reiser4_block_nr * const block,
+ znode * parent, tree_level level, gfp_t gfp_flag);
+extern znode *zlook(reiser4_tree * tree, const reiser4_block_nr * const block);
+extern int zload(znode * node);
+extern int zload_ra(znode * node, ra_info_t * info);
+extern int zinit_new(znode * node, gfp_t gfp_flags);
+extern void zrelse(znode * node);
+extern void znode_change_parent(znode * new_parent, reiser4_block_nr * block);
+extern void znode_update_csum(znode *node);
+
+/* size of data in znode */
+static inline unsigned
+znode_size(const znode * node UNUSED_ARG /* znode to query */ )
+{
+ assert("nikita-1416", node != NULL);
+ return PAGE_SIZE;
+}
+
+extern void parent_coord_to_coord(const parent_coord_t * pcoord,
+ coord_t * coord);
+extern void coord_to_parent_coord(const coord_t * coord,
+ parent_coord_t * pcoord);
+extern void init_parent_coord(parent_coord_t * pcoord, const znode * node);
+
+extern unsigned znode_free_space(znode * node);
+
+extern reiser4_key *znode_get_rd_key(znode * node);
+extern reiser4_key *znode_get_ld_key(znode * node);
+
+extern reiser4_key *znode_set_rd_key(znode * node, const reiser4_key * key);
+extern reiser4_key *znode_set_ld_key(znode * node, const reiser4_key * key);
+
+/* `connected' state checks */
+static inline int znode_is_right_connected(const znode * node)
+{
+ return ZF_ISSET(node, JNODE_RIGHT_CONNECTED);
+}
+
+static inline int znode_is_left_connected(const znode * node)
+{
+ return ZF_ISSET(node, JNODE_LEFT_CONNECTED);
+}
+
+static inline int znode_is_connected(const znode * node)
+{
+ return znode_is_right_connected(node) && znode_is_left_connected(node);
+}
+
+extern int znode_shift_order;
+extern int znode_rehash(znode * node, const reiser4_block_nr * new_block_nr);
+extern void znode_remove(znode *);
+extern znode *znode_parent(const znode * node);
+extern znode *znode_parent_nolock(const znode * node);
+extern int znode_above_root(const znode * node);
+extern int init_znode(jnode *node);
+extern int init_znodes(void);
+extern void done_znodes(void);
+extern int znodes_tree_init(reiser4_tree * ztree);
+extern void znodes_tree_done(reiser4_tree * ztree);
+extern int znode_contains_key(znode * node, const reiser4_key * key);
+extern int znode_contains_key_lock(znode * node, const reiser4_key * key);
+extern unsigned znode_save_free_space(znode * node);
+extern unsigned znode_recover_free_space(znode * node);
+extern znode *zalloc(gfp_t gfp_flag);
+extern void zinit(znode *, const znode *parent, struct reiser4_subvol *);
+extern int zparse(znode * node);
+
+extern int znode_just_created(const znode * node);
+
+extern void zfree(znode * node);
+
+#if REISER4_DEBUG
+extern void print_znode(const char *prefix, const znode * node);
+#else
+#define print_znode( p, n ) noop
+#endif
+
+/* Make it look like various znode functions exist instead of treating znodes as
+ jnodes in znode-specific code. */
+#define znode_page(x) jnode_page ( ZJNODE(x) )
+#define zdata(x) jdata ( ZJNODE(x) )
+#define znode_get_block(x) jnode_get_block ( ZJNODE(x) )
+#define znode_created(x) jnode_created ( ZJNODE(x) )
+#define znode_set_created(x) jnode_set_created ( ZJNODE(x) )
+#define znode_convertible(x) jnode_convertible (ZJNODE(x))
+#define znode_set_convertible(x) jnode_set_convertible (ZJNODE(x))
+
+#define znode_is_dirty(x) jnode_is_dirty ( ZJNODE(x) )
+#define znode_check_dirty(x) jnode_check_dirty ( ZJNODE(x) )
+#define znode_make_clean(x) jnode_make_clean ( ZJNODE(x) )
+#define znode_set_block(x, b) jnode_set_block ( ZJNODE(x), (b) )
+
+#define spin_lock_znode(x) spin_lock_jnode ( ZJNODE(x) )
+#define spin_unlock_znode(x) spin_unlock_jnode ( ZJNODE(x) )
+#define spin_trylock_znode(x) spin_trylock_jnode ( ZJNODE(x) )
+#define spin_znode_is_locked(x) spin_jnode_is_locked ( ZJNODE(x) )
+#define spin_znode_is_not_locked(x) spin_jnode_is_not_locked ( ZJNODE(x) )
+
+#if REISER4_DEBUG
+extern int znode_x_count_is_protected(const znode * node);
+extern int znode_invariant(znode * node);
+#endif
+
+/* acquire reference to @node */
+static inline znode *zref(znode * node)
+{
+ /* change of x_count from 0 to 1 is protected by tree spin-lock */
+ return JZNODE(jref(ZJNODE(node)));
+}
+
+/* release reference to @node */
+static inline void zput(znode * node)
+{
+ assert("nikita-3564", znode_invariant(node));
+ jput(ZJNODE(node));
+}
+
+/* get the level field for a znode */
+static inline tree_level znode_get_level(const znode * node)
+{
+ return node->level;
+}
+
+/* get the level field for a jnode */
+static inline tree_level jnode_get_level(const jnode * node)
+{
+ if (jnode_is_znode(node))
+ return znode_get_level(JZNODE(node));
+ else
+ /* unformatted nodes are all at the LEAF_LEVEL and for
+ "semi-formatted" nodes like bitmaps, level doesn't matter. */
+ return LEAF_LEVEL;
+}
+
+/* true if jnode is on leaf level */
+static inline int jnode_is_leaf(const jnode * node)
+{
+ if (jnode_is_znode(node))
+ return (znode_get_level(JZNODE(node)) == LEAF_LEVEL);
+ if (jnode_get_type(node) == JNODE_UNFORMATTED_BLOCK)
+ return 1;
+ return 0;
+}
+
+static inline struct reiser4_subvol *znode_get_subvol(const znode *node)
+{
+ return jnode_get_subvol(ZJNODE(node));
+}
+
+static inline struct super_block *znode_get_super(const znode *node)
+{
+ return jnode_get_super(ZJNODE(node));
+}
+
+#define znode_get_tree(node) (&znode_get_subvol(node)->tree)
+
+/* resolve race with zput */
+static inline znode *znode_rip_check(znode *node)
+{
+ jnode *j;
+
+ j = jnode_rip_sync(ZJNODE(node));
+ if (likely(j != NULL))
+ node = JZNODE(j);
+ else
+ node = NULL;
+ return node;
+}
+
+#if defined(REISER4_DEBUG)
+int znode_is_loaded(const znode * node /* znode to query */ );
+#endif
+
+extern __u64 znode_build_version(reiser4_tree * tree);
+
+/* Data-handles. A data handle object manages pairing calls to zload() and
+ zrelse(). We must load the data for a node in many places. We could do this
+ by simply calling zload() everywhere, but the difficulty arises when we must
+ release the loaded data by calling zrelse. In a function with many possible
+ error/return paths, it requires extra work to figure out which exit paths
+ must call zrelse and which do not. The data handle automatically calls
+ zrelse for every zload that it is responsible for; it acts much like a lock_handle.
+*/
+typedef struct load_count {
+ znode *node;
+ int d_ref;
+} load_count;
+
+extern void init_load_count(load_count * lc); /* Initialize a load_count set the current node to NULL. */
+extern void done_load_count(load_count * dh); /* Finalize a load_count: call zrelse() if necessary */
+extern int incr_load_count_znode(load_count * dh, znode * node); /* Set the argument znode to the current node, call zload(). */
+extern int incr_load_count_jnode(load_count * dh, jnode * node); /* If the argument jnode is formatted, do the same as
+ * incr_load_count_znode, otherwise do nothing (unformatted nodes
+ * don't require zload/zrelse treatment). */
+extern void move_load_count(load_count * new, load_count * old); /* Move the contents of a load_count. Old handle is released. */
+extern void copy_load_count(load_count * new, load_count * old); /* Copy the contents of a load_count. Old handle remains held. */
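+
+/* Typical use of a data handle (an illustrative, non-compiled sketch; the
+ surrounding lookup and error handling are omitted):
+
+     load_count dh;
+     int ret;
+
+     init_load_count(&dh);
+     ret = incr_load_count_znode(&dh, node);     (calls zload(node))
+     if (ret == 0) {
+             ...inspect zdata(node)...
+     }
+     done_load_count(&dh);     (calls zrelse() for every zload it owns)
+     return ret;
+*/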
+
+/* Variable initializers for load_count. */
+#define INIT_LOAD_COUNT ( load_count ){ .node = NULL, .d_ref = 0 }
+#define INIT_LOAD_COUNT_NODE( n ) ( load_count ){ .node = ( n ), .d_ref = 0 }
+/* A convenience macro for use in assertions or debug-only code, where loaded
+ data is only required to perform the debugging check. This macro
+ encapsulates an expression inside a pair of calls to zload()/zrelse(). */
+#define WITH_DATA( node, exp ) \
+({ \
+ long __with_dh_result; \
+ znode *__with_dh_node; \
+ \
+ __with_dh_node = ( node ); \
+ __with_dh_result = zload( __with_dh_node ); \
+ if( __with_dh_result == 0 ) { \
+ __with_dh_result = ( long )( exp ); \
+ zrelse( __with_dh_node ); \
+ } \
+ __with_dh_result; \
+})
+
+/* Same as above, but accepts a return value in case zload fails. */
+#define WITH_DATA_RET( node, ret, exp ) \
+({ \
+ int __with_dh_result; \
+ znode *__with_dh_node; \
+ \
+ __with_dh_node = ( node ); \
+ __with_dh_result = zload( __with_dh_node ); \
+ if( __with_dh_result == 0 ) { \
+ __with_dh_result = ( int )( exp ); \
+ zrelse( __with_dh_node ); \
+ } else \
+ __with_dh_result = ( ret ); \
+ __with_dh_result; \
+})
+
+#define WITH_COORD(coord, exp) \
+({ \
+ coord_t *__coord; \
+ \
+ __coord = (coord); \
+ coord_clear_iplug(__coord); \
+ WITH_DATA(__coord->node, exp); \
+})
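+
+/* Example use of WITH_DATA (illustrative only): evaluate an expression that
+ needs the node data loaded, e.g. in a debug-only check:
+
+     assert("...", WITH_DATA(node, znode_free_space(node) <= znode_size(node)));
+*/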
+
+#if REISER4_DEBUG
+#define STORE_COUNTERS \
+ reiser4_lock_cnt_info __entry_counters = \
+ *reiser4_lock_counters()
+#define CHECK_COUNTERS \
+ON_DEBUG_CONTEXT( \
+({ \
+ __entry_counters.x_refs = reiser4_lock_counters() -> x_refs; \
+ __entry_counters.t_refs = reiser4_lock_counters() -> t_refs; \
+ __entry_counters.d_refs = reiser4_lock_counters() -> d_refs; \
+ assert("nikita-2159", \
+ !memcmp(&__entry_counters, reiser4_lock_counters(), \
+ sizeof __entry_counters)); \
+}) )
+
+#else
+#define STORE_COUNTERS
+#define CHECK_COUNTERS noop
+#endif
+
+/* __ZNODE_H__ */
+#endif
+
+/* Make Linus happy.
+ Local variables:
+ c-indentation-style: "K&R"
+ mode-name: "LC"
+ c-basic-offset: 8
+ tab-width: 8
+ fill-column: 120
+ End:
+*/
diff -urN --no-dereference linux-5.10.2.orig/include/linux/fs.h linux-5.10.2/include/linux/fs.h
--- linux-5.10.2.orig/include/linux/fs.h 2020-12-21 13:30:08.000000000 +0100
+++ linux-5.10.2/include/linux/fs.h 2020-12-23 16:07:46.138813421 +0100
@@ -245,7 +245,7 @@
*/
#define FILESYSTEM_MAX_STACK_DEPTH 2
-/**
+/**
* enum positive_aop_returns - aop return codes with specific semantics
*
* @AOP_WRITEPAGE_ACTIVATE: Informs the caller that page writeback has
@@ -255,7 +255,7 @@
* be a candidate for writeback again in the near
* future. Other callers must be careful to unlock
* the page if they get this return. Returned by
- * writepage();
+ * writepage();
*
* @AOP_TRUNCATED_PAGE: The AOP method that was handed a locked page has
* unlocked it and the page might have been truncated.
@@ -288,6 +288,8 @@
struct address_space;
struct writeback_control;
struct readahead_control;
+struct wb_writeback_work;
+struct bdi_writeback;
/*
* Write life time hint values.
@@ -413,6 +415,7 @@
int (*swap_activate)(struct swap_info_struct *sis, struct file *file,
sector_t *span);
void (*swap_deactivate)(struct file *file);
+ int batch_lock_tabu;
};
extern const struct address_space_operations empty_aops;
@@ -1951,6 +1954,14 @@
int (*remount_fs) (struct super_block *, int *, char *);
void (*umount_begin) (struct super_block *);
+ long (*writeback_inodes)(struct super_block *sb,
+ struct bdi_writeback *wb,
+ struct writeback_control *wbc,
+ struct wb_writeback_work *work,
+ bool flush_all);
+ void (*sync_inodes) (struct super_block *sb,
+ struct writeback_control *wbc);
+
int (*show_options)(struct seq_file *, struct dentry *);
int (*show_devname)(struct seq_file *, struct dentry *);
int (*show_path)(struct seq_file *, struct dentry *);
@@ -2620,6 +2631,13 @@
extern int invalidate_inode_pages2_range(struct address_space *mapping,
pgoff_t start, pgoff_t end);
extern int write_inode_now(struct inode *, int);
+extern void writeback_skip_sb_inodes(struct super_block *sb,
+ struct bdi_writeback *wb);
+extern long generic_writeback_sb_inodes(struct super_block *sb,
+ struct bdi_writeback *wb,
+ struct writeback_control *wbc,
+ struct wb_writeback_work *work,
+ bool flush_all);
extern int filemap_fdatawrite(struct address_space *);
extern int filemap_flush(struct address_space *);
extern int filemap_fdatawait_keep_errors(struct address_space *mapping);
@@ -2855,7 +2873,7 @@
extern ssize_t kernel_write(struct file *, const void *, size_t, loff_t *);
extern ssize_t __kernel_write(struct file *, const void *, size_t, loff_t *);
extern struct file * open_exec(const char *);
-
+
/* fs/dcache.c -- generic fs support functions */
extern bool is_subdir(struct dentry *, struct dentry *);
extern bool path_is_under(const struct path *, const struct path *);
@@ -2865,6 +2883,8 @@
#include <linux/err.h>
/* needed for stackable file system support */
+extern loff_t default_llseek_unlocked(struct file *file, loff_t offset,
+ int whence);
extern loff_t default_llseek(struct file *file, loff_t offset, int whence);
extern loff_t vfs_llseek(struct file *file, loff_t offset, int whence);
@@ -2973,6 +2993,8 @@
extern ssize_t generic_file_direct_write(struct kiocb *, struct iov_iter *);
extern ssize_t generic_perform_write(struct file *, struct iov_iter *, loff_t);
+ssize_t new_sync_read(struct file *filp, char __user *buf, size_t len,
+ loff_t *ppos);
ssize_t vfs_iter_read(struct file *file, struct iov_iter *iter, loff_t *ppos,
rwf_t flags);
ssize_t vfs_iter_write(struct file *file, struct iov_iter *iter, loff_t *ppos,
diff -urN --no-dereference linux-5.10.2.orig/include/linux/miscdevice.h linux-5.10.2/include/linux/miscdevice.h
--- linux-5.10.2.orig/include/linux/miscdevice.h 2020-12-21 13:30:08.000000000 +0100
+++ linux-5.10.2/include/linux/miscdevice.h 2020-12-23 16:07:46.139813436 +0100
@@ -71,6 +71,7 @@
#define USERIO_MINOR 240
#define VHOST_VSOCK_MINOR 241
#define RFKILL_MINOR 242
+#define REISER4_MINOR 243
#define MISC_DYNAMIC_MINOR 255
struct device;
diff -urN --no-dereference linux-5.10.2.orig/include/linux/mm.h linux-5.10.2/include/linux/mm.h
--- linux-5.10.2.orig/include/linux/mm.h 2020-12-21 13:30:08.000000000 +0100
+++ linux-5.10.2/include/linux/mm.h 2020-12-23 16:07:46.139813436 +0100
@@ -1813,6 +1813,7 @@
struct bdi_writeback *wb);
int set_page_dirty(struct page *page);
int set_page_dirty_lock(struct page *page);
+int set_page_dirty_notag(struct page *page);
void __cancel_dirty_page(struct page *page);
static inline void cancel_dirty_page(struct page *page)
{
diff -urN --no-dereference linux-5.10.2.orig/include/linux/writeback.h linux-5.10.2/include/linux/writeback.h
--- linux-5.10.2.orig/include/linux/writeback.h 2020-12-21 13:30:08.000000000 +0100
+++ linux-5.10.2/include/linux/writeback.h 2020-12-23 16:08:55.166816643 +0100
@@ -185,8 +185,27 @@
}
/*
+ * Passed into wb_writeback(), essentially a subset of writeback_control
+ */
+struct wb_writeback_work {
+ long nr_pages;
+ struct super_block *sb;
+ enum writeback_sync_modes sync_mode;
+ unsigned int tagged_writepages:1;
+ unsigned int for_kupdate:1;
+ unsigned int range_cyclic:1;
+ unsigned int for_background:1;
+ unsigned int for_sync:1; /* sync(2) WB_SYNC_ALL writeback */
+ unsigned int auto_free:1; /* free on completion */
+ enum wb_reason reason; /* why was writeback initiated? */
+
+ struct list_head list; /* pending work list */
+ struct wb_completion *done; /* set if the caller waits */
+};
+
+/*
* fs/fs-writeback.c
- */
+ */
struct bdi_writeback;
void writeback_inodes_sb(struct super_block *, enum wb_reason reason);
void writeback_inodes_sb_nr(struct super_block *, unsigned long nr,
diff -urN --no-dereference linux-5.10.2.orig/mm/filemap.c linux-5.10.2/mm/filemap.c
--- linux-5.10.2.orig/mm/filemap.c 2020-12-21 13:30:08.000000000 +0100
+++ linux-5.10.2/mm/filemap.c 2020-12-23 16:07:46.140813451 +0100
@@ -2018,6 +2018,7 @@
return ret;
}
+EXPORT_SYMBOL(find_get_pages_range);
/**
* find_get_pages_contig - gang contiguous pagecache lookup
diff -urN --no-dereference linux-5.10.2.orig/mm/page-writeback.c linux-5.10.2/mm/page-writeback.c
--- linux-5.10.2.orig/mm/page-writeback.c 2020-12-21 13:30:08.000000000 +0100
+++ linux-5.10.2/mm/page-writeback.c 2020-12-23 16:07:46.140813451 +0100
@@ -2528,6 +2528,35 @@
EXPORT_SYMBOL(account_page_redirty);
/*
+ * set_page_dirty_notag() -- similar to __set_page_dirty_nobuffers()
+ * except it doesn't tag the page dirty in the page-cache radix tree.
+ * This means that the address space using this cannot use the regular
+ * filemap ->writepages() helpers and must provide its own means of
+ * tracking and finding non-tagged dirty pages.
+ *
+ * NOTE: furthermore, this version also doesn't handle truncate races.
+ */
+int set_page_dirty_notag(struct page *page)
+{
+ struct address_space *mapping = page->mapping;
+
+ lock_page_memcg(page);
+ if (!TestSetPageDirty(page)) {
+ unsigned long flags;
+ WARN_ON_ONCE(!PagePrivate(page) && !PageUptodate(page));
+ local_irq_save(flags);
+ account_page_dirtied(page, mapping);
+ local_irq_restore(flags);
+ unlock_page_memcg(page);
+ __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
+ return 1;
+ }
+ unlock_page_memcg(page);
+ return 0;
+}
+EXPORT_SYMBOL(set_page_dirty_notag);
+
+/*
* When a writepage implementation decides that it doesn't want to write this
* page for some reason, it should redirty the locked page via
* redirty_page_for_writepage() and it should then unlock the page and return 0
diff -urN --no-dereference linux-5.10.2.orig/mm/truncate.c linux-5.10.2/mm/truncate.c
--- linux-5.10.2.orig/mm/truncate.c 2020-12-21 13:30:08.000000000 +0100
+++ linux-5.10.2/mm/truncate.c 2020-12-23 16:07:46.140813451 +0100
@@ -360,6 +360,13 @@
continue;
}
pagevec_add(&locked_pvec, page);
+ if (mapping->a_ops->batch_lock_tabu)
+ /*
+ * the file system doesn't allow holding
+ * many pages locked while calling
+ * ->invalidatepage() for one of them
+ */
+ break;
}
for (i = 0; i < pagevec_count(&locked_pvec); i++)
truncate_cleanup_page(mapping, locked_pvec.pages[i]);
diff -urN --no-dereference linux-5.10.2.orig/mm/vmscan.c linux-5.10.2/mm/vmscan.c
--- linux-5.10.2.orig/mm/vmscan.c 2020-12-21 13:30:08.000000000 +0100
+++ linux-5.10.2/mm/vmscan.c 2020-12-23 16:07:46.141813465 +0100
@@ -3016,7 +3016,11 @@
pg_data_t *last_pgdat;
struct zoneref *z;
struct zone *zone;
+ void *saved;
retry:
+ saved = current->journal_info; /* save journal info */
+ current->journal_info = NULL;
+
delayacct_freepages_start();
if (!cgroup_reclaim(sc))
@@ -3061,6 +3065,8 @@
}
delayacct_freepages_end();
+ /* restore journal info */
+ current->journal_info = saved;
if (sc->nr_reclaimed)
return sc->nr_reclaimed;