fsck: add systemd service definitions for automatic online service

Add some systemd service files so that bcachefs can automatically fsck
mounted filesystems in the background, hopefully with minimal
disruption to frontend operations.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
This commit is contained in:
Darrick J. Wong 2023-12-04 18:43:31 -08:00 committed by Kent Overstreet
parent 5fa7db9806
commit fbfdd05ac5
7 changed files with 305 additions and 2 deletions

View File

@ -91,9 +91,36 @@ else
ROOT_SBINDIR?=$(PREFIX)/sbin
INITRAMFS_DIR=/etc/initramfs-tools
endif
# Directory that receives the fsck helper script on install; the same value is
# substituted for @libdir@ in the generated unit files.
LIBDIR=$(PREFIX)/lib
# Ask pkg-config for systemd's system unit directory.  If the answer is empty,
# the systemd integration below is skipped with a warning.
PKGCONFIG_SERVICEDIR:=$(shell $(PKG_CONFIG) --variable=systemdsystemunitdir systemd)
ifeq (,$(PKGCONFIG_SERVICEDIR))
$(warning skipping systemd integration)
else
# Arguments substituted for @bcachefsck_args@ in the generated unit files.
BCACHEFSCK_ARGS=-f -n
# Helper programs installed into LIBDIR.
systemd_libfiles=\
fsck/bcachefsck_fail
# Unit and slice files installed into the systemd unit directory.
systemd_services=\
fsck/bcachefsck_fail@.service \
fsck/bcachefsck@.service \
fsck/system-bcachefsck.slice
# Files generated from *.service.in templates; the clean target removes them.
built_scripts+=\
fsck/bcachefsck_fail@.service \
fsck/bcachefsck@.service
# Generate a unit file from its template by filling in the @libdir@ and
# @bcachefsck_args@ placeholders.
%.service: %.service.in
@echo " [SED] $@"
$(Q)sed -e "s|@libdir@|$(LIBDIR)|g" \
-e "s|@bcachefsck_args@|$(BCACHEFSCK_ARGS)|g" < $< > $@
# Hook the systemd files into the prerequisites of the all and install targets.
optional_build+=$(systemd_libfiles) $(systemd_services)
optional_install+=install_systemd
endif # PKGCONFIG_SERVICEDIR
.PHONY: all
all: bcachefs
all: bcachefs $(optional_build)
.PHONY: debug
debug: CFLAGS+=-Werror -DCONFIG_BCACHEFS_DEBUG=y -DCONFIG_VALGRIND=y
@ -157,7 +184,7 @@ cmd_version.o : .version
.PHONY: install
install: INITRAMFS_HOOK=$(INITRAMFS_DIR)/hooks/bcachefs
install: INITRAMFS_SCRIPT=$(INITRAMFS_DIR)/scripts/local-premount/bcachefs
install: bcachefs
install: bcachefs $(optional_install)
$(INSTALL) -m0755 -D bcachefs -t $(DESTDIR)$(ROOT_SBINDIR)
$(INSTALL) -m0644 -D bcachefs.8 -t $(DESTDIR)$(PREFIX)/share/man/man8/
$(INSTALL) -m0755 -D initramfs/script $(DESTDIR)$(INITRAMFS_SCRIPT)
@ -173,11 +200,17 @@ install: bcachefs
sed -i '/^# Note: make install replaces/,$$d' $(DESTDIR)$(INITRAMFS_HOOK)
echo "copy_exec $(ROOT_SBINDIR)/bcachefs /sbin/bcachefs" >> $(DESTDIR)$(INITRAMFS_HOOK)
# Install the helper programs (mode 0755) into LIBDIR and the unit and slice
# files (mode 0644) into the systemd unit directory, both below DESTDIR.
.PHONY: install_systemd
install_systemd: $(systemd_services) $(systemd_libfiles)
$(INSTALL) -m0755 -D $(systemd_libfiles) -t $(DESTDIR)$(LIBDIR)
$(INSTALL) -m0644 -D $(systemd_services) -t $(DESTDIR)$(PKGCONFIG_SERVICEDIR)
# Remove build products: the binary, static library, test helper, version
# stamp, tarballs, objects, dependency files and generated docs, then the Rust
# target directories and the generated files listed in built_scripts.
.PHONY: clean
clean:
@echo "Cleaning all"
$(Q)$(RM) bcachefs libbcachefs.a tests/test_helper .version *.tar.xz $(OBJS) $(DEPS) $(DOCGENERATED)
$(Q)$(RM) -rf rust-src/*/target
$(Q)$(RM) -f $(built_scripts)
.PHONY: deb
deb: all

View File

@ -2,6 +2,8 @@
set -e
#DEBHELPER#
case "$1" in
configure)
if which update-initramfs >/dev/null; then

View File

@ -2,6 +2,8 @@
set -e
#DEBHELPER#
case "$1" in
remove)
if which update-initramfs >/dev/null; then

View File

@ -0,0 +1,98 @@
# SPDX-License-Identifier: GPL-2.0
#
# Copyright (C) 2023-2024 Oracle. All Rights Reserved.
# Author: Darrick J. Wong <djwong@kernel.org>
[Unit]
Description=Online bcachefsck for %f
# When a check fails, start the failure-reporting unit for the same instance.
# This must name the bcachefsck_fail@ template; pointing it at bcachefsck@
# itself would simply re-run the failed check instead of reporting it.
OnFailure=bcachefsck_fail@%i.service
Documentation=man:bcachefs(8)

# Explicitly require the capabilities that this program needs
ConditionCapability=CAP_SYS_ADMIN
ConditionCapability=CAP_FOWNER
ConditionCapability=CAP_DAC_OVERRIDE
ConditionCapability=CAP_DAC_READ_SEARCH
ConditionCapability=CAP_SYS_RAWIO

# Must be a mountpoint
ConditionPathIsMountPoint=%f
RequiresMountsFor=%f

[Service]
Type=oneshot
Environment=SERVICE_MODE=1
# The filesystem is bind mounted at /tmp/scrub (see BindPaths= below); %f is
# the mountpoint as seen in the global mount namespace.
ExecStart=bcachefs fsck --real-mountpoint /tmp/scrub/ @bcachefsck_args@ %f
SyslogIdentifier=%N

# Run scrub with minimal CPU and IO priority so that nothing else will starve.
IOSchedulingClass=idle
CPUSchedulingPolicy=idle
CPUAccounting=true
Nice=19

# Create the service underneath the background service slice so that we can
# control resource usage.
Slice=system-bcachefsck.slice

# No realtime CPU scheduling
RestrictRealtime=true

# Dynamically create a user that isn't root
DynamicUser=true

# Make the entire filesystem readonly and /home inaccessible, then bind mount
# the filesystem we're supposed to be checking into our private /tmp dir.
# 'norbind' means that we don't bind anything under that original mount.
# This enables checking filesystems mounted under /tmp in the global mount
# namespace.
ProtectSystem=strict
ProtectHome=yes
PrivateTmp=true
BindPaths=%f:/tmp/scrub:norbind

# No network access
PrivateNetwork=true
ProtectHostname=true
RestrictAddressFamilies=none
IPAddressDeny=any

# Don't let the program mess with the kernel configuration at all
ProtectKernelLogs=true
ProtectKernelModules=true
ProtectKernelTunables=true
ProtectControlGroups=true
ProtectProc=invisible
RestrictNamespaces=true

# Hide everything in /proc, even /proc/mounts
ProcSubset=pid

# Only allow the default personality Linux
LockPersonality=true

# No writable memory pages
MemoryDenyWriteExecute=true

# Don't let our mounts leak out to the host
PrivateMounts=true

# Restrict system calls to the native arch and only enough to get things going
SystemCallArchitectures=native
SystemCallFilter=@system-service
SystemCallFilter=~@privileged
SystemCallFilter=~@resources
SystemCallFilter=~@mount

# bcachefsck needs these privileges to run, and no others
CapabilityBoundingSet=CAP_SYS_ADMIN CAP_FOWNER CAP_DAC_OVERRIDE CAP_DAC_READ_SEARCH CAP_SYS_RAWIO
AmbientCapabilities=CAP_SYS_ADMIN CAP_FOWNER CAP_DAC_OVERRIDE CAP_DAC_READ_SEARCH CAP_SYS_RAWIO
NoNewPrivileges=true

# bcachefsck doesn't create files
UMask=7777

# No access to hardware /dev files except for block devices
ProtectClock=true
DevicePolicy=closed
DeviceAllow=block-*

63
fsck/bcachefsck_fail Executable file
View File

@ -0,0 +1,63 @@
#!/bin/bash
# SPDX-License-Identifier: GPL-2.0
#
# Copyright (C) 2023-2024 Oracle. All Rights Reserved.
# Author: Darrick J. Wong <djwong@kernel.org>
# Email logs of failed bcachefsck and bcachefsck_all unit runs
# Positional arguments: recipient address, name of the failed service, and an
# optional mountpoint.  Without a recipient or a service there is nothing to
# report, so exit successfully.
recipient="$1"
test -z "${recipient}" && exit 0
service="$2"
test -z "${service}" && exit 0
mntpoint="$3"

# Prefer the fully qualified host name; fall back to the shell's HOSTNAME.
hostname="$(hostname -f 2>/dev/null)"
test -z "${hostname}" && hostname="${HOSTNAME}"

# command -v prints nothing when sendmail cannot be found, so the program is
# named explicitly in the error message rather than via the empty ${mailer}.
mailer="$(command -v sendmail)"
if [ ! -x "${mailer}" ]; then
	echo "sendmail: Mailer program not found."
	exit 1
fi
# Write a complete mail message to stdout for a failed per-mountpoint service:
# headers, the mandatory empty separator line, a short body, and then the
# status and full log of the failed unit instance.
# Reads the globals recipient, service, hostname and mntpoint.
fail_mail_mntpoint() {
	local scrub_svc

	# Turn the mountpoint into a properly escaped systemd instance name
	scrub_svc="$(systemd-escape --template "${service}@.service" --path "${mntpoint}")"

	# The empty line after the last header ends the header section; without
	# it the body text would be parsed as part of the headers.
	cat << ENDL
To: ${recipient}
From: <${service}@${hostname}>
Subject: ${service} failure on ${mntpoint}
Content-Transfer-Encoding: 8bit
Content-Type: text/plain; charset=UTF-8

So sorry, the automatic ${service} of ${mntpoint} on ${hostname} failed.
Please do not reply to this message.
A log of what happened follows:
ENDL
	systemctl status --full --lines 4294967295 "${scrub_svc}"
}
# Write a complete mail message to stdout for a failed service that is not
# tied to a mountpoint: headers, the mandatory empty separator line, a short
# body, and then the status and full log of the service.
# Reads the globals recipient, service and hostname.
fail_mail() {
	# The empty line after the last header ends the header section; without
	# it the body text would be parsed as part of the headers.
	cat << ENDL
To: ${recipient}
From: <${service}@${hostname}>
Subject: ${service} failure

So sorry, the automatic ${service} on ${hostname} failed.
A log of what happened follows:
ENDL
	systemctl status --full --lines 4294967295 "${service}"
}
# Pick the per-mountpoint or the generic report depending on whether a
# mountpoint was given, and pipe it to the mailer.  With -t the mailer takes
# the recipients from the message headers.
if [ -n "${mntpoint}" ]; then
fail_mail_mntpoint | "${mailer}" -t -i
else
fail_mail | "${mailer}" -t -i
fi
# Exit with the status of the second pipeline stage, i.e. the mailer, rather
# than that of the report generator.
exit "${PIPESTATUS[1]}"

View File

@ -0,0 +1,75 @@
# SPDX-License-Identifier: GPL-2.0
#
# Copyright (C) 2023-2024 Oracle. All Rights Reserved.
# Author: Darrick J. Wong <djwong@kernel.org>
[Unit]
Description=Online bcachefsck Failure Reporting for %f
Documentation=man:bcachefs(8)

[Service]
Type=oneshot
Environment=EMAIL_ADDR=root
# The second argument is the template name of the service that failed.  The
# helper expands it to <name>@<escaped mountpoint>.service to fetch the log,
# so it must be "bcachefsck" to match bcachefsck@.service.
ExecStart=@libdir@/bcachefsck_fail "${EMAIL_ADDR}" bcachefsck %f
User=mail
Group=mail
SupplementaryGroups=systemd-journal

# Create the service underneath the background service slice so that we can
# control resource usage.
Slice=system-bcachefsck.slice

# No realtime scheduling
RestrictRealtime=true

# Make /usr, /boot, /efi and /etc read-only and /home inaccessible.  This is
# deliberately "full" rather than "strict", presumably so that the mailer can
# still write to its spool directory -- TODO confirm.
ProtectSystem=full
ProtectHome=yes
PrivateTmp=true
RestrictSUIDSGID=true

# Emailing reports requires network access, but not the ability to change the
# hostname.
ProtectHostname=true

# Don't let the program mess with the kernel configuration at all
ProtectKernelLogs=true
ProtectKernelModules=true
ProtectKernelTunables=true
ProtectControlGroups=true
ProtectProc=invisible
RestrictNamespaces=true

# Can't hide /proc because journalctl needs it to find various pieces of log
# information
#ProcSubset=pid

# Only allow the default personality Linux
LockPersonality=true

# No writable memory pages
MemoryDenyWriteExecute=true

# Don't let our mounts leak out to the host
PrivateMounts=true

# Restrict system calls to the native arch and only enough to get things going
SystemCallArchitectures=native
SystemCallFilter=@system-service
SystemCallFilter=~@privileged
SystemCallFilter=~@resources
SystemCallFilter=~@mount

# Failure reporting needs no capabilities at all
CapabilityBoundingSet=
NoNewPrivileges=true

# Failure reporting shouldn't create world-readable files
UMask=0077

# Clean up any IPC objects when this unit stops
RemoveIPC=true

# No access to hardware device files
PrivateDevices=true
ProtectClock=true

View File

@ -0,0 +1,30 @@
# SPDX-License-Identifier: GPL-2.0
#
# Copyright (C) 2023-2024 Oracle. All Rights Reserved.
# Author: Darrick J. Wong <djwong@kernel.org>
[Unit]
Description=bcachefsck background service slice
Before=slices.target

[Slice]
# Account for CPU time and, where the CPU cgroup controller is available, cap
# all background processes in this slice at 60% of one core combined.
CPUAccounting=true
CPUQuota=60%

[Install]
# Why the quota lives in a dedicated slice file rather than in the service
# units:
#
# As of systemd 249, the cgroupv2 setup code drops resource controllers from
# the root and system.slice cgroups at startup when no direct dependency
# requires them.  Units with resource control directives that are activated
# later are placed under the system slice, but they do not trigger a
# reconfiguration of that slice's controllers.  A CPUQuota= placed directly in
# the bcachefsck service units would therefore not take effect.
#
# To make CPUQuota= effective, the slice that holds the bcachefsck instance
# units (e.g. bcachefsck@.service) gets this explicit definition file, and the
# slice is made a dependency of the system slice so that the direct dependency
# relation exists.
WantedBy=system.slice