diff --git a/Makefile b/Makefile
index aa7341e0..fa9f2ff2 100644
--- a/Makefile
+++ b/Makefile
@@ -91,9 +91,36 @@ else
 	ROOT_SBINDIR?=$(PREFIX)/sbin
 	INITRAMFS_DIR=/etc/initramfs-tools
 endif
+LIBDIR=$(PREFIX)/lib
+
+PKGCONFIG_SERVICEDIR:=$(shell $(PKG_CONFIG) --variable=systemdsystemunitdir systemd)
+ifeq (,$(PKGCONFIG_SERVICEDIR))
+  $(warning skipping systemd integration)
+else
+BCACHEFSCK_ARGS=-f -n
+systemd_libfiles=\
+	fsck/bcachefsck_fail
+
+systemd_services=\
+	fsck/bcachefsck_fail@.service \
+	fsck/bcachefsck@.service \
+	fsck/system-bcachefsck.slice
+
+built_scripts+=\
+	fsck/bcachefsck_fail@.service \
+	fsck/bcachefsck@.service
+
+%.service: %.service.in
+	@echo " [SED] $@"
+	$(Q)sed -e "s|@libdir@|$(LIBDIR)|g" \
+		-e "s|@bcachefsck_args@|$(BCACHEFSCK_ARGS)|g" < $< > $@
+
+optional_build+=$(systemd_libfiles) $(systemd_services)
+optional_install+=install_systemd
+endif # PKGCONFIG_SERVICEDIR
 
 .PHONY: all
-all: bcachefs
+all: bcachefs $(optional_build)
 
 .PHONY: debug
 debug: CFLAGS+=-Werror -DCONFIG_BCACHEFS_DEBUG=y -DCONFIG_VALGRIND=y
@@ -157,7 +184,7 @@ cmd_version.o : .version
 .PHONY: install
 install: INITRAMFS_HOOK=$(INITRAMFS_DIR)/hooks/bcachefs
 install: INITRAMFS_SCRIPT=$(INITRAMFS_DIR)/scripts/local-premount/bcachefs
-install: bcachefs
+install: bcachefs $(optional_install)
 	$(INSTALL) -m0755 -D bcachefs      -t $(DESTDIR)$(ROOT_SBINDIR)
 	$(INSTALL) -m0644 -D bcachefs.8    -t $(DESTDIR)$(PREFIX)/share/man/man8/
 	$(INSTALL) -m0755 -D initramfs/script $(DESTDIR)$(INITRAMFS_SCRIPT)
@@ -173,11 +200,17 @@ install: bcachefs
 	sed -i '/^# Note: make install replaces/,$$d' $(DESTDIR)$(INITRAMFS_HOOK)
 	echo "copy_exec $(ROOT_SBINDIR)/bcachefs /sbin/bcachefs" >> $(DESTDIR)$(INITRAMFS_HOOK)
 
+.PHONY: install_systemd
+install_systemd: $(systemd_services) $(systemd_libfiles)
+	$(INSTALL) -m0755 -D $(systemd_libfiles) -t $(DESTDIR)$(LIBDIR)
+	$(INSTALL) -m0644 -D $(systemd_services) -t $(DESTDIR)$(PKGCONFIG_SERVICEDIR)
+
 .PHONY: clean
 clean:
 	@echo "Cleaning all"
 	$(Q)$(RM) bcachefs libbcachefs.a tests/test_helper .version *.tar.xz $(OBJS) $(DEPS) $(DOCGENERATED)
 	$(Q)$(RM) -rf rust-src/*/target
+	$(Q)$(RM) -f $(built_scripts)
 
 .PHONY: deb
 deb: all
diff --git a/debian/bcachefs-tools.postinst b/debian/bcachefs-tools.postinst
index 483b9619..56dd8905 100644
--- a/debian/bcachefs-tools.postinst
+++ b/debian/bcachefs-tools.postinst
@@ -2,6 +2,8 @@
 
 set -e
 
+#DEBHELPER#
+
 case "$1" in
     configure)
         if which update-initramfs >/dev/null; then
diff --git a/debian/bcachefs-tools.postrm b/debian/bcachefs-tools.postrm
index 6b6fe8ac..2d913367 100644
--- a/debian/bcachefs-tools.postrm
+++ b/debian/bcachefs-tools.postrm
@@ -2,6 +2,8 @@
 
 set -e
 
+#DEBHELPER#
+
 case "$1" in
     remove)
         if which update-initramfs >/dev/null; then
diff --git a/fsck/bcachefsck@.service.in b/fsck/bcachefsck@.service.in
new file mode 100644
index 00000000..86c1824c
--- /dev/null
+++ b/fsck/bcachefsck@.service.in
@@ -0,0 +1,98 @@
+# SPDX-License-Identifier: GPL-2.0
+#
+# Copyright (C) 2023-2024 Oracle. All Rights Reserved.
+# Author: Darrick J. Wong
+
+[Unit]
+Description=Online bcachefsck for %f
+OnFailure=bcachefsck_fail@%i.service
+Documentation=man:bcachefs(8)
+
+# Explicitly require the capabilities that this program needs
+ConditionCapability=CAP_SYS_ADMIN
+ConditionCapability=CAP_FOWNER
+ConditionCapability=CAP_DAC_OVERRIDE
+ConditionCapability=CAP_DAC_READ_SEARCH
+ConditionCapability=CAP_SYS_RAWIO
+
+# Must be a mountpoint
+ConditionPathIsMountPoint=%f
+RequiresMountsFor=%f
+
+[Service]
+Type=oneshot
+Environment=SERVICE_MODE=1
+ExecStart=bcachefs fsck --real-mountpoint /tmp/scrub/ @bcachefsck_args@ %f
+SyslogIdentifier=%N
+
+# Run scrub with minimal CPU and IO priority so that nothing else will starve.
+IOSchedulingClass=idle
+CPUSchedulingPolicy=idle
+CPUAccounting=true
+Nice=19
+
+# Create the service underneath the background service slice so that we can
+# control resource usage.
+Slice=system-bcachefsck.slice
+
+# No realtime CPU scheduling
+RestrictRealtime=true
+
+# Dynamically create a user that isn't root
+DynamicUser=true
+
+# Make the entire filesystem readonly and /home inaccessible, then bind mount
+# the filesystem we're supposed to be checking into our private /tmp dir.
+# 'norbind' means that we don't bind anything under that original mount.
+# This enables checking filesystems mounted under /tmp in the global mount
+# namespace.
+ProtectSystem=strict
+ProtectHome=yes
+PrivateTmp=true
+BindPaths=%f:/tmp/scrub:norbind
+
+# No network access
+PrivateNetwork=true
+ProtectHostname=true
+RestrictAddressFamilies=none
+IPAddressDeny=any
+
+# Don't let the program mess with the kernel configuration at all
+ProtectKernelLogs=true
+ProtectKernelModules=true
+ProtectKernelTunables=true
+ProtectControlGroups=true
+ProtectProc=invisible
+RestrictNamespaces=true
+
+# Hide everything in /proc, even /proc/mounts
+ProcSubset=pid
+
+# Only allow the default personality Linux
+LockPersonality=true
+
+# No writable memory pages
+MemoryDenyWriteExecute=true
+
+# Don't let our mounts leak out to the host
+PrivateMounts=true
+
+# Restrict system calls to the native arch and only enough to get things going
+SystemCallArchitectures=native
+SystemCallFilter=@system-service
+SystemCallFilter=~@privileged
+SystemCallFilter=~@resources
+SystemCallFilter=~@mount
+
+# bcachefsck needs these privileges to run, and no others
+CapabilityBoundingSet=CAP_SYS_ADMIN CAP_FOWNER CAP_DAC_OVERRIDE CAP_DAC_READ_SEARCH CAP_SYS_RAWIO
+AmbientCapabilities=CAP_SYS_ADMIN CAP_FOWNER CAP_DAC_OVERRIDE CAP_DAC_READ_SEARCH CAP_SYS_RAWIO
+NoNewPrivileges=true
+
+# bcachefsck doesn't create files
+UMask=7777
+
+# No access to hardware /dev files except for block devices
+ProtectClock=true
+DevicePolicy=closed
+DeviceAllow=block-*
diff --git a/fsck/bcachefsck_fail b/fsck/bcachefsck_fail
new file mode 100755
index 00000000..283cee70
--- /dev/null
+++ b/fsck/bcachefsck_fail
@@ -0,0 +1,64 @@
+#!/bin/bash
+
+# SPDX-License-Identifier: GPL-2.0
+#
+# Copyright (C) 2023-2024 Oracle. All Rights Reserved.
+# Author: Darrick J. Wong
+
+# Email logs of failed bcachefsck and bcachefsck_all unit runs
+# Usage: bcachefsck_fail <recipient> <service> [<mountpoint>]
+
+recipient="$1"
+test -z "${recipient}" && exit 0
+service="$2"
+test -z "${service}" && exit 0
+mntpoint="$3"
+
+hostname="$(hostname -f 2>/dev/null)"
+test -z "${hostname}" && hostname="${HOSTNAME}"
+
+mailer="$(command -v sendmail)"
+if [ ! -x "${mailer}" ]; then
+	echo "sendmail: Mailer program not found." >&2
+	exit 1
+fi
+
+fail_mail_mntpoint() {
+	local scrub_svc
+
+	# Turn the mountpoint into a properly escaped systemd instance name
+	scrub_svc="$(systemd-escape --template "${service}@.service" --path "${mntpoint}")"
+	cat << ENDL
+To: ${recipient}
+From: <${service}@${hostname}>
+Subject: ${service} failure on ${mntpoint}
+Content-Transfer-Encoding: 8bit
+Content-Type: text/plain; charset=UTF-8
+
+So sorry, the automatic ${service} of ${mntpoint} on ${hostname} failed.
+Please do not reply to this message.
+
+A log of what happened follows:
+ENDL
+	systemctl status --full --lines 4294967295 "${scrub_svc}"
+}
+
+fail_mail() {
+	cat << ENDL
+To: ${recipient}
+From: <${service}@${hostname}>
+Subject: ${service} failure
+
+So sorry, the automatic ${service} on ${hostname} failed.
+
+A log of what happened follows:
+ENDL
+	systemctl status --full --lines 4294967295 "${service}"
+}
+
+if [ -n "${mntpoint}" ]; then
+	fail_mail_mntpoint | "${mailer}" -t -i
+else
+	fail_mail | "${mailer}" -t -i
+fi
+exit "${PIPESTATUS[1]}"
diff --git a/fsck/bcachefsck_fail@.service.in b/fsck/bcachefsck_fail@.service.in
new file mode 100644
index 00000000..369a809a
--- /dev/null
+++ b/fsck/bcachefsck_fail@.service.in
@@ -0,0 +1,75 @@
+# SPDX-License-Identifier: GPL-2.0
+#
+# Copyright (C) 2023-2024 Oracle. All Rights Reserved.
+# Author: Darrick J. Wong
+
+[Unit]
+Description=Online bcachefsck Failure Reporting for %f
+Documentation=man:bcachefs(8)
+
+[Service]
+Type=oneshot
+Environment=EMAIL_ADDR=root
+ExecStart=@libdir@/bcachefsck_fail "${EMAIL_ADDR}" bcachefsck %f
+User=mail
+Group=mail
+SupplementaryGroups=systemd-journal
+
+# Create the service underneath the background service slice so that we can
+# control resource usage.
+Slice=system-bcachefsck.slice
+
+# No realtime scheduling
+RestrictRealtime=true
+
+# Make the entire filesystem readonly and /home inaccessible.
+ProtectSystem=full
+ProtectHome=yes
+PrivateTmp=true
+RestrictSUIDSGID=true
+
+# Emailing reports requires network access, but not the ability to change the
+# hostname.
+ProtectHostname=true
+
+# Don't let the program mess with the kernel configuration at all
+ProtectKernelLogs=true
+ProtectKernelModules=true
+ProtectKernelTunables=true
+ProtectControlGroups=true
+ProtectProc=invisible
+RestrictNamespaces=true
+
+# Can't hide /proc because journalctl needs it to find various pieces of log
+# information
+#ProcSubset=pid
+
+# Only allow the default personality Linux
+LockPersonality=true
+
+# No writable memory pages
+MemoryDenyWriteExecute=true
+
+# Don't let our mounts leak out to the host
+PrivateMounts=true
+
+# Restrict system calls to the native arch and only enough to get things going
+SystemCallArchitectures=native
+SystemCallFilter=@system-service
+SystemCallFilter=~@privileged
+SystemCallFilter=~@resources
+SystemCallFilter=~@mount
+
+# Failure reporting runs with an empty capability bounding set
+CapabilityBoundingSet=
+NoNewPrivileges=true
+
+# Failure reporting shouldn't create world-readable files
+UMask=0077
+
+# Clean up any IPC objects when this unit stops
+RemoveIPC=true
+
+# No access to hardware device files
+PrivateDevices=true
+ProtectClock=true
diff --git a/fsck/system-bcachefsck.slice b/fsck/system-bcachefsck.slice
new file mode 100644
index 00000000..ea368032
--- /dev/null
+++ b/fsck/system-bcachefsck.slice
@@ -0,0 +1,30 @@
+# SPDX-License-Identifier: GPL-2.0
+#
+# Copyright (C) 2023-2024 Oracle. All Rights Reserved.
+# Author: Darrick J. Wong
+
+[Unit]
+Description=bcachefsck background service slice
+Before=slices.target
+
+[Slice]
+
+# If the CPU usage cgroup controller is available, don't use more than 60% of a
+# single core for all background processes.
+CPUQuota=60%
+CPUAccounting=true
+
+[Install]
+# As of systemd 249, the systemd cgroupv2 configuration code will drop resource
+# controllers from the root and system.slice cgroups at startup if it doesn't
+# find any direct dependencies that require a given controller. Newly
+# activated units with resource control directives are created under the system
+# slice but do not cause a reconfiguration of the slice's resource controllers.
+# Hence we cannot put CPUQuota= into the bcachefsck service units directly.
+#
+# For the CPUQuota directive to have any effect, we must therefore create an
+# explicit definition file for the slice that systemd creates to contain the
+# bcachefsck instance units (e.g. bcachefsck@.service) and we must configure this
+# slice as a dependency of the system slice to establish the direct dependency
+# relation.
+WantedBy=system.slice