1#!/bin/bash
2
3#  Copyright (C) 2018 Oracle.  All Rights Reserved.
4#
5#  Author: Darrick J. Wong <darrick.wong@oracle.com>
6#
7#  This program is free software; you can redistribute it and/or
8#  modify it under the terms of the GNU General Public License
9#  as published by the Free Software Foundation; either version 2
10#  of the License, or (at your option) any later version.
11#
12#  This program is distributed in the hope that it would be useful,
13#  but WITHOUT ANY WARRANTY; without even the implied warranty of
14#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15#  GNU General Public License for more details.
16#
17#  You should have received a copy of the GNU General Public License
18#  along with this program; if not, write the Free Software Foundation,
19#  Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301, USA.
20
21# Automatically check an LVM-managed filesystem online.
22# We use lvm snapshots to do this, which means that we can only
23# check filesystems in VGs that have at least 256MB (or so) of
24# free space.
25
# Use a fixed command search path instead of whatever the caller exported.
PATH=/usr/local/sbin:/usr/local/bin:/sbin:/bin:/usr/sbin:/usr/bin

# Refuse to run unless the effective user is root.
if [ "${EUID}" -ne 0 ]; then
    echo "e2scrub must be run as root"
    exit 1
fi

# Built-in defaults.  The configuration file sourced below may override
# any of them, and the command-line switches may override them again.
snap_size_mb=256	# snapshot size, passed to "lvcreate -L" with an "m" suffix
fstrim=0		# 1 = run fstrim after a clean check (-t)
reap=0			# 1 = only remove a leftover snapshot (-r)
e2fsck_opts=""		# extra options added to every e2fsck invocation
conffile="@root_sysconfdir@/e2scrub.conf"

# The configuration file is optional; source it only if it is a regular file.
if [ -f "${conffile}" ]; then
	. "${conffile}"
fi
40
# Print the usage synopsis and the list of supported switches to stdout.
# $0 is expanded so the synopsis shows the name the script was invoked as.
print_help() {
	printf '%s\n' \
		"Usage: $0 [OPTIONS] mountpoint | device" \
		"" \
		"mountpoint must be on an LVM-managed block device" \
		"-n: Show what commands e2scrub would execute." \
		"-r: Remove e2scrub snapshot and exit, do not check anything." \
		"-t: Run fstrim if successful." \
		"-V: Print version information and exit."
}
50
# Print the program name with the version and date placeholders that are
# filled in when this template is processed at build time.
print_version() {
	printf '%s\n' "e2scrub @E2FSPROGS_VERSION@ (@E2FSPROGS_DATE@)"
}
54
# Terminate the script with the given status.
#
# Arguments: $1 - the exit status the caller wants to report
#
# When SERVICE_MODE is non-empty the status is adjusted for service use:
# every non-zero status is collapsed to 1 ("generic or unspecified error",
# LSB 5.0 section 22.2), leaving the details to the log.
exitcode() {
	local status="$1"

	if [ -n "${SERVICE_MODE}" ]; then
		if [ "${status}" -ne 0 ]; then
			status=1
		fi

		# journald ties log messages to the systemd service through
		# our pid, and the failure-report service collects messages
		# by service name.  Linger for two seconds so that the final
		# messages are still attributed to us before the pid goes away.
		sleep 2
	fi

	exit "${status}"
}
76
# Parse the command-line switches.  Anything getopts does not recognise
# prints the usage text and exits with status 2.
while getopts "nrtV" opt; do
	case "${opt}" in
	n)
		# Dry run: every command prefixed with ${DBG} is echoed
		# instead of executed.
		DBG="echo Would execute: "
		;;
	r)
		reap=1
		;;
	t)
		fstrim=1
		;;
	V)
		print_version
		exitcode 0
		;;
	*)
		print_help
		exitcode 2
		;;
	esac
done

# Drop the switches so that $1 is the mountpoint/device operand.
shift "$(( OPTIND - 1 ))"
87
# The single operand is either a mountpoint or a block device; it is required.
arg="$1"
[ -n "${arg}" ] || {
	print_help
	exitcode 1
}

# Both lsblk and lvcreate are needed further down; fail early with a hint
# about the package that provides whichever one is missing.
if ! type lsblk > /dev/null 2>&1; then
    echo "e2scrub: can't find lsblk --- is util-linux installed?"
    exitcode 1
fi

if ! type lvcreate > /dev/null 2>&1; then
    echo "e2scrub: can't find lvcreate --- is lvm2 installed?"
    exitcode 1
fi

# close file descriptor 3 (from cron) since it causes lvm to kvetch
exec 3<&-
106
# Find the ext2/3/4 block device that is mounted at a given mountpoint.
#
# Arguments: $1 - mountpoint path; canonicalised with realpath before lookup
# Outputs:   the device path (lsblk NAME column, full path) on stdout
# Returns:   0 if an ext[234] filesystem is mounted there, 1 otherwise
dev_from_mount() {
	local mountpt
	# lsblk -P prints NAME="..." FSTYPE="..." MOUNTPOINT="..." pairs that
	# are eval'ed below; keep the resulting variables out of global scope.
	local vars NAME FSTYPE MOUNTPOINT

	mountpt="$(realpath "$1")"

	# If the path could not be resolved, an empty ${mountpt} would compare
	# equal to the empty MOUNTPOINT of every *unmounted* device and pick an
	# unrelated filesystem.  Report "not found" instead.
	if [ -z "${mountpt}" ]; then
		return 1
	fi

	# The loop reads from a process substitution rather than a pipe: the
	# body of a piped loop runs in a subshell, where "return 0" only ends
	# that subshell and the function would go on to return 1 regardless.
	while read -r vars; do
		eval "${vars}"
		if [ "${mountpt}" != "${MOUNTPOINT}" ]; then
			continue
		fi
		case "${FSTYPE}" in
		ext[234])
			echo "${NAME}"
			return 0
			;;
		esac
	done < <(lsblk -o NAME,FSTYPE,MOUNTPOINT -p -P -n 2> /dev/null)
	return 1
}
125
# Check a device argument.
#
# Arguments: $1 - path of a block device
# Outputs:   the same path on stdout when lsblk reports its filesystem
#            type as ext2, ext3 or ext4
# Returns:   0 in that case, 1 for any other lsblk answer
dev_from_arg() {
	local dev="$1"
	local fstype

	fstype="$(lsblk -o FSTYPE -n "${dev}" 2> /dev/null)"

	# Guard clause: anything that is not exactly ext2/ext3/ext4 is rejected.
	case "${fstype}" in
	ext[234])
		;;
	*)
		return 1
		;;
	esac

	echo "${dev}"
	return 0
}
139
# Print the mountpoint lsblk reports for a block device.
#
# Arguments: $1 - block device path; may be empty
# Outputs:   lsblk's MOUNTPOINT column for the device; nothing for an
#            empty argument
# Returns:   0 for an empty argument, otherwise lsblk's exit status
mnt_from_dev() {
	local dev="$1"

	[ -n "${dev}" ] || return 0

	lsblk -o MOUNTPOINT -n "${dev}"
}
147
# Construct block device path and mountpoint from argument
if [ ! -b "${arg}" ]; then
	# Not a block device node: treat the argument as a mountpoint and
	# look up the device mounted there.
	mnt="${arg}"
	dev="$(dev_from_mount "${arg}")"
else
	# A block device node: validate it, then ask lsblk where (if
	# anywhere) it is mounted.
	dev="$(dev_from_arg "${arg}")"
	mnt="$(mnt_from_dev "${dev}")"
fi

# Both lookup helpers print nothing when no ext[234] filesystem was found,
# which leaves ${dev} empty and makes this existence test fail.
[ -e "${dev}" ] || {
	echo "${arg}: Not an ext[234] filesystem."
	print_help
	exitcode 16
}
161
# Make sure this is an LVM device we can snapshot

# Start from empty values so that LVM2_* variables inherited from the
# environment cannot satisfy the check below when lvs prints nothing
# (for example because ${dev} is not a logical volume).
LVM2_LV_NAME=""
LVM2_VG_NAME=""
LVM2_LV_ROLE=""

# With --nameprefixes, lvs prints LVM2_LV_NAME=... LVM2_VG_NAME=...
# LVM2_LV_ROLE=... assignments, which are loaded into this shell via eval.
lvm_vars="$(lvs --nameprefixes -o name,vgname,lv_role --noheadings "${dev}" 2> /dev/null)"
eval "${lvm_vars}"

# Reject devices that have no VG/LV name, and devices whose role mentions
# "snapshot".
if [ -z "${LVM2_VG_NAME}" ] || [ -z "${LVM2_LV_NAME}" ] ||
   echo "${LVM2_LV_ROLE}" | grep -q "snapshot"; then
	echo "${arg}: Not connected to an LVM logical volume."
	print_help
	exitcode 16
fi

# Timestamp of the start of the scrub (handed to tune2fs -T by mark_clean)
# and the name and device path of the snapshot that gets checked.
start_time="$(date +'%Y%m%d%H%M%S')"
snap="${LVM2_LV_NAME}.e2scrub"
snap_dev="/dev/${LVM2_VG_NAME}/${snap}"
174
# Remove the e2scrub snapshot, retrying while the removal keeps failing.
#
# Globals:  DBG, LVM2_VG_NAME, snap, snap_dev (all read)
# Returns:  0; callers find out whether the snapshot is gone by looking
#           at ${snap_dev}
#
# The lvremove status is saved right after each attempt.  Testing "$?"
# inside the loop condition would read the status of the preceding
# [ -e ... ] test instead, and the retry would never happen.
teardown() {
	local rc deadline

	# Retry for at most 30 seconds (the same limit setup() uses), so a
	# snapshot that somebody keeps open cannot stall the exit/signal
	# trap this function is called from.
	deadline="$(( $(date "+%s") + 30 ))"

	${DBG} lvremove -f "${LVM2_VG_NAME}/${snap}"
	rc=$?
	# Only status 5 with the device node still present is treated as
	# "try again"; any other outcome ends the attempts.
	while [ -e "${snap_dev}" ] && [ "${rc}" -eq 5 ] &&
	      [ "$(date "+%s")" -lt "${deadline}" ]; do
		sleep 0.5
		${DBG} lvremove -f "${LVM2_VG_NAME}/${snap}"
		rc=$?
	done
}
183
# Run e2fsck against the snapshot in two passes.
#
# Globals:  DBG, e2fsck_opts, snap_dev (read); E2FSCK_FIXES_ONLY (exported)
# Returns:  the status of the journal-only pass if that is non-zero,
#           otherwise the status of the full forced pass
#
# The journal is recovered first; the second pass then shows whether
# e2fsck attempts any non-optimization repairs.  A non-zero status from
# either pass (errors fixed or remaining) means the filesystem is bad.
check() {
	local fsck="@root_sbindir@/e2fsck"
	local journal_rc=0

	export E2FSCK_FIXES_ONLY=1

	# ${e2fsck_opts} is deliberately unquoted so it can carry several options.
	${DBG} "${fsck}" -E journal_only -p ${e2fsck_opts} "${snap_dev}" || journal_rc=$?
	if [ "${journal_rc}" -ne 0 ]; then
		return "${journal_rc}"
	fi

	${DBG} "${fsck}" -f -y ${e2fsck_opts} "${snap_dev}"
}
193
# Record a clean check on the real device: tune2fs is asked to set the
# mount count to 0 (-C 0) and the last-checked time to the moment this
# scrub started (-T ${start_time}).
mark_clean() {
	local tune2fs="@root_sbindir@/tune2fs"

	${DBG} "${tune2fs}" -C 0 -T "${start_time}" "${dev}"
}
197
# Flag the real device with tune2fs's "force_fsck" extended option after
# the scrub found corruption.
mark_corrupt() {
	local tune2fs="@root_sbindir@/tune2fs"

	${DBG} "${tune2fs}" -E force_fsck "${dev}"
}
201
# Prepare a fresh snapshot of the logical volume for checking.
#
# Globals:  DBG, LVM2_VG_NAME, LVM2_LV_NAME, snap, snap_dev, snap_size_mb,
#           arg (all read)
# Outputs:  a diagnostic on stdout when the snapshot cannot be set up
# Returns:  0 if the snapshot was created, 1 if a stale snapshot could not
#           be removed or lvcreate failed
setup() {
	local lvremove_deadline rc

	# Try to remove snapshot for 30s, bail out if we can't remove it.
	lvremove_deadline="$(( $(date "+%s") + 30 ))"
	${DBG} lvremove -f "${LVM2_VG_NAME}/${snap}" 2>/dev/null
	# Save the lvremove status right away: inside the loop condition,
	# "$?" would be the status of the [ -e ... ] test that precedes it.
	rc=$?
	while [ -e "${snap_dev}" ] && [ "${rc}" -eq 5 ] &&
	      [ "$(date "+%s")" -lt "${lvremove_deadline}" ]; do
		sleep 0.5
		${DBG} lvremove -f "${LVM2_VG_NAME}/${snap}"
		rc=$?
	done
	if [ -e "${snap_dev}" ]; then
		echo "${arg}: e2scrub snapshot is in use, cannot check!"
		return 1
	fi

	# Create the snapshot, wait for device to appear.
	if ! ${DBG} lvcreate -s -L "${snap_size_mb}m" -n "${snap}" "${LVM2_VG_NAME}/${LVM2_LV_NAME}"; then
		echo "${arg}: e2scrub snapshot FAILED, will not check!"
		return 1
	fi
	${DBG} udevadm settle 2> /dev/null
	return 0
}
224
# Reap mode (-r): remove a leftover snapshot if there is one, then stop
# without checking anything.
if [ "${reap}" -gt 0 ]; then
	[ -e "${snap_dev}" ] && teardown 2> /dev/null
	exit 0
fi

# Create the snapshot; without one there is nothing to check.
setup || exitcode 8

# A snapshot exists from here on.  Make sure it is removed if the script
# exits or is signalled before one of the branches below has cleaned up.
trap "teardown; exit 1" EXIT INT QUIT TERM
235
# Check and react
check
check_rc="$?"

if [ "${check_rc}" = "0" ]; then
	# Clean check!
	echo "${arg}: Scrub succeeded."
	mark_clean
	teardown
	trap '' EXIT

	# Trim the free space, which requires the snapshot be deleted.
	# Only done when -t was given, the mountpoint is a directory and
	# the fstrim program is available.
	if [ "${fstrim}" -eq 1 ] && [ -d "${mnt}" ] && type fstrim > /dev/null 2>&1; then
		echo "${arg}: Trimming free space."
		fstrim -v "${mnt}"
	fi

	ret=0
elif [ "${check_rc}" = "8" ]; then
	# Operational error, what now?
	echo "${arg}: e2fsck operational error."
	teardown
	trap '' EXIT
	ret=8
else
	# fsck failed.  Before blaming the filesystem, ask lvs whether the
	# snapshot itself has become invalid and, if so, say that at the
	# end of the log.  An invalid snapshot is not necessarily a
	# failure of the filesystem: regular writes to the mounted fs may
	# have overflowed the snapshot, /or/ our own repair process may
	# have done so by repairing too much.
	#
	# If it's really corrupt we ought to fsck at next boot.
	is_invalid="$(lvs -o lv_snapshot_invalid --noheadings "${snap_dev}" | awk '{print $1}')"
	if [ -n "${is_invalid}" ]; then
		echo "${arg}: Scrub FAILED due to invalid snapshot."
		ret=8
	else
		echo "${arg}: Scrub FAILED due to corruption!  Unmount and run e2fsck -y."
		mark_corrupt
		ret=6
	fi
	teardown
	trap '' EXIT
fi

exitcode "${ret}"
284