summary refs log tree commit diff
path: root/nixos/modules/system/boot/stage-1-init.sh
blob: 8fcc1f029723e8e139a3e2ebc44ae771a27fc407 (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
#! @shell@

targetRoot=/mnt-root
console=tty1
verbose="@verbose@"

info() {
    if [[ -n "$verbose" ]]; then
        echo "$@"
    fi
}

extraUtils="@extraUtils@"
export LD_LIBRARY_PATH=@extraUtils@/lib
export PATH=@extraUtils@/bin
ln -s @extraUtils@/bin /bin

# Copy the secrets to their needed location
if [ -d "@extraUtils@/secrets" ]; then
    for secret in $(cd "@extraUtils@/secrets"; find . -type f); do
        mkdir -p $(dirname "/$secret")
        ln -s "@extraUtils@/secrets/$secret" "$secret"
    done
fi

# Stop LVM complaining about fd3
export LVM_SUPPRESS_FD_WARNINGS=true

fail() {
    if [ -n "$panicOnFail" ]; then exit 1; fi

    @preFailCommands@

    # If starting stage 2 failed, allow the user to repair the problem
    # in an interactive shell.
    cat <<EOF

An error occurred in stage 1 of the boot process, which must mount the
root filesystem on \`$targetRoot' and then start stage 2.  Press one
of the following keys:

EOF
    if [ -n "$allowShell" ]; then cat <<EOF
  i) to launch an interactive shell
  f) to start an interactive shell having pid 1 (needed if you want to
     start stage 2's init manually)
EOF
    fi
    cat <<EOF
  r) to reboot immediately
  *) to ignore the error and continue
EOF

    read -n 1 reply

    if [ -n "$allowShell" -a "$reply" = f ]; then
        exec setsid @shell@ -c "exec @shell@ < /dev/$console >/dev/$console 2>/dev/$console"
    elif [ -n "$allowShell" -a "$reply" = i ]; then
        echo "Starting interactive shell..."
        setsid @shell@ -c "exec @shell@ < /dev/$console >/dev/$console 2>/dev/$console" || fail
    elif [ "$reply" = r ]; then
        echo "Rebooting..."
        reboot -f
    else
        info "Continuing..."
    fi
}

trap 'fail' 0


# Print a greeting.
info
info "<<< NixOS Stage 1 >>>"
info

# Make several required directories.
mkdir -p /etc/udev
touch /etc/fstab # to shut up mount
ln -s /proc/mounts /etc/mtab # to shut up mke2fs
touch /etc/udev/hwdb.bin # to shut up udev
touch /etc/initrd-release

# Function for waiting for device(s) to appear.
waitDevice() {
    local device="$1"
    # Split device string using ':' as a delimiter as bcachefs
    # uses this for multi-device filesystems, i.e. /dev/sda1:/dev/sda2:/dev/sda3
    local IFS=':'

    # USB storage devices tend to appear with some delay.  It would be
    # great if we had a way to synchronously wait for them, but
    # alas...  So just wait for a few seconds for the device to
    # appear.
    for dev in $device; do
        if test ! -e $dev; then
            echo -n "waiting for device $dev to appear..."
            try=20
            while [ $try -gt 0 ]; do
                sleep 1
                # also re-try lvm activation now that new block devices might have appeared
                lvm vgchange -ay
                # and tell udev to create nodes for the new LVs
                udevadm trigger --action=add
                if test -e $dev; then break; fi
                echo -n "."
                try=$((try - 1))
            done
            echo
            [ $try -ne 0 ]
        fi
    done
}

# Mount special file systems.
specialMount() {
  local device="$1"
  local mountPoint="$2"
  local options="$3"
  local fsType="$4"

  mkdir -m 0755 -p "$mountPoint"
  mount -n -t "$fsType" -o "$options" "$device" "$mountPoint"
}
source @earlyMountScript@

# Copy initrd secrets from /.initrd-secrets to their actual destinations
if [ -d "/.initrd-secrets" ]; then
    #
    # Secrets are named by their full destination pathname and stored
    # under /.initrd-secrets/
    #
    for secret in $(cd "/.initrd-secrets"; find . -type f); do
        mkdir -p $(dirname "/$secret")
        cp "/.initrd-secrets/$secret" "$secret"
    done
fi

# Log the script output to /dev/kmsg or /run/log/stage-1-init.log.
mkdir -p /tmp
mkfifo /tmp/stage-1-init.log.fifo
logOutFd=8 && logErrFd=9
eval "exec $logOutFd>&1 $logErrFd>&2"
if test -w /dev/kmsg; then
    tee -i < /tmp/stage-1-init.log.fifo /proc/self/fd/"$logOutFd" | while read -r line; do
        if test -n "$line"; then
            echo "<7>stage-1-init: [$(date)] $line" > /dev/kmsg
        fi
    done &
else
    mkdir -p /run/log
    tee -i < /tmp/stage-1-init.log.fifo /run/log/stage-1-init.log &
fi
exec > /tmp/stage-1-init.log.fifo 2>&1


# Process the kernel command line.
export stage2Init=/init
for o in $(cat /proc/cmdline); do
    case $o in
        console=*)
            set -- $(IFS==; echo $o)
            params=$2
            set -- $(IFS=,; echo $params)
            console=$1
            ;;
        init=*)
            set -- $(IFS==; echo $o)
            stage2Init=$2
            ;;
        boot.persistence=*)
            set -- $(IFS==; echo $o)
            persistence=$2
            ;;
        boot.persistence.opt=*)
            set -- $(IFS==; echo $o)
            persistence_opt=$2
            ;;
        boot.trace|debugtrace)
            # Show each command.
            set -x
            ;;
        boot.shell_on_fail)
            allowShell=1
            ;;
        boot.debug1|debug1) # stop right away
            allowShell=1
            fail
            ;;
        boot.debug1devices) # stop after loading modules and creating device nodes
            allowShell=1
            debug1devices=1
            ;;
        boot.debug1mounts) # stop after mounting file systems
            allowShell=1
            debug1mounts=1
            ;;
        boot.panic_on_fail|stage1panic=1)
            panicOnFail=1
            ;;
        root=*)
            # If a root device is specified on the kernel command
            # line, make it available through the symlink /dev/root.
            # Recognise LABEL= and UUID= to support UNetbootin.
            set -- $(IFS==; echo $o)
            if [ $2 = "LABEL" ]; then
                root="/dev/disk/by-label/$3"
            elif [ $2 = "UUID" ]; then
                root="/dev/disk/by-uuid/$3"
            else
                root=$2
            fi
            ln -s "$root" /dev/root
            ;;
        copytoram)
            copytoram=1
            ;;
        findiso=*)
            # if an iso name is supplied, try to find the device where
            # the iso resides on
            set -- $(IFS==; echo $o)
            isoPath=$2
            ;;
    esac
done

# Set hostid before modules are loaded.
# This is needed by the spl/zfs modules.
@setHostId@

# Load the required kernel modules.
mkdir -p /lib
ln -s @modulesClosure@/lib/modules /lib/modules
ln -s @modulesClosure@/lib/firmware /lib/firmware
echo @extraUtils@/bin/modprobe > /proc/sys/kernel/modprobe
for i in @kernelModules@; do
    info "loading module $(basename $i)..."
    modprobe $i
done


# Create device nodes in /dev.
@preDeviceCommands@
info "running udev..."
ln -sfn /proc/self/fd /dev/fd
ln -sfn /proc/self/fd/0 /dev/stdin
ln -sfn /proc/self/fd/1 /dev/stdout
ln -sfn /proc/self/fd/2 /dev/stderr
mkdir -p /etc/systemd
ln -sfn @linkUnits@ /etc/systemd/network
mkdir -p /etc/udev
ln -sfn @udevRules@ /etc/udev/rules.d
mkdir -p /dev/.mdadm
systemd-udevd --daemon
udevadm trigger --action=add
udevadm settle


# XXX: Use case usb->lvm will still fail, usb->luks->lvm is covered
@preLVMCommands@

info "starting device mapper and LVM..."
lvm vgchange -ay

if test -n "$debug1devices"; then fail; fi


@postDeviceCommands@


# Check the specified file system, if appropriate.
checkFS() {
    local device="$1"
    local fsType="$2"

    # Only check block devices.
    if [ ! -b "$device" ]; then return 0; fi

    # Don't check ROM filesystems.
    if [ "$fsType" = iso9660 -o "$fsType" = udf ]; then return 0; fi

    # Don't check resilient COWs as they validate the fs structures at mount time
    if [ "$fsType" = btrfs -o "$fsType" = zfs -o "$fsType" = bcachefs ]; then return 0; fi

    # Skip fsck for apfs as the fsck utility does not support repairing the filesystem (no -a option)
    if [ "$fsType" = apfs ]; then return 0; fi

    # Skip fsck for nilfs2 - not needed by design and no fsck tool for this filesystem.
    if [ "$fsType" = nilfs2 ]; then return 0; fi

    # Skip fsck for inherently readonly filesystems.
    if [ "$fsType" = squashfs ]; then return 0; fi

    # If we couldn't figure out the FS type, then skip fsck.
    if [ "$fsType" = auto ]; then
        echo 'cannot check filesystem with type "auto"!'
        return 0
    fi

    # Device might be already mounted manually
    # e.g. NBD-device or the host filesystem of the file which contains encrypted root fs
    if mount | grep -q "^$device on "; then
        echo "skip checking already mounted $device"
        return 0
    fi

    # Optionally, skip fsck on journaling filesystems.  This option is
    # a hack - it's mostly because e2fsck on ext3 takes much longer to
    # recover the journal than the ext3 implementation in the kernel
    # does (minutes versus seconds).
    if test -z "@checkJournalingFS@" -a \
        \( "$fsType" = ext3 -o "$fsType" = ext4 -o "$fsType" = reiserfs \
        -o "$fsType" = xfs -o "$fsType" = jfs -o "$fsType" = f2fs \)
    then
        return 0
    fi

    echo "checking $device..."

    fsckFlags=
    if test "$fsType" != "btrfs"; then
        fsckFlags="-V -a"
    fi
    fsck $fsckFlags "$device"
    fsckResult=$?

    if test $(($fsckResult | 2)) = $fsckResult; then
        echo "fsck finished, rebooting..."
        sleep 3
        reboot -f
    fi

    if test $(($fsckResult | 4)) = $fsckResult; then
        echo "$device has unrepaired errors, please fix them manually."
        fail
    fi

    if test $fsckResult -ge 8; then
        echo "fsck on $device failed."
        fail
    fi

    return 0
}


# Function for mounting a file system.
mountFS() {
    local device="$1"
    local mountPoint="$2"
    local options="$3"
    local fsType="$4"

    if [ "$fsType" = auto ]; then
        fsType=$(blkid -o value -s TYPE "$device")
        if [ -z "$fsType" ]; then fsType=auto; fi
    fi

    # Filter out x- options, which busybox doesn't do yet.
    local optionsFiltered="$(IFS=,; for i in $options; do if [ "${i:0:2}" != "x-" ]; then echo -n $i,; fi; done)"
    # Prefix (lower|upper|work)dir with /mnt-root (overlayfs)
    local optionsPrefixed="$( echo "$optionsFiltered" | sed -E 's#\<(lowerdir|upperdir|workdir)=#\1=/mnt-root#g' )"

    echo "$device /mnt-root$mountPoint $fsType $optionsPrefixed" >> /etc/fstab

    checkFS "$device" "$fsType"

    # Optionally resize the filesystem.
    case $options in
        *x-nixos.autoresize*)
            if [ "$fsType" = ext2 -o "$fsType" = ext3 -o "$fsType" = ext4 ]; then
                modprobe "$fsType"
                echo "resizing $device..."
                e2fsck -fp "$device"
                resize2fs "$device"
            elif [ "$fsType" = f2fs ]; then
                echo "resizing $device..."
                fsck.f2fs -fp "$device"
                resize.f2fs "$device"
            fi
            ;;
    esac

    # Create backing directories for overlayfs
    if [ "$fsType" = overlay ]; then
        for i in upper work; do
             dir="$( echo "$optionsPrefixed" | grep -o "${i}dir=[^,]*" )"
             mkdir -m 0700 -p "${dir##*=}"
        done
    fi

    info "mounting $device on $mountPoint..."

    mkdir -p "/mnt-root$mountPoint"

    # For ZFS and CIFS mounts, retry a few times before giving up.
    # We do this for ZFS as a workaround for issue NixOS/nixpkgs#25383.
    local n=0
    while true; do
        mount "/mnt-root$mountPoint" && break
        if [ \( "$fsType" != cifs -a "$fsType" != zfs \) -o "$n" -ge 10 ]; then fail; break; fi
        echo "retrying..."
        sleep 1
        n=$((n + 1))
    done

    [ "$mountPoint" == "/" ] &&
        [ -f "/mnt-root/etc/NIXOS_LUSTRATE" ] &&
        lustrateRoot "/mnt-root"

    true
}

lustrateRoot () {
    local root="$1"

    echo
    echo -e "\e[1;33m<<< NixOS is now lustrating the root filesystem (cruft goes to /old-root) >>>\e[0m"
    echo

    mkdir -m 0755 -p "$root/old-root.tmp"

    echo
    echo "Moving impurities out of the way:"
    for d in "$root"/*
    do
        [ "$d" == "$root/nix"          ] && continue
        [ "$d" == "$root/boot"         ] && continue # Don't render the system unbootable
        [ "$d" == "$root/old-root.tmp" ] && continue

        mv -v "$d" "$root/old-root.tmp"
    done

    # Use .tmp to make sure subsequent invokations don't clash
    mv -v "$root/old-root.tmp" "$root/old-root"

    mkdir -m 0755 -p "$root/etc"
    touch "$root/etc/NIXOS"

    exec 4< "$root/old-root/etc/NIXOS_LUSTRATE"

    echo
    echo "Restoring selected impurities:"
    while read -u 4 keeper; do
        dirname="$(dirname "$keeper")"
        mkdir -m 0755 -p "$root/$dirname"
        cp -av "$root/old-root/$keeper" "$root/$keeper"
    done

    exec 4>&-
}



if test -e /sys/power/resume -a -e /sys/power/disk; then
    if test -n "@resumeDevice@" && waitDevice "@resumeDevice@"; then
        resumeDev="@resumeDevice@"
        resumeInfo="$(udevadm info -q property "$resumeDev" )"
    else
        for sd in @resumeDevices@; do
            # Try to detect resume device. According to Ubuntu bug:
            # https://bugs.launchpad.net/ubuntu/+source/pm-utils/+bug/923326/comments/1
            # when there are multiple swap devices, we can't know where the hibernate
            # image will reside. We can check all of them for swsuspend blkid.
            if waitDevice "$sd"; then
                resumeInfo="$(udevadm info -q property "$sd")"
                if [ "$(echo "$resumeInfo" | sed -n 's/^ID_FS_TYPE=//p')" = "swsuspend" ]; then
                    resumeDev="$sd"
                    break
                fi
            fi
        done
    fi
    if test -n "$resumeDev"; then
        resumeMajor="$(echo "$resumeInfo" | sed -n 's/^MAJOR=//p')"
        resumeMinor="$(echo "$resumeInfo" | sed -n 's/^MINOR=//p')"
        echo "$resumeMajor:$resumeMinor" > /sys/power/resume 2> /dev/null || echo "failed to resume..."
    fi
fi

# If we have a path to an iso file, find the iso and link it to /dev/root
if [ -n "$isoPath" ]; then
  mkdir -p /findiso

  for delay in 5 10; do
    blkid | while read -r line; do
      device=$(echo "$line" | sed 's/:.*//')
      type=$(echo "$line" | sed 's/.*TYPE="\([^"]*\)".*/\1/')

      mount -t "$type" "$device" /findiso
      if [ -e "/findiso$isoPath" ]; then
        ln -sf "/findiso$isoPath" /dev/root
        break 2
      else
        umount /findiso
      fi
    done

    sleep "$delay"
  done
fi

# Try to find and mount the root device.
mkdir -p $targetRoot

exec 3< @fsInfo@

while read -u 3 mountPoint; do
    read -u 3 device
    read -u 3 fsType
    read -u 3 options

    # !!! Really quick hack to support bind mounts, i.e., where the
    # "device" should be taken relative to /mnt-root, not /.  Assume
    # that every device that starts with / but doesn't start with /dev
    # is a bind mount.
    pseudoDevice=
    case $device in
        /dev/*)
            ;;
        //*)
            # Don't touch SMB/CIFS paths.
            pseudoDevice=1
            ;;
        /*)
            device=/mnt-root$device
            ;;
        *)
            # Not an absolute path; assume that it's a pseudo-device
            # like an NFS path (e.g. "server:/path").
            pseudoDevice=1
            ;;
    esac

    if test -z "$pseudoDevice" && ! waitDevice "$device"; then
        # If it doesn't appear, try to mount it anyway (and
        # probably fail).  This is a fallback for non-device "devices"
        # that we don't properly recognise.
        echo "Timed out waiting for device $device, trying to mount anyway."
    fi

    # Wait once more for the udev queue to empty, just in case it's
    # doing something with $device right now.
    udevadm settle

    # If copytoram is enabled: skip mounting the ISO and copy its content to a tmpfs.
    if [ -n "$copytoram" ] && [ "$device" = /dev/root ] && [ "$mountPoint" = /iso ]; then
      fsType=$(blkid -o value -s TYPE "$device")
      fsSize=$(blockdev --getsize64 "$device" || stat -Lc '%s' "$device")

      mkdir -p /tmp-iso
      mount -t "$fsType" /dev/root /tmp-iso
      mountFS tmpfs /iso size="$fsSize" tmpfs

      cp -r /tmp-iso/* /mnt-root/iso/

      umount /tmp-iso
      rmdir /tmp-iso
      continue
    fi

    if [ "$mountPoint" = / ] && [ "$device" = tmpfs ] && [ ! -z "$persistence" ]; then
        echo persistence...
        waitDevice "$persistence"
        echo enabling persistence...
        mountFS "$persistence" "$mountPoint" "$persistence_opt" "auto"
        continue
    fi

    mountFS "$device" "$mountPoint" "$options" "$fsType"
done

exec 3>&-


@postMountCommands@


# Emit a udev rule for /dev/root to prevent systemd from complaining.
if [ -e /mnt-root/iso ]; then
    eval $(udevadm info --export --export-prefix=ROOT_ --device-id-of-file=/mnt-root/iso)
else
    eval $(udevadm info --export --export-prefix=ROOT_ --device-id-of-file=$targetRoot)
fi
if [ "$ROOT_MAJOR" -a "$ROOT_MINOR" -a "$ROOT_MAJOR" != 0 ]; then
    mkdir -p /run/udev/rules.d
    echo 'ACTION=="add|change", SUBSYSTEM=="block", ENV{MAJOR}=="'$ROOT_MAJOR'", ENV{MINOR}=="'$ROOT_MINOR'", SYMLINK+="root"' > /run/udev/rules.d/61-dev-root-link.rules
fi


# Stop udevd.
udevadm control --exit

# Reset the logging file descriptors.
# Do this just before pkill, which will kill the tee process.
exec 1>&$logOutFd 2>&$logErrFd
eval "exec $logOutFd>&- $logErrFd>&-"

# Kill any remaining processes, just to be sure we're not taking any
# with us into stage 2. But keep storage daemons like unionfs-fuse.
#
# Storage daemons are distinguished by an @ in front of their command line:
# https://www.freedesktop.org/wiki/Software/systemd/RootStorageDaemons/
for pid in $(pgrep -v -f '^@'); do
    # Make sure we don't kill kernel processes, see #15226 and:
    # http://stackoverflow.com/questions/12213445/identifying-kernel-threads
    readlink "/proc/$pid/exe" &> /dev/null || continue
    # Try to avoid killing ourselves.
    [ $pid -eq $$ ] && continue
    kill -9 "$pid"
done

if test -n "$debug1mounts"; then fail; fi


# Restore /proc/sys/kernel/modprobe to its original value.
echo /sbin/modprobe > /proc/sys/kernel/modprobe


# Start stage 2.  `switch_root' deletes all files in the ramfs on the
# current root.  The path has to be valid in the chroot not outside.
if [ ! -e "$targetRoot/$stage2Init" ]; then
    stage2Check=${stage2Init}
    while [ "$stage2Check" != "${stage2Check%/*}" ] && [ ! -L "$targetRoot/$stage2Check" ]; do
        stage2Check=${stage2Check%/*}
    done
    if [ ! -L "$targetRoot/$stage2Check" ]; then
        echo "stage 2 init script ($targetRoot/$stage2Init) not found"
        fail
    fi
fi

mkdir -m 0755 -p $targetRoot/proc $targetRoot/sys $targetRoot/dev $targetRoot/run

mount --move /proc $targetRoot/proc
mount --move /sys $targetRoot/sys
mount --move /dev $targetRoot/dev
mount --move /run $targetRoot/run

exec env -i $(type -P switch_root) "$targetRoot" "$stage2Init"

fail # should never be reached