The primary reason for this
is that chroot() is escapable, where as pivot_root() is not.
In the context of LXC, the pivot_root is done after switching into a
private filesystem namespace (CLONE-NEWNS), so only the process doing
the pivot and its children are affected, not the system as a whole.
package main
import (
"fmt"
"os"
"os/exec"
"syscall"
)
func main() {
switch os.Args[1] {
case "run":
parent()
case "child":
child()
default:
panic("wat should I do")
}
}
func parent() {
cmd := exec.Command("/proc/self/exe", append([]string{"child"}, os.Args[2:]...)...)
cmd.SysProcAttr = &syscall.SysProcAttr{
Cloneflags: syscall.CLONE_NEWUTS | syscall.CLONE_NEWPID | syscall.CLONE_NEWNS,
}
cmd.Stdin = os.Stdin
cmd.Stdout = os.Stdout
cmd.Stderr = os.Stderr
if err := cmd.Run(); err != nil {
fmt.Println("ERROR", err)
os.Exit(1)
}
}
func child() {
must(syscall.Mount("rootfs", "rootfs", "", syscall.MS_BIND, ""))
must(os.MkdirAll("rootfs/oldrootfs", 0700))
must(syscall.PivotRoot("rootfs", "rootfs/oldrootfs"))
must(os.Chdir("/"))
cmd := exec.Command(os.Args[2], os.Args[3:]...)
cmd.Stdin = os.Stdin
cmd.Stdout = os.Stdout
cmd.Stderr = os.Stderr
if err := cmd.Run(); err != nil {
fmt.Println("ERROR", err)
os.Exit(1)
}
}
func must(err error) {
if err != nil {
panic(err)
}
}
=The final two lines are the important bit, they tell the OS to move the current directory at /
to rootfs/oldrootfs
, and to swap the new rootfs directory to /
. After the pivotroot
call is complete, the / directory in the container will refer to the rootfs. (The bind mount call is needed to satisfy some requirements of the pivotroot
command — the OS requires that pivotroot
be used to swap two filesystems that are not part of the same tree, which bind mounting the rootfs to itself achieves. Yes, it’s pretty silly).=