initramfs: Add missing umounts after generation

Message ID 1646de44-c244-43b3-ad57-b3bf702608b0@siemens.com
State Superseded, archived
Headers show
Series initramfs: Add missing umounts after generation | expand

Commit Message

Jan Kiszka Sept. 30, 2024, 7:42 p.m. UTC
From: Jan Kiszka <jan.kiszka@siemens.com>

Failing to unmount what was mounted via rootfs_do_mounts can cause
troubles on rebuilds.

Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
---
 meta/classes/image.bbclass     | 21 ++-------------------
 meta/classes/initramfs.bbclass |  2 ++
 meta/classes/rootfs.bbclass    | 25 +++++++++++++++++++++++++
 3 files changed, 29 insertions(+), 19 deletions(-)

Comments

Felix Moessbauer Oct. 1, 2024, 7:38 a.m. UTC | #1
On Mon, 2024-09-30 at 21:42 +0200, 'Jan Kiszka' via isar-users wrote:
> From: Jan Kiszka <jan.kiszka@siemens.com>
> 
> Failing to unmount what was mounted via rootfs_do_mounts can cause
> troubles on rebuilds.

Why is that not caught by the cleanup handler in [1]?
I'm just wondering as the mounting and unmounting does not happen in
the same task, hence we will run into problems on partial rebuilds.

[1] https://github.com/ilbers/isar/blob/51462e82e4a108c9c94ba4099465f5a19a7125a6/meta/classes/isar-events.bbclass#L53

Felix

> 
> Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
> ---
>  meta/classes/image.bbclass     | 21 ++-------------------
>  meta/classes/initramfs.bbclass |  2 ++
>  meta/classes/rootfs.bbclass    | 25 +++++++++++++++++++++++++
>  3 files changed, 29 insertions(+), 19 deletions(-)
> 
> diff --git a/meta/classes/image.bbclass b/meta/classes/image.bbclass
> index c29d9e26..1414a3ee 100644
> --- a/meta/classes/image.bbclass
> +++ b/meta/classes/image.bbclass
> @@ -393,6 +393,8 @@ python do_deploy() {
>  addtask deploy before do_build after do_image
>  
>  do_rootfs_finalize() {
> +    rootfs_do_umounts
> +
>      sudo -s <<'EOSUDO'
>          set -e
>  
> @@ -406,25 +408,6 @@ do_rootfs_finalize() {
>                  -maxdepth 1 -name 'qemu-*-static' -type f -delete
>          fi
>  
> -        mountpoint -q '${ROOTFSDIR}/isar-apt' && \
> -            umount '${ROOTFSDIR}/isar-apt' && \
> -            rmdir --ignore-fail-on-non-empty ${ROOTFSDIR}/isar-apt
> -
> -        mountpoint -q '${ROOTFSDIR}/base-apt' && \
> -            umount '${ROOTFSDIR}/base-apt' && \
> -            rmdir --ignore-fail-on-non-empty ${ROOTFSDIR}/base-apt
> -
> -        mountpoint -q '${ROOTFSDIR}/dev/pts' && \
> -            umount '${ROOTFSDIR}/dev/pts'
> -        mountpoint -q '${ROOTFSDIR}/dev/shm' && \
> -            umount '${ROOTFSDIR}/dev/shm'
> -        mountpoint -q '${ROOTFSDIR}/dev' && \
> -            umount '${ROOTFSDIR}/dev'
> -        mountpoint -q '${ROOTFSDIR}/proc' && \
> -            umount '${ROOTFSDIR}/proc'
> -        mountpoint -q '${ROOTFSDIR}/sys' && \
> -            umount '${ROOTFSDIR}/sys'
> -
>          if [ -e "${ROOTFSDIR}/etc/apt/sources-list" ]; then
>              mv "${ROOTFSDIR}/etc/apt/sources-list" \
>                  "${ROOTFSDIR}/etc/apt/sources.list.d/bootstrap.list"
> diff --git a/meta/classes/initramfs.bbclass
> b/meta/classes/initramfs.bbclass
> index 6886b95a..42013356 100644
> --- a/meta/classes/initramfs.bbclass
> +++ b/meta/classes/initramfs.bbclass
> @@ -45,6 +45,8 @@ do_generate_initramfs() {
>            update-initramfs -u -v ;  \
>          fi'
>  
> +    rootfs_do_umounts
> +
>      if [ ! -e "${INITRAMFS_ROOTFS}/initrd.img" ]; then
>          bberror "No initramfs was found after generation!"
>      fi
> diff --git a/meta/classes/rootfs.bbclass
> b/meta/classes/rootfs.bbclass
> index f0abd795..0b4ea061 100644
> --- a/meta/classes/rootfs.bbclass
> +++ b/meta/classes/rootfs.bbclass
> @@ -65,6 +65,31 @@ rootfs_do_mounts() {
>  EOSUDO
>  }
>  
> +rootfs_do_umounts() {
> +    sudo -s <<'EOSUDO'
> +        set -e
> +        mountpoint -q '${ROOTFSDIR}/isar-apt' && \
> +            umount '${ROOTFSDIR}/isar-apt' && \
> +            rmdir --ignore-fail-on-non-empty ${ROOTFSDIR}/isar-apt
> +
> +        mountpoint -q '${ROOTFSDIR}/base-apt' && \
> +            umount '${ROOTFSDIR}/base-apt' && \
> +            rmdir --ignore-fail-on-non-empty ${ROOTFSDIR}/base-apt
> +
> +        mountpoint -q '${ROOTFSDIR}/dev/pts' && \
> +            umount '${ROOTFSDIR}/dev/pts'
> +        mountpoint -q '${ROOTFSDIR}/dev/shm' && \
> +            umount '${ROOTFSDIR}/dev/shm'
> +        mountpoint -q '${ROOTFSDIR}/dev' && \
> +            umount '${ROOTFSDIR}/dev'
> +        mountpoint -q '${ROOTFSDIR}/proc' && \
> +            umount '${ROOTFSDIR}/proc'
> +        mountpoint -q '${ROOTFSDIR}/sys' && \
> +            umount '${ROOTFSDIR}/sys'
> +
> +EOSUDO
> +}
> +
>  rootfs_do_qemu() {
>      if [ '${@repr(d.getVar('ROOTFS_ARCH') ==
> d.getVar('HOST_ARCH'))}' = 'False' ]
>      then
> -- 
> 2.43.0
>
Jan Kiszka Oct. 1, 2024, 10:04 a.m. UTC | #2
On 01.10.24 09:38, Moessbauer, Felix (T CED OES-DE) wrote:
> On Mon, 2024-09-30 at 21:42 +0200, 'Jan Kiszka' via isar-users wrote:
>> From: Jan Kiszka <jan.kiszka@siemens.com>
>>
>> Failing to unmount what was mounted via rootfs_do_mounts can cause
>> troubles on rebuilds.
> 
> Why is that not caught by the cleanup handler in [1]?
> I'm just wondering as the mounting and unmounting does not happen in
> the same task, hence we will run into problems on partial rebuilds.
> 
> [1]https://github.com/ilbers/isar/blob/51462e82e4a108c9c94ba4099465f5a19a7125a6/meta/classes/isar-events.bbclass#L53

I didn't analyze that yet, but it is a valid question why our diagnostic
framework failed as well.

Jan
Florian Bezdeka Oct. 2, 2024, 8:31 a.m. UTC | #3
On Tue, 2024-10-01 at 12:04 +0200, 'Jan Kiszka' via isar-users wrote:
> On 01.10.24 09:38, Moessbauer, Felix (T CED OES-DE) wrote:
> > On Mon, 2024-09-30 at 21:42 +0200, 'Jan Kiszka' via isar-users wrote:
> > > From: Jan Kiszka <jan.kiszka@siemens.com>
> > > 
> > > Failing to unmount what was mounted via rootfs_do_mounts can cause
> > > troubles on rebuilds.
> > 
> > Why is that not caught by the cleanup handler in [1]?
> > I'm just wondering as the mounting and unmounting does not happen in
> > the same task, hence we will run into problems on partial rebuilds.
> > 
> > [1]https://github.com/ilbers/isar/blob/51462e82e4a108c9c94ba4099465f5a19a7125a6/meta/classes/isar-events.bbclass#L53
> 
> I didn't analyze that yet, but it is a valid question why our diagnostic
> framework failed as well.

Taken from the cleanup handler referenced above:

    with open('/proc/mounts') as f:
        for line in f.readlines():
            if basepath in line:
                bb.debug(1, '%s left mounted, unmounting...' % line.split()[1])
                subprocess.call(
                    ["sudo", "umount", line.split()[1]],
                    stdout=subprocess.DEVNULL,
                    stderr=subprocess.DEVNULL,
                )

Problems:

- bb.debug() will not make it to stdout as long as the log level is not
  set accordingly. By default the level is not set to debug, so nobody
  will notice.

- The cleanup handler is executed in "build" context, not "task" 
  context. (Naming might be wrong bitbake wise...). With that there is
  no log file where bb.debug() will write to. Nobody will notice
  (again...).

- subprocess.call() might fail silently, there is no error checking.
  In my case: "target is busy.\n"

- The cleanup handler is called twice. Not sure if that is correct...

In the end, our "(u)mount mistake detection mechanism" seems broken. As
a first step, I would vote for a bb.debug() -> bb.warn() migration. Manual
inspection of stdout still required...

Florian

> 
> Jan
> 
> -- 
> Siemens AG, Technology
> Linux Expert Center
Jan Kiszka Oct. 2, 2024, 10:28 a.m. UTC | #4
On 02.10.24 10:31, Florian Bezdeka wrote:
> On Tue, 2024-10-01 at 12:04 +0200, 'Jan Kiszka' via isar-users wrote:
>> On 01.10.24 09:38, Moessbauer, Felix (T CED OES-DE) wrote:
>>> On Mon, 2024-09-30 at 21:42 +0200, 'Jan Kiszka' via isar-users wrote:
>>>> From: Jan Kiszka <jan.kiszka@siemens.com>
>>>>
>>>> Failing to unmount what was mounted via rootfs_do_mounts can cause
>>>> troubles on rebuilds.
>>>
>>> Why is that not caught by the cleanup handler in [1]?
>>> I'm just wondering as the mounting and unmounting does not happen in
>>> the same task, hence we will run into problems on partial rebuilds.
>>>
>>> [1]https://github.com/ilbers/isar/blob/51462e82e4a108c9c94ba4099465f5a19a7125a6/meta/classes/isar-events.bbclass#L53
>>
>> I didn't analyze that yet, but it is a valid question why our diagnostic
>> framework failed as well.
> 
> Taken from the cleanup handler referenced above:
> 
>     with open('/proc/mounts') as f:
>         for line in f.readlines():
>             if basepath in line:
>                 bb.debug(1, '%s left mounted, unmounting...' % line.split()[1])
>                 subprocess.call(
>                     ["sudo", "umount", line.split()[1]],
>                     stdout=subprocess.DEVNULL,
>                     stderr=subprocess.DEVNULL,
>                 )
> 
> Problems:
> 
> - bb.debug() will not make it to stdout as long as the log level is not
>   set accordingly. By default the level is not set to debug, so nobody
>   will notice.
> 
> - The cleanup handler is executed in "build" context, not "task" 
>   context. (Naming might be wrong bitbake wise...). With that there is
>   no log file where bb.debug() will write to. Nobody will notice
>   (again...).
> 
> - subprocess.call() might fail silently, there is no error checking.
>   In my case: "target is busy.\n"
> 
> - The cleanup handler is called twice. Not sure if that is correct...
> 
> At the end our "(u)mount mistake detection mechanism" seems broken. As
> first step I would vote for a bb.debug() -> bb.warn() migration. Manual
> inspection of stdout still required...
> 

Ack. And a test for this handler would be good.

Lazy umount was apparently dropped without review/testing of the safety
mechanisms and the code that is involved in mounting. We all assumed the
world would be fine while things de facto did not work that well. On top of
that, most users are in containers and only notice those issues on
rebuilds or due to further special factors.

Jan
Florian Bezdeka Oct. 2, 2024, 3:10 p.m. UTC | #5
On Wed, 2024-10-02 at 12:28 +0200, Jan Kiszka wrote:
> On 02.10.24 10:31, Florian Bezdeka wrote:
> > On Tue, 2024-10-01 at 12:04 +0200, 'Jan Kiszka' via isar-users wrote:
> > > On 01.10.24 09:38, Moessbauer, Felix (T CED OES-DE) wrote:
> > > > On Mon, 2024-09-30 at 21:42 +0200, 'Jan Kiszka' via isar-users wrote:
> > > > > From: Jan Kiszka <jan.kiszka@siemens.com>
> > > > > 
> > > > > Failing to unmount what was mounted via rootfs_do_mounts can cause
> > > > > troubles on rebuilds.
> > > > 
> > > > Why is that not caught by the cleanup handler in [1]?
> > > > I'm just wondering as the mounting and unmounting does not happen in
> > > > the same task, hence we will run into problems on partial rebuilds.
> > > > 
> > > > [1]https://github.com/ilbers/isar/blob/51462e82e4a108c9c94ba4099465f5a19a7125a6/meta/classes/isar-events.bbclass#L53
> > > 
> > > I didn't analyze that yet, but it is a valid question why our diagnostic
> > > framework failed as well.
> > 
> > Taken from the cleanup handler referenced above:
> > 
> >     with open('/proc/mounts') as f:
> >         for line in f.readlines():
> >             if basepath in line:
> >                 bb.debug(1, '%s left mounted, unmounting...' % line.split()[1])
> >                 subprocess.call(
> >                     ["sudo", "umount", line.split()[1]],
> >                     stdout=subprocess.DEVNULL,
> >                     stderr=subprocess.DEVNULL,
> >                 )
> > 
> > Problems:
> > 
> > - bb.debug() will not make it to stdout as long as the log level is not
> >   set accordingly. By default the level is not set to debug, so nobody
> >   will notice.
> > 
> > - The cleanup handler is executed in "build" context, not "task" 
> >   context. (Naming might be wrong bitbake wise...). With that there is
> >   no log file where bb.debug() will write to. Nobody will notice
> >   (again...).
> > 
> > - subprocess.call() might fail silently, there is no error checking.
> >   In my case: "target is busy.\n"
> > 
> > - The cleanup handler is called twice. Not sure if that is correct...
> > 
> > At the end our "(u)mount mistake detection mechanism" seems broken. As
> > first step I would vote for a bb.debug() -> bb.warn() migration. Manual
> > inspection of stdout still required...
> > 
> 
> Ack. And a test for this handler would be good.

Technically someone should go and revert 
72ec986d9cd3 ("events: Do not warn on left mounts by default")
then.

https://github.com/ilbers/isar/commit/72ec986d9cd3a3913e8592554456d5968d6b659a

> 
> Lazy umount was apparently dropped without review/testing of the safety
> mechanisms and the code that is involved in mounting. We all assumed to
> world would be fine while things de facto work not that well. On top
> comes that most users are in containers and only notice those issues on
> rebuilds or due to further special factors.
> 
> Jan
> 
> -- 
> Siemens AG, Technology
> Linux Expert Center
Baurzhan Ismagulov Oct. 3, 2024, 4:15 p.m. UTC | #6
Hello Jan,

On 2024-10-02 12:28, 'Jan Kiszka' via isar-users wrote:
> Lazy umount was apparently dropped without review/testing of the safety
> mechanisms and the code that is involved in mounting. We all assumed to
> world would be fine while things de facto work not that well. On top
> comes that most users are in containers and only notice those issues on
> rebuilds or due to further special factors.

I'll share a bit of background on this: This is not about assumptions, we knew
there were issues (and worked on them -- e.g., [1]). The occurrence rate was
several times per year. After migrating to sbuild, mounts moved into
per-package schroots, and the errors became even more rare. Debugging cycles of
weeks to months are not practical because you can dump the mounts or similar
stuff, but the necessary number of iterations is not reachable, not in
reasonable time.

Before sending the lazy umount removal patch, we've checked the archives why
"umount -l", "umount || true" were introduced but couldn't reproduce any of
those (I personally found the problem descriptions and root cause analysis
quite spare and often not actionable -- I think we can improve here). Even now,
the issue seems to occur more frequently downstream than in meta-isar CI. Given
that the downstreams have control of meta-isar revision and can also apply
patches, the question for me was whether to keep the problems under the rug or
work with downstreams to analyze them. If you have specific suggestions what we
could do better the next time, please share.

For this specific occurrence, if we don't have the races with package building
left after the sbuild introduction (this is why mounts and umounts were
asymmetric with buildchroot), the latest patches might be enough without
central mount / umount tracking or moving the mounts to individual schroots
altogether. We'll need to understand the failure mode because otherwise proper
testing is challenging.

1. https://lists.isar-build.org/isar-users/20210421145855.66257-1-amikan@ilbers.de/#b

With kind regards,
Baurzhan
Jan Kiszka Oct. 4, 2024, 7:47 a.m. UTC | #7
On 03.10.24 18:15, Baurzhan Ismagulov wrote:
> Hello Jan,
> 
> On 2024-10-02 12:28, 'Jan Kiszka' via isar-users wrote:
>> Lazy umount was apparently dropped without review/testing of the safety
>> mechanisms and the code that is involved in mounting. We all assumed to
>> world would be fine while things de facto work not that well. On top
>> comes that most users are in containers and only notice those issues on
>> rebuilds or due to further special factors.
> 
> I'll share a bit of background on this: This is not about assumptions, we knew
> there were issues (and worked on them -- e.g., [1]). The occurrence rate was
> several times per year. After migrating to sbuild, mounts moved into
> per-package schroots, and the errors became even more rare. Debugging cycles of
> weeks to months are not practical because you can dump the mounts or similar
> stuff, but the necessary number of iterations is not reachable, not in
> reasonable time.
> 
> Before sending the lazy umount removal patch, we've checked the archives why
> "umount -l", "umount || true" were introduced but couldn't reproduce any of
> those (I personally found the problem descriptions and root cause analysis
> quite spare and often not actionable -- I think we can improve here). Even now,
> the issue seems to occur more frequently downstream than in meta-isar CI. Given
> that the downstreams have control of meta-isar revision and can also apply
> patches, the question for me was whether to keep the problems under the rug or
> work with downstreams to analyze them. If you have specific suggestions what we
> could do better the next time, please share.
> 
> For this specific occurrence, if we don't have the races with package building
> left after the sbuild introduction (this is why mounts and umounts were
> asymmetric with buildchroot), the latest patches might be enough without
> central mount / umount tracking or moving the mounts to individual schroots
> altogether. We'll need to understand the failure mode because otherwise proper
> testing is challenging.
> 
> 1. https://lists.isar-build.org/isar-users/20210421145855.66257-1-amikan@ilbers.de/#b
> 

There are multiple things:
 - mounts left behind - several, easy to review manually once it was
   clear that those still exist (I wasn't expecting that)
 - left-behind mounts detection - apparently broken, not properly tested
 - races and other conditions actually stumbling over those mounts -
   rare, specifically with the few cases of isar itself; and that's why
   the lazy umount removal appeared to be safe

Jan

Patch

diff --git a/meta/classes/image.bbclass b/meta/classes/image.bbclass
index c29d9e26..1414a3ee 100644
--- a/meta/classes/image.bbclass
+++ b/meta/classes/image.bbclass
@@ -393,6 +393,8 @@  python do_deploy() {
 addtask deploy before do_build after do_image
 
 do_rootfs_finalize() {
+    rootfs_do_umounts
+
     sudo -s <<'EOSUDO'
         set -e
 
@@ -406,25 +408,6 @@  do_rootfs_finalize() {
                 -maxdepth 1 -name 'qemu-*-static' -type f -delete
         fi
 
-        mountpoint -q '${ROOTFSDIR}/isar-apt' && \
-            umount '${ROOTFSDIR}/isar-apt' && \
-            rmdir --ignore-fail-on-non-empty ${ROOTFSDIR}/isar-apt
-
-        mountpoint -q '${ROOTFSDIR}/base-apt' && \
-            umount '${ROOTFSDIR}/base-apt' && \
-            rmdir --ignore-fail-on-non-empty ${ROOTFSDIR}/base-apt
-
-        mountpoint -q '${ROOTFSDIR}/dev/pts' && \
-            umount '${ROOTFSDIR}/dev/pts'
-        mountpoint -q '${ROOTFSDIR}/dev/shm' && \
-            umount '${ROOTFSDIR}/dev/shm'
-        mountpoint -q '${ROOTFSDIR}/dev' && \
-            umount '${ROOTFSDIR}/dev'
-        mountpoint -q '${ROOTFSDIR}/proc' && \
-            umount '${ROOTFSDIR}/proc'
-        mountpoint -q '${ROOTFSDIR}/sys' && \
-            umount '${ROOTFSDIR}/sys'
-
         if [ -e "${ROOTFSDIR}/etc/apt/sources-list" ]; then
             mv "${ROOTFSDIR}/etc/apt/sources-list" \
                 "${ROOTFSDIR}/etc/apt/sources.list.d/bootstrap.list"
diff --git a/meta/classes/initramfs.bbclass b/meta/classes/initramfs.bbclass
index 6886b95a..42013356 100644
--- a/meta/classes/initramfs.bbclass
+++ b/meta/classes/initramfs.bbclass
@@ -45,6 +45,8 @@  do_generate_initramfs() {
           update-initramfs -u -v ;  \
         fi'
 
+    rootfs_do_umounts
+
     if [ ! -e "${INITRAMFS_ROOTFS}/initrd.img" ]; then
         bberror "No initramfs was found after generation!"
     fi
diff --git a/meta/classes/rootfs.bbclass b/meta/classes/rootfs.bbclass
index f0abd795..0b4ea061 100644
--- a/meta/classes/rootfs.bbclass
+++ b/meta/classes/rootfs.bbclass
@@ -65,6 +65,31 @@  rootfs_do_mounts() {
 EOSUDO
 }
 
+rootfs_do_umounts() {
+    sudo -s <<'EOSUDO'
+        set -e
+        mountpoint -q '${ROOTFSDIR}/isar-apt' && \
+            umount '${ROOTFSDIR}/isar-apt' && \
+            rmdir --ignore-fail-on-non-empty ${ROOTFSDIR}/isar-apt
+
+        mountpoint -q '${ROOTFSDIR}/base-apt' && \
+            umount '${ROOTFSDIR}/base-apt' && \
+            rmdir --ignore-fail-on-non-empty ${ROOTFSDIR}/base-apt
+
+        mountpoint -q '${ROOTFSDIR}/dev/pts' && \
+            umount '${ROOTFSDIR}/dev/pts'
+        mountpoint -q '${ROOTFSDIR}/dev/shm' && \
+            umount '${ROOTFSDIR}/dev/shm'
+        mountpoint -q '${ROOTFSDIR}/dev' && \
+            umount '${ROOTFSDIR}/dev'
+        mountpoint -q '${ROOTFSDIR}/proc' && \
+            umount '${ROOTFSDIR}/proc'
+        mountpoint -q '${ROOTFSDIR}/sys' && \
+            umount '${ROOTFSDIR}/sys'
+
+EOSUDO
+}
+
 rootfs_do_qemu() {
     if [ '${@repr(d.getVar('ROOTFS_ARCH') == d.getVar('HOST_ARCH'))}' = 'False' ]
     then