deb-dl-dir: remove excessive calls to dpkg-deb in debsrc_download

Message ID 20250305131142.2717692-1-cedric.hombourger@siemens.com
State Superseded, archived
Headers show
Series deb-dl-dir: remove excessive calls to dpkg-deb in debsrc_download | expand

Commit Message

cedric.hombourger@siemens.com March 5, 2025, 1:11 p.m. UTC
Several calls to dpkg-deb are made for each single .deb file found in
downloads to parse individual fields. This approach is terribly slow
when a large amount of .deb files are found. Use apt-ftparchive to
produce an index of packages that were found and a simple awk script
to produce a (sorted) list of source package names and their versions.
Also avoid using sed to remove Epoch from the version when we are
trying to determine the name of the .dsc file: we instead use a simple
POSIX parameter expansion to remove everything up to the first colon

Signed-off-by: Cedric Hombourger <cedric.hombourger@siemens.com>
---
 meta/classes/deb-dl-dir.bbclass | 62 +++++++++++++++++++--------------
 1 file changed, 35 insertions(+), 27 deletions(-)

Comments

Jan Kiszka March 5, 2025, 1:57 p.m. UTC | #1
On 05.03.25 14:11, 'Cedric Hombourger' via isar-users wrote:
> Several calls to dpkg-deb are made for each single .deb file found in
> downloads to parse individual fields. This approach is terribly slow
> when a large amount of .deb files are found. Use apt-ftparchive to

Out of curiosity: What is roughly the amount of packages where this
inefficiency becomes visible?

> produce an index of packages that were found and a simple awk script
> to produce a (sorted) list of source package names and their versions.
> Also avoid using sed to remove Epoch from the version when we are
> trying to determine the name of the .dsc file: we instead use a simple
> POSIX parameter expansion to remove everything up to the first colon
> 
> Signed-off-by: Cedric Hombourger <cedric.hombourger@siemens.com>
> ---
>  meta/classes/deb-dl-dir.bbclass | 62 +++++++++++++++++++--------------
>  1 file changed, 35 insertions(+), 27 deletions(-)
> 
> diff --git a/meta/classes/deb-dl-dir.bbclass b/meta/classes/deb-dl-dir.bbclass
> index 7ebd057e..53ce4538 100644
> --- a/meta/classes/deb-dl-dir.bbclass
> +++ b/meta/classes/deb-dl-dir.bbclass
> @@ -5,23 +5,6 @@
>  
>  inherit repository
>  
> -is_not_part_of_current_build() {
> -    local package="$( dpkg-deb --show --showformat '${Package}' "${1}" )"
> -    local arch="$( dpkg-deb --show --showformat '${Architecture}' "${1}" )"
> -    local version="$( dpkg-deb --show --showformat '${Version}' "${1}" )"
> -    # Since we are parsing all the debs in DEBDIR, we can to some extend
> -    # try to eliminate some debs that are not part of the current multiconfig
> -    # build using the below method.
> -    local output="$( grep -xhs ".* status installed ${package}:${arch} ${version}" \
> -            "${IMAGE_ROOTFS}"/var/log/dpkg.log \
> -            "${SCHROOT_HOST_DIR}"/var/log/dpkg.log \
> -            "${SCHROOT_TARGET_DIR}"/var/log/dpkg.log \
> -            "${SCHROOT_HOST_DIR}"/tmp/dpkg_common.log \
> -            "${SCHROOT_TARGET_DIR}"/tmp/dpkg_common.log | head -1 )"
> -
> -    [ -z "${output}" ]
> -}
> -
>  debsrc_do_mounts() {
>      sudo -s <<EOSUDO
>      set -e
> @@ -54,16 +37,41 @@ debsrc_download() {
>      ( flock 9
>      set -e
>      printenv | grep -q BB_VERBOSE_LOGS && set -x
> -    find "${rootfs}/var/cache/apt/archives/" -maxdepth 1 -type f -iname '*\.deb' | while read package; do
> -        is_not_part_of_current_build "${package}" && continue
> -        local src="$( dpkg-deb --show --showformat '${source:Package}' "${package}" )"
> -        local version="$( dpkg-deb --show --showformat '${source:Version}' "${package}" )"
> -        local dscname="$(echo ${src}_${version} | sed -e 's/_[0-9]\+:/_/')"
> -        local dscfile=$(find "${DEBSRCDIR}"/"${rootfs_distro}" -name "${dscname}.dsc")
> -        [ -n "$dscfile" ] && continue
> -
> -        sudo -E chroot --userspec=$( id -u ):$( id -g ) ${rootfs} \
> -            sh -c ' mkdir -p "/deb-src/${1}/${2}" && cd "/deb-src/${1}/${2}" && apt-get -y --download-only --only-source source "$2"="$3" ' download-src "${rootfs_distro}" "${src}" "${version}"
> +
> +    # Use apt-ftparchive to scan all .deb files found in the download directory
> +    # and produce an index that we can "parse" with awk. This is much faster
> +    # than parsing each .deb file individually using dpkg-deb. Lines from the
> +    # index we need are:
> +    #
> +    #    Package: <binary-name>
> +    #    Version: <binary-version>
> +    #    Source: <source-name> (<source-version>)
> +    #
> +    # If Source is omitted, then <source-name>=<binary-name> and
> +    # if <source-version> is not specified then it is <binary-version>.
> +    # The awk script handles these optional fields. It looks for Size: as a
> +    # trigger to print the source,version tupple
> +
> +    apt-ftparchive --md5=no --sha1=no --sha256=no --sha512=no \
> +                   -a "${DISTRO_ARCH}" packages \
> +                   "${rootfs}/var/cache/apt/archives" \
> +    | awk '/^Package:/ { s=$2; }
> +           /^Version:/ { v=$2; next }
> +           /^Source:/ { s=$2; if ($3 ~ /^\(/) v=substr($3, 2, length($3)-2) }
> +           /^Size:/ { print s, v}' \
> +    | sort -u \
> +    | while read src version; do
> +        # Name of the .dsc file does not include Epoch, remove it before checking
> +        # if sources were already downloaded. Avoid using sed here to reduce the
> +        # number of processes being spawned by this function: we assume that the
> +        # version is correctly formatted and simply strip everything up to the
> +        # first colon
> +        dscname="${src}_${version#*:}.dsc"
> +        [ -f "${DEBSRCDIR}"/"${rootfs_distro}"/"${src}"/"${dscname}" ] || {
> +            # use apt-get source to download sources in DEBSRCDIR
> +            sudo -E chroot --userspec=$( id -u ):$( id -g ) ${rootfs} \
> +                sh -c ' mkdir -p "/deb-src/${1}/${2}" && cd "/deb-src/${1}/${2}" && apt-get -y --download-only --only-source source "$2"="$3" ' download-src "${rootfs_distro}" "${src}" "${version}"

Can we rewrap this horribly long line at that chance?

> +        }
>      done
>      ) 9>"${DEBSRCDIR}/${rootfs_distro}.lock"
>  

Did you also consider using a python function for the content
processing? I'm not predicting that this will be faster or nicer or
whatever, just wondering if it might be while reading the above.

Jan
cedric.hombourger@siemens.com March 5, 2025, 3:08 p.m. UTC | #2
On Wed, 2025-03-05 at 14:57 +0100, Jan Kiszka wrote:
> On 05.03.25 14:11, 'Cedric Hombourger' via isar-users wrote:
> > Several calls to dpkg-deb are made for each single .deb file found
> > in
> > downloads to parse individual fields. This approach is terribly
> > slow
> > when a large amount of .deb files are found. Use apt-ftparchive to
> 
> Out of curiosity: What is roughly the amount of packages where this
> inefficiency becomes visible?

That's a great question, though I may not have a great answer. I
plead guilty to (1) sharing my downloads folder between builds and
(2) doing multiconfig builds. I therefore have ~4.9k .deb packages in
my downloads folder.

I would expect people to start experiencing the cost of deb-src caching
with only a few hundred packages. The current implementation was calling
dpkg-deb 5 times for each package + 1 call to sed for each + 1 call to find
to traverse the deb-src tree to check if the .dsc we want to download
happens to be already there.

> 
> > produce an index of packages that were found and a simple awk
> > script
> > to produce a (sorted) list of source package names and their
> > versions.
> > Also avoid using sed to remove Epoch from the version when we are
> > trying to determine the name of the .dsc file: we instead use a
> > simple
> > POSIX parameter expansion to remove everything up to the first
> > colon
> > 
> > Signed-off-by: Cedric Hombourger <cedric.hombourger@siemens.com>
> > ---
> >  meta/classes/deb-dl-dir.bbclass | 62 +++++++++++++++++++----------
> > ----
> >  1 file changed, 35 insertions(+), 27 deletions(-)
> > 
> > diff --git a/meta/classes/deb-dl-dir.bbclass b/meta/classes/deb-dl-
> > dir.bbclass
> > index 7ebd057e..53ce4538 100644
> > --- a/meta/classes/deb-dl-dir.bbclass
> > +++ b/meta/classes/deb-dl-dir.bbclass
> > @@ -5,23 +5,6 @@
> >  
> >  inherit repository
> >  
> > -is_not_part_of_current_build() {
> > -    local package="$( dpkg-deb --show --showformat '${Package}'
> > "${1}" )"
> > -    local arch="$( dpkg-deb --show --showformat '${Architecture}'
> > "${1}" )"
> > -    local version="$( dpkg-deb --show --showformat '${Version}'
> > "${1}" )"
> > -    # Since we are parsing all the debs in DEBDIR, we can to some
> > extend
> > -    # try to eliminate some debs that are not part of the current
> > multiconfig
> > -    # build using the below method.
> > -    local output="$( grep -xhs ".* status installed
> > ${package}:${arch} ${version}" \
> > -            "${IMAGE_ROOTFS}"/var/log/dpkg.log \
> > -            "${SCHROOT_HOST_DIR}"/var/log/dpkg.log \
> > -            "${SCHROOT_TARGET_DIR}"/var/log/dpkg.log \
> > -            "${SCHROOT_HOST_DIR}"/tmp/dpkg_common.log \
> > -            "${SCHROOT_TARGET_DIR}"/tmp/dpkg_common.log | head -1
> > )"
> > -
> > -    [ -z "${output}" ]
> > -}
> > -
> >  debsrc_do_mounts() {
> >      sudo -s <<EOSUDO
> >      set -e
> > @@ -54,16 +37,41 @@ debsrc_download() {
> >      ( flock 9
> >      set -e
> >      printenv | grep -q BB_VERBOSE_LOGS && set -x
> > -    find "${rootfs}/var/cache/apt/archives/" -maxdepth 1 -type f -
> > iname '*\.deb' | while read package; do
> > -        is_not_part_of_current_build "${package}" && continue
> > -        local src="$( dpkg-deb --show --showformat
> > '${source:Package}' "${package}" )"
> > -        local version="$( dpkg-deb --show --showformat
> > '${source:Version}' "${package}" )"
> > -        local dscname="$(echo ${src}_${version} | sed -e 's/_[0-
> > 9]\+:/_/')"
> > -        local dscfile=$(find "${DEBSRCDIR}"/"${rootfs_distro}" -
> > name "${dscname}.dsc")
> > -        [ -n "$dscfile" ] && continue
> > -
> > -        sudo -E chroot --userspec=$( id -u ):$( id -g ) ${rootfs}
> > \
> > -            sh -c ' mkdir -p "/deb-src/${1}/${2}" && cd "/deb-
> > src/${1}/${2}" && apt-get -y --download-only --only-source source
> > "$2"="$3" ' download-src "${rootfs_distro}" "${src}" "${version}"
> > +
> > +    # Use apt-ftparchive to scan all .deb files found in the
> > download directory
> > +    # and produce an index that we can "parse" with awk. This is
> > much faster
> > +    # than parsing each .deb file individually using dpkg-deb.
> > Lines from the
> > +    # index we need are:
> > +    #
> > +    #    Package: <binary-name>
> > +    #    Version: <binary-version>
> > +    #    Source: <source-name> (<source-version>)
> > +    #
> > +    # If Source is omitted, then <source-name>=<binary-name> and
> > +    # if <source-version> is not specified then it is <binary-
> > version>.
> > +    # The awk script handles these optional fields. It looks for
> > Size: as a
> > +    # trigger to print the source,version tupple
> > +
> > +    apt-ftparchive --md5=no --sha1=no --sha256=no --sha512=no \
> > +                   -a "${DISTRO_ARCH}" packages \
> > +                   "${rootfs}/var/cache/apt/archives" \
> > +    | awk '/^Package:/ { s=$2; }
> > +           /^Version:/ { v=$2; next }
> > +           /^Source:/ { s=$2; if ($3 ~ /^\(/) v=substr($3, 2,
> > length($3)-2) }
> > +           /^Size:/ { print s, v}' \
> > +    | sort -u \
> > +    | while read src version; do
> > +        # Name of the .dsc file does not include Epoch, remove it
> > before checking
> > +        # if sources were already downloaded. Avoid using sed here
> > to reduce the
> > +        # number of processes being spawned by this function: we
> > assume that the
> > +        # version is correctly formatted and simply strip
> > everything up to the
> > +        # first colon
> > +        dscname="${src}_${version#*:}.dsc"
> > +        [ -f
> > "${DEBSRCDIR}"/"${rootfs_distro}"/"${src}"/"${dscname}" ] || {
> > +            # use apt-get source to download sources in DEBSRCDIR
> > +            sudo -E chroot --userspec=$( id -u ):$( id -g )
> > ${rootfs} \
> > +                sh -c ' mkdir -p "/deb-src/${1}/${2}" && cd "/deb-
> > src/${1}/${2}" && apt-get -y --download-only --only-source source
> > "$2"="$3" ' download-src "${rootfs_distro}" "${src}" "${version}"
> 
> Can we rewrap this horribly long line at that chance?
> 
> > +        }
> >      done
> >      ) 9>"${DEBSRCDIR}/${rootfs_distro}.lock"
> >  
> 
> Did you also consider using a python function for the content
> processing? I'm not predicting that this will be faster or nicer or
> whatever, just wondering if it might be while reading the above.
> 
> Jan
>
Niedermayr, BENEDIKT March 5, 2025, 5:22 p.m. UTC | #3
On 05.03.25 14:11, 'Cedric Hombourger' via isar-users wrote:
> Several calls to dpkg-deb are made for each single .deb file found in
> downloads to parse individual fields. This approach is terribly slow
> when a large amount of .deb files are found. Use apt-ftparchive to
> produce an index of packages that were found and a simple awk script
> to produce a (sorted) list of source package names and their versions.
> Also avoid using sed to remove Epoch from the version when we are
> trying to determine the name of the .dsc file: we instead use a simple
> POSIX parameter expansion to remove everything up to the first colon
> 
> Signed-off-by: Cedric Hombourger <cedric.hombourger@siemens.com>
> ---
>   meta/classes/deb-dl-dir.bbclass | 62 +++++++++++++++++++--------------
>   1 file changed, 35 insertions(+), 27 deletions(-)
> 
> diff --git a/meta/classes/deb-dl-dir.bbclass b/meta/classes/deb-dl-dir.bbclass
> index 7ebd057e..53ce4538 100644
> --- a/meta/classes/deb-dl-dir.bbclass
> +++ b/meta/classes/deb-dl-dir.bbclass
> @@ -5,23 +5,6 @@
>   
>   inherit repository
>   
> -is_not_part_of_current_build() {
> -    local package="$( dpkg-deb --show --showformat '${Package}' "${1}" )"
> -    local arch="$( dpkg-deb --show --showformat '${Architecture}' "${1}" )"
> -    local version="$( dpkg-deb --show --showformat '${Version}' "${1}" )"
> -    # Since we are parsing all the debs in DEBDIR, we can to some extend
> -    # try to eliminate some debs that are not part of the current multiconfig
> -    # build using the below method.
> -    local output="$( grep -xhs ".* status installed ${package}:${arch} ${version}" \
> -            "${IMAGE_ROOTFS}"/var/log/dpkg.log \
> -            "${SCHROOT_HOST_DIR}"/var/log/dpkg.log \
> -            "${SCHROOT_TARGET_DIR}"/var/log/dpkg.log \
> -            "${SCHROOT_HOST_DIR}"/tmp/dpkg_common.log \
> -            "${SCHROOT_TARGET_DIR}"/tmp/dpkg_common.log | head -1 )"
> -
> -    [ -z "${output}" ]
> -}
> -
>   debsrc_do_mounts() {
>       sudo -s <<EOSUDO
>       set -e
> @@ -54,16 +37,41 @@ debsrc_download() {
>       ( flock 9
>       set -e
>       printenv | grep -q BB_VERBOSE_LOGS && set -x
> -    find "${rootfs}/var/cache/apt/archives/" -maxdepth 1 -type f -iname '*\.deb' | while read package; do
> -        is_not_part_of_current_build "${package}" && continue
> -        local src="$( dpkg-deb --show --showformat '${source:Package}' "${package}" )"
> -        local version="$( dpkg-deb --show --showformat '${source:Version}' "${package}" )"
> -        local dscname="$(echo ${src}_${version} | sed -e 's/_[0-9]\+:/_/')"
> -        local dscfile=$(find "${DEBSRCDIR}"/"${rootfs_distro}" -name "${dscname}.dsc")
> -        [ -n "$dscfile" ] && continue
> -
> -        sudo -E chroot --userspec=$( id -u ):$( id -g ) ${rootfs} \
> -            sh -c ' mkdir -p "/deb-src/${1}/${2}" && cd "/deb-src/${1}/${2}" && apt-get -y --download-only --only-source source "$2"="$3" ' download-src "${rootfs_distro}" "${src}" "${version}"
Maybe a pointer to my previous patch [1] which addresses this as well 
but with a different motivation. Your patch would also fix a regression 
that has been introduced with mmdebstrap.

At least my patch is causing problems.

[1] https://groups.google.com/g/isar-users/c/IeORW6eiTxI

> +
> +    # Use apt-ftparchive to scan all .deb files found in the download directory
> +    # and produce an index that we can "parse" with awk. This is much faster
> +    # than parsing each .deb file individually using dpkg-deb. Lines from the
> +    # index we need are:
> +    #
> +    #    Package: <binary-name>
> +    #    Version: <binary-version>
> +    #    Source: <source-name> (<source-version>)
> +    #
> +    # If Source is omitted, then <source-name>=<binary-name> and
> +    # if <source-version> is not specified then it is <binary-version>.
> +    # The awk script handles these optional fields. It looks for Size: as a
> +    # trigger to print the source,version tupple
> +
> +    apt-ftparchive --md5=no --sha1=no --sha256=no --sha512=no \
> +                   -a "${DISTRO_ARCH}" packages \
> +                   "${rootfs}/var/cache/apt/archives" \
> +    | awk '/^Package:/ { s=$2; }
> +           /^Version:/ { v=$2; next }
> +           /^Source:/ { s=$2; if ($3 ~ /^\(/) v=substr($3, 2, length($3)-2) }
> +           /^Size:/ { print s, v}' \
> +    | sort -u \
> +    | while read src version; do
> +        # Name of the .dsc file does not include Epoch, remove it before checking
> +        # if sources were already downloaded. Avoid using sed here to reduce the
> +        # number of processes being spawned by this function: we assume that the
> +        # version is correctly formatted and simply strip everything up to the
> +        # first colon
> +        dscname="${src}_${version#*:}.dsc"
> +        [ -f "${DEBSRCDIR}"/"${rootfs_distro}"/"${src}"/"${dscname}" ] || {
> +            # use apt-get source to download sources in DEBSRCDIR
> +            sudo -E chroot --userspec=$( id -u ):$( id -g ) ${rootfs} \
> +                sh -c ' mkdir -p "/deb-src/${1}/${2}" && cd "/deb-src/${1}/${2}" && apt-get -y --download-only --only-source source "$2"="$3" ' download-src "${rootfs_distro}" "${src}" "${version}"
> +        }
>       done
>       ) 9>"${DEBSRCDIR}/${rootfs_distro}.lock"
>
Niedermayr, BENEDIKT March 5, 2025, 5:24 p.m. UTC | #4
On 05.03.25 18:22, 'Niedermayr, BENEDIKT' via isar-users wrote:
> On 05.03.25 14:11, 'Cedric Hombourger' via isar-users wrote:
>> Several calls to dpkg-deb are made for each single .deb file found in
>> downloads to parse individual fields. This approach is terribly slow
>> when a large amount of .deb files are found. Use apt-ftparchive to
>> produce an index of packages that were found and a simple awk script
>> to produce a (sorted) list of source package names and their versions.
>> Also avoid using sed to remove Epoch from the version when we are
>> trying to determine the name of the .dsc file: we instead use a simple
>> POSIX parameter expansion to remove everything up to the first colon
>>
>> Signed-off-by: Cedric Hombourger <cedric.hombourger@siemens.com>
>> ---
>>    meta/classes/deb-dl-dir.bbclass | 62 +++++++++++++++++++--------------
>>    1 file changed, 35 insertions(+), 27 deletions(-)
>>
>> diff --git a/meta/classes/deb-dl-dir.bbclass b/meta/classes/deb-dl-dir.bbclass
>> index 7ebd057e..53ce4538 100644
>> --- a/meta/classes/deb-dl-dir.bbclass
>> +++ b/meta/classes/deb-dl-dir.bbclass
>> @@ -5,23 +5,6 @@
>>    
>>    inherit repository
>>    
>> -is_not_part_of_current_build() {
>> -    local package="$( dpkg-deb --show --showformat '${Package}' "${1}" )"
>> -    local arch="$( dpkg-deb --show --showformat '${Architecture}' "${1}" )"
>> -    local version="$( dpkg-deb --show --showformat '${Version}' "${1}" )"
>> -    # Since we are parsing all the debs in DEBDIR, we can to some extend
>> -    # try to eliminate some debs that are not part of the current multiconfig
>> -    # build using the below method.
>> -    local output="$( grep -xhs ".* status installed ${package}:${arch} ${version}" \
>> -            "${IMAGE_ROOTFS}"/var/log/dpkg.log \
>> -            "${SCHROOT_HOST_DIR}"/var/log/dpkg.log \
>> -            "${SCHROOT_TARGET_DIR}"/var/log/dpkg.log \
>> -            "${SCHROOT_HOST_DIR}"/tmp/dpkg_common.log \
>> -            "${SCHROOT_TARGET_DIR}"/tmp/dpkg_common.log | head -1 )"
>> -
>> -    [ -z "${output}" ]
>> -}
>> -
>>    debsrc_do_mounts() {
>>        sudo -s <<EOSUDO
>>        set -e
>> @@ -54,16 +37,41 @@ debsrc_download() {
>>        ( flock 9
>>        set -e
>>        printenv | grep -q BB_VERBOSE_LOGS && set -x
>> -    find "${rootfs}/var/cache/apt/archives/" -maxdepth 1 -type f -iname '*\.deb' | while read package; do
>> -        is_not_part_of_current_build "${package}" && continue
>> -        local src="$( dpkg-deb --show --showformat '${source:Package}' "${package}" )"
>> -        local version="$( dpkg-deb --show --showformat '${source:Version}' "${package}" )"
>> -        local dscname="$(echo ${src}_${version} | sed -e 's/_[0-9]\+:/_/')"
>> -        local dscfile=$(find "${DEBSRCDIR}"/"${rootfs_distro}" -name "${dscname}.dsc")
>> -        [ -n "$dscfile" ] && continue
>> -
>> -        sudo -E chroot --userspec=$( id -u ):$( id -g ) ${rootfs} \
>> -            sh -c ' mkdir -p "/deb-src/${1}/${2}" && cd "/deb-src/${1}/${2}" && apt-get -y --download-only --only-source source "$2"="$3" ' download-src "${rootfs_distro}" "${src}" "${version}"
> Maybe a pointer to my previous patch [1] which addresses this as well
> but with a different motivation. Your patch would also fix a regression
> that has been introduced with mmdebstrap.
> 
> At least my patch is causing problems.
> 
> [1] https://groups.google.com/g/isar-users/c/IeORW6eiTxI

Ok just saw this [1], so you might be already aware of it.

[1] https://groups.google.com/g/isar-users/c/8QstIaudyts

Regards,
Benedikt
> 
>> +
>> +    # Use apt-ftparchive to scan all .deb files found in the download directory
>> +    # and produce an index that we can "parse" with awk. This is much faster
>> +    # than parsing each .deb file individually using dpkg-deb. Lines from the
>> +    # index we need are:
>> +    #
>> +    #    Package: <binary-name>
>> +    #    Version: <binary-version>
>> +    #    Source: <source-name> (<source-version>)
>> +    #
>> +    # If Source is omitted, then <source-name>=<binary-name> and
>> +    # if <source-version> is not specified then it is <binary-version>.
>> +    # The awk script handles these optional fields. It looks for Size: as a
>> +    # trigger to print the source,version tupple
>> +
>> +    apt-ftparchive --md5=no --sha1=no --sha256=no --sha512=no \
>> +                   -a "${DISTRO_ARCH}" packages \
>> +                   "${rootfs}/var/cache/apt/archives" \
>> +    | awk '/^Package:/ { s=$2; }
>> +           /^Version:/ { v=$2; next }
>> +           /^Source:/ { s=$2; if ($3 ~ /^\(/) v=substr($3, 2, length($3)-2) }
>> +           /^Size:/ { print s, v}' \
>> +    | sort -u \
>> +    | while read src version; do
>> +        # Name of the .dsc file does not include Epoch, remove it before checking
>> +        # if sources were already downloaded. Avoid using sed here to reduce the
>> +        # number of processes being spawned by this function: we assume that the
>> +        # version is correctly formatted and simply strip everything up to the
>> +        # first colon
>> +        dscname="${src}_${version#*:}.dsc"
>> +        [ -f "${DEBSRCDIR}"/"${rootfs_distro}"/"${src}"/"${dscname}" ] || {
>> +            # use apt-get source to download sources in DEBSRCDIR
>> +            sudo -E chroot --userspec=$( id -u ):$( id -g ) ${rootfs} \
>> +                sh -c ' mkdir -p "/deb-src/${1}/${2}" && cd "/deb-src/${1}/${2}" && apt-get -y --download-only --only-source source "$2"="$3" ' download-src "${rootfs_distro}" "${src}" "${version}"
>> +        }
>>        done
>>        ) 9>"${DEBSRCDIR}/${rootfs_distro}.lock"
>>    
>
Srinuvasan Arjunan March 10, 2025, 11:06 a.m. UTC | #5
On Wednesday, March 5, 2025 at 6:42:05 PM UTC+5:30 Cedric Hombourger wrote:

Several calls to dpkg-deb are made for each single .deb file found in 
downloads to parse individual fields. This approach is terribly slow 
when a large amount of .deb files are found. Use apt-ftparchive to 
produce an index of packages that were found and a simple awk script 
to produce a (sorted) list of source package names and their versions. 
Also avoid using sed to remove Epoch from the version when we are 
trying to determine the name of the .dsc file: we instead use a simple 
POSIX parameter expansion to remove everything up to the first colon 

Signed-off-by: Cedric Hombourger <cedric.h...@siemens.com> 
--- 
meta/classes/deb-dl-dir.bbclass | 62 +++++++++++++++++++-------------- 
1 file changed, 35 insertions(+), 27 deletions(-) 

diff --git a/meta/classes/deb-dl-dir.bbclass 
b/meta/classes/deb-dl-dir.bbclass 
index 7ebd057e..53ce4538 100644 
--- a/meta/classes/deb-dl-dir.bbclass 
+++ b/meta/classes/deb-dl-dir.bbclass 
@@ -5,23 +5,6 @@ 

inherit repository 

-is_not_part_of_current_build() { 
- local package="$( dpkg-deb --show --showformat '${Package}' "${1}" )" 
- local arch="$( dpkg-deb --show --showformat '${Architecture}' "${1}" )" 
- local version="$( dpkg-deb --show --showformat '${Version}' "${1}" )" 
- # Since we are parsing all the debs in DEBDIR, we can to some extend 
- # try to eliminate some debs that are not part of the current multiconfig 
- # build using the below method. 
- local output="$( grep -xhs ".* status installed ${package}:${arch} 
${version}" \ 
- "${IMAGE_ROOTFS}"/var/log/dpkg.log \ 
- "${SCHROOT_HOST_DIR}"/var/log/dpkg.log \ 
- "${SCHROOT_TARGET_DIR}"/var/log/dpkg.log \ 
- "${SCHROOT_HOST_DIR}"/tmp/dpkg_common.log \ 
- "${SCHROOT_TARGET_DIR}"/tmp/dpkg_common.log | head -1 )" 
- 
- [ -z "${output}" ] 
-} 
- 
debsrc_do_mounts() { 
sudo -s <<EOSUDO 
set -e 
@@ -54,16 +37,41 @@ debsrc_download() { 
( flock 9 
set -e 
printenv | grep -q BB_VERBOSE_LOGS && set -x 
- find "${rootfs}/var/cache/apt/archives/" -maxdepth 1 -type f -iname 
'*\.deb' | while read package; do 
- is_not_part_of_current_build "${package}" && continue 
- local src="$( dpkg-deb --show --showformat '${source:Package}' 
"${package}" )" 
- local version="$( dpkg-deb --show --showformat '${source:Version}' 
"${package}" )" 
- local dscname="$(echo ${src}_${version} | sed -e 's/_[0-9]\+:/_/')" 
- local dscfile=$(find "${DEBSRCDIR}"/"${rootfs_distro}" -name 
"${dscname}.dsc") 
- [ -n "$dscfile" ] && continue 
- 
- sudo -E chroot --userspec=$( id -u ):$( id -g ) ${rootfs} \ 
- sh -c ' mkdir -p "/deb-src/${1}/${2}" && cd "/deb-src/${1}/${2}" && 
apt-get -y --download-only --only-source source "$2"="$3" ' download-src 
"${rootfs_distro}" "${src}" "${version}" 
+ 
+ # Use apt-ftparchive to scan all .deb files found in the download 
directory 
+ # and produce an index that we can "parse" with awk. This is much faster 
+ # than parsing each .deb file individually using dpkg-deb. Lines from the 
+ # index we need are: 
+ # 
+ # Package: <binary-name> 
+ # Version: <binary-version> 
+ # Source: <source-name> (<source-version>) 
+ # 
+ # If Source is omitted, then <source-name>=<binary-name> and 
+ # if <source-version> is not specified then it is <binary-version>. 
+ # The awk script handles these optional fields. It looks for Size: as a 
+ # trigger to print the source,version tupple 
+ 
+ apt-ftparchive --md5=no --sha1=no --sha256=no --sha512=no \ 
+ -a "${DISTRO_ARCH}" packages \


  Hi Cedric,

  I took this patch for my deb-src-caching issue [1]; now I am able to
download deb-src for bootstrap and image related packages.
  The only missing part is the imager_install related packages; I am going
to send patches based on your patch.

  But I found one issue with armhf base-apt builds in ISAR: the
help2man and texinfo deb-src packages are missing.
  When we generate the index using apt-ftparchive --md5=no --sha1=no
--sha256=no --sha512=no -a "${DISTRO_ARCH}",
  we pass -a "${DISTRO_ARCH}", which in this case is armhf, but the help2man
and texinfo packages are only available for the amd64 arch (this might
  be due to the ISAR_CROSS_COMPILE configuration), not armhf. Hence the index
does not contain those packages, and for this reason we are not able to
  download the source packages for them.

   I would suggest removing the -a "${DISTRO_ARCH}" option; we are getting
the final list with sort -u anyway.
   I validated this without the -a option and it works as expected.

   [1]: https://groups.google.com/g/isar-users/c/8QstIaudyts

 Please share your thoughts.


+ "${rootfs}/var/cache/apt/archives" \ 
+ | awk '/^Package:/ { s=$2; } 
+ /^Version:/ { v=$2; next } 
+ /^Source:/ { s=$2; if ($3 ~ /^\(/) v=substr($3, 2, length($3)-2) } 
+ /^Size:/ { print s, v}' \ 
+ | sort -u \ 
+ | while read src version; do 
+ # Name of the .dsc file does not include Epoch, remove it before checking 
+ # if sources were already downloaded. Avoid using sed here to reduce the 
+ # number of processes being spawned by this function: we assume that the 
+ # version is correctly formatted and simply strip everything up to the 
+ # first colon 
+ dscname="${src}_${version#*:}.dsc" 
+ [ -f "${DEBSRCDIR}"/"${rootfs_distro}"/"${src}"/"${dscname}" ] || { 
+ # use apt-get source to download sources in DEBSRCDIR 
+ sudo -E chroot --userspec=$( id -u ):$( id -g ) ${rootfs} \ 
+ sh -c ' mkdir -p "/deb-src/${1}/${2}" && cd "/deb-src/${1}/${2}" && 
apt-get -y --download-only --only-source source "$2"="$3" ' download-src 
"${rootfs_distro}" "${src}" "${version}" 
+ } 
done 
) 9>"${DEBSRCDIR}/${rootfs_distro}.lock"
Uladzimir Bely March 27, 2025, 10:34 a.m. UTC | #6
On Sat, 2025-03-22 at 07:15 +0100, Cedric Hombourger wrote:
> Changes since v1:
>  * v1 had is_not_part_of_current_build() removed. It turns out that
>    we better check what .deb files get imported in the apt cache
>    of our rootfs to make sure that we only try to fetch sources for
>    packages we know. This is now achieved with apt-cache dumpavail
>    to obtain a list of known source packages.
> 
> Cedric Hombourger (1):
>   deb-dl-dir: remove excessive calls to dpkg-deb in debsrc_download
> 
>  meta/classes/deb-dl-dir.bbclass | 78 +++++++++++++++++++++----------
> --
>  1 file changed, 51 insertions(+), 27 deletions(-)

Applied to next, thanks.

Patch

diff --git a/meta/classes/deb-dl-dir.bbclass b/meta/classes/deb-dl-dir.bbclass
index 7ebd057e..53ce4538 100644
--- a/meta/classes/deb-dl-dir.bbclass
+++ b/meta/classes/deb-dl-dir.bbclass
@@ -5,23 +5,6 @@ 
 
 inherit repository
 
-is_not_part_of_current_build() {
-    local package="$( dpkg-deb --show --showformat '${Package}' "${1}" )"
-    local arch="$( dpkg-deb --show --showformat '${Architecture}' "${1}" )"
-    local version="$( dpkg-deb --show --showformat '${Version}' "${1}" )"
-    # Since we are parsing all the debs in DEBDIR, we can to some extend
-    # try to eliminate some debs that are not part of the current multiconfig
-    # build using the below method.
-    local output="$( grep -xhs ".* status installed ${package}:${arch} ${version}" \
-            "${IMAGE_ROOTFS}"/var/log/dpkg.log \
-            "${SCHROOT_HOST_DIR}"/var/log/dpkg.log \
-            "${SCHROOT_TARGET_DIR}"/var/log/dpkg.log \
-            "${SCHROOT_HOST_DIR}"/tmp/dpkg_common.log \
-            "${SCHROOT_TARGET_DIR}"/tmp/dpkg_common.log | head -1 )"
-
-    [ -z "${output}" ]
-}
-
 debsrc_do_mounts() {
     sudo -s <<EOSUDO
     set -e
@@ -54,16 +37,41 @@  debsrc_download() {
     ( flock 9
     set -e
     printenv | grep -q BB_VERBOSE_LOGS && set -x
-    find "${rootfs}/var/cache/apt/archives/" -maxdepth 1 -type f -iname '*\.deb' | while read package; do
-        is_not_part_of_current_build "${package}" && continue
-        local src="$( dpkg-deb --show --showformat '${source:Package}' "${package}" )"
-        local version="$( dpkg-deb --show --showformat '${source:Version}' "${package}" )"
-        local dscname="$(echo ${src}_${version} | sed -e 's/_[0-9]\+:/_/')"
-        local dscfile=$(find "${DEBSRCDIR}"/"${rootfs_distro}" -name "${dscname}.dsc")
-        [ -n "$dscfile" ] && continue
-
-        sudo -E chroot --userspec=$( id -u ):$( id -g ) ${rootfs} \
-            sh -c ' mkdir -p "/deb-src/${1}/${2}" && cd "/deb-src/${1}/${2}" && apt-get -y --download-only --only-source source "$2"="$3" ' download-src "${rootfs_distro}" "${src}" "${version}"
+
+    # Use apt-ftparchive to scan all .deb files found in the download directory
+    # and produce an index that we can "parse" with awk. This is much faster
+    # than parsing each .deb file individually using dpkg-deb. Lines from the
+    # index we need are:
+    #
+    #    Package: <binary-name>
+    #    Version: <binary-version>
+    #    Source: <source-name> (<source-version>)
+    #
+    # If Source is omitted, then <source-name>=<binary-name> and
+    # if <source-version> is not specified then it is <binary-version>.
+    # The awk script handles these optional fields. It looks for Size: as a
+    # trigger to print the source,version tupple
+
+    apt-ftparchive --md5=no --sha1=no --sha256=no --sha512=no \
+                   -a "${DISTRO_ARCH}" packages \
+                   "${rootfs}/var/cache/apt/archives" \
+    | awk '/^Package:/ { s=$2; }
+           /^Version:/ { v=$2; next }
+           /^Source:/ { s=$2; if ($3 ~ /^\(/) v=substr($3, 2, length($3)-2) }
+           /^Size:/ { print s, v}' \
+    | sort -u \
+    | while read src version; do
+        # Name of the .dsc file does not include Epoch, remove it before checking
+        # if sources were already downloaded. Avoid using sed here to reduce the
+        # number of processes being spawned by this function: we assume that the
+        # version is correctly formatted and simply strip everything up to the
+        # first colon
+        dscname="${src}_${version#*:}.dsc"
+        [ -f "${DEBSRCDIR}"/"${rootfs_distro}"/"${src}"/"${dscname}" ] || {
+            # use apt-get source to download sources in DEBSRCDIR
+            sudo -E chroot --userspec=$( id -u ):$( id -g ) ${rootfs} \
+                sh -c ' mkdir -p "/deb-src/${1}/${2}" && cd "/deb-src/${1}/${2}" && apt-get -y --download-only --only-source source "$2"="$3" ' download-src "${rootfs_distro}" "${src}" "${version}"
+        }
     done
     ) 9>"${DEBSRCDIR}/${rootfs_distro}.lock"