deb-dl-dir: remove excessive calls to dpkg-deb in debsrc_download

Message ID 20250305131142.2717692-1-cedric.hombourger@siemens.com
State Superseded, archived
Headers show
Series deb-dl-dir: remove excessive calls to dpkg-deb in debsrc_download | expand

Commit Message

Cedric Hombourger March 5, 2025, 1:11 p.m. UTC
Several calls to dpkg-deb are made for each single .deb file found in
downloads to parse individual fields. This approach is terribly slow
when a large amount of .deb files are found. Use apt-ftparchive to
produce an index of packages that were found and a simple awk script
to produce a (sorted) list of source package names and their versions.
Also avoid using sed to remove Epoch from the version when we are
trying to determine the name of the .dsc file: we instead use a simple
POSIX parameter expansion to remove everything up to the first colon

Signed-off-by: Cedric Hombourger <cedric.hombourger@siemens.com>
---
 meta/classes/deb-dl-dir.bbclass | 62 +++++++++++++++++++--------------
 1 file changed, 35 insertions(+), 27 deletions(-)

Comments

Jan Kiszka March 5, 2025, 1:57 p.m. UTC | #1
On 05.03.25 14:11, 'Cedric Hombourger' via isar-users wrote:
> Several calls to dpkg-deb are made for each single .deb file found in
> downloads to parse individual fields. This approach is terribly slow
> when a large amount of .deb files are found. Use apt-ftparchive to

Out of curiosity: What is roughly the amount of packages where this
inefficiency becomes visible?

> produce an index of packages that were found and a simple awk script
> to produce a (sorted) list of source package names and their versions.
> Also avoid using sed to remove Epoch from the version when we are
> trying to determine the name of the .dsc file: we instead use a simple
> POSIX parameter expansion to remove everything up to the first colon
> 
> Signed-off-by: Cedric Hombourger <cedric.hombourger@siemens.com>
> ---
>  meta/classes/deb-dl-dir.bbclass | 62 +++++++++++++++++++--------------
>  1 file changed, 35 insertions(+), 27 deletions(-)
> 
> diff --git a/meta/classes/deb-dl-dir.bbclass b/meta/classes/deb-dl-dir.bbclass
> index 7ebd057e..53ce4538 100644
> --- a/meta/classes/deb-dl-dir.bbclass
> +++ b/meta/classes/deb-dl-dir.bbclass
> @@ -5,23 +5,6 @@
>  
>  inherit repository
>  
> -is_not_part_of_current_build() {
> -    local package="$( dpkg-deb --show --showformat '${Package}' "${1}" )"
> -    local arch="$( dpkg-deb --show --showformat '${Architecture}' "${1}" )"
> -    local version="$( dpkg-deb --show --showformat '${Version}' "${1}" )"
> -    # Since we are parsing all the debs in DEBDIR, we can to some extend
> -    # try to eliminate some debs that are not part of the current multiconfig
> -    # build using the below method.
> -    local output="$( grep -xhs ".* status installed ${package}:${arch} ${version}" \
> -            "${IMAGE_ROOTFS}"/var/log/dpkg.log \
> -            "${SCHROOT_HOST_DIR}"/var/log/dpkg.log \
> -            "${SCHROOT_TARGET_DIR}"/var/log/dpkg.log \
> -            "${SCHROOT_HOST_DIR}"/tmp/dpkg_common.log \
> -            "${SCHROOT_TARGET_DIR}"/tmp/dpkg_common.log | head -1 )"
> -
> -    [ -z "${output}" ]
> -}
> -
>  debsrc_do_mounts() {
>      sudo -s <<EOSUDO
>      set -e
> @@ -54,16 +37,41 @@ debsrc_download() {
>      ( flock 9
>      set -e
>      printenv | grep -q BB_VERBOSE_LOGS && set -x
> -    find "${rootfs}/var/cache/apt/archives/" -maxdepth 1 -type f -iname '*\.deb' | while read package; do
> -        is_not_part_of_current_build "${package}" && continue
> -        local src="$( dpkg-deb --show --showformat '${source:Package}' "${package}" )"
> -        local version="$( dpkg-deb --show --showformat '${source:Version}' "${package}" )"
> -        local dscname="$(echo ${src}_${version} | sed -e 's/_[0-9]\+:/_/')"
> -        local dscfile=$(find "${DEBSRCDIR}"/"${rootfs_distro}" -name "${dscname}.dsc")
> -        [ -n "$dscfile" ] && continue
> -
> -        sudo -E chroot --userspec=$( id -u ):$( id -g ) ${rootfs} \
> -            sh -c ' mkdir -p "/deb-src/${1}/${2}" && cd "/deb-src/${1}/${2}" && apt-get -y --download-only --only-source source "$2"="$3" ' download-src "${rootfs_distro}" "${src}" "${version}"
> +
> +    # Use apt-ftparchive to scan all .deb files found in the download directory
> +    # and produce an index that we can "parse" with awk. This is much faster
> +    # than parsing each .deb file individually using dpkg-deb. Lines from the
> +    # index we need are:
> +    #
> +    #    Package: <binary-name>
> +    #    Version: <binary-version>
> +    #    Source: <source-name> (<source-version>)
> +    #
> +    # If Source is omitted, then <source-name>=<binary-name> and
> +    # if <source-version> is not specified then it is <binary-version>.
> +    # The awk script handles these optional fields. It looks for Size: as a
> +    # trigger to print the source,version tupple
> +
> +    apt-ftparchive --md5=no --sha1=no --sha256=no --sha512=no \
> +                   -a "${DISTRO_ARCH}" packages \
> +                   "${rootfs}/var/cache/apt/archives" \
> +    | awk '/^Package:/ { s=$2; }
> +           /^Version:/ { v=$2; next }
> +           /^Source:/ { s=$2; if ($3 ~ /^\(/) v=substr($3, 2, length($3)-2) }
> +           /^Size:/ { print s, v}' \
> +    | sort -u \
> +    | while read src version; do
> +        # Name of the .dsc file does not include Epoch, remove it before checking
> +        # if sources were already downloaded. Avoid using sed here to reduce the
> +        # number of processes being spawned by this function: we assume that the
> +        # version is correctly formatted and simply strip everything up to the
> +        # first colon
> +        dscname="${src}_${version#*:}.dsc"
> +        [ -f "${DEBSRCDIR}"/"${rootfs_distro}"/"${src}"/"${dscname}" ] || {
> +            # use apt-get source to download sources in DEBSRCDIR
> +            sudo -E chroot --userspec=$( id -u ):$( id -g ) ${rootfs} \
> +                sh -c ' mkdir -p "/deb-src/${1}/${2}" && cd "/deb-src/${1}/${2}" && apt-get -y --download-only --only-source source "$2"="$3" ' download-src "${rootfs_distro}" "${src}" "${version}"

Can we rewrap this horribly long line while we are at it?

> +        }
>      done
>      ) 9>"${DEBSRCDIR}/${rootfs_distro}.lock"
>  

Did you also consider using a python function for the content
processing? I'm not predicting that this will be faster or nicer or
whatever, just wondering if it might be while reading the above.

Jan
Cedric Hombourger March 5, 2025, 3:08 p.m. UTC | #2
On Wed, 2025-03-05 at 14:57 +0100, Jan Kiszka wrote:
> On 05.03.25 14:11, 'Cedric Hombourger' via isar-users wrote:
> > Several calls to dpkg-deb are made for each single .deb file found
> > in
> > downloads to parse individual fields. This approach is terribly
> > slow
> > when a large amount of .deb files are found. Use apt-ftparchive to
> 
> Out of curiosity: What is roughly the amount of packages where this
> inefficiency becomes visible?

That's a great question, though I may not have a great answer. I am
pleading guilty to (1) sharing my downloads folder between builds and
(2) doing multiconfig builds. I therefore have ~4.9k .deb packages in
my downloads folder.

I would expect people to start experiencing the cost of deb-src caching
with only a few hundred packages. The current implementation was calling
dpkg-deb 5 times for each package + 1 call to sed for each + 1 call to
find to traverse the deb-src tree to check if the .dsc we want to
download happens to be already there.

> 
> > produce an index of packages that were found and a simple awk
> > script
> > to produce a (sorted) list of source package names and their
> > versions.
> > Also avoid using sed to remove Epoch from the version when we are
> > trying to determine the name of the .dsc file: we instead use a
> > simple
> > POSIX parameter expansion to remove everything up to the first
> > colon
> > 
> > Signed-off-by: Cedric Hombourger <cedric.hombourger@siemens.com>
> > ---
> >  meta/classes/deb-dl-dir.bbclass | 62 +++++++++++++++++++----------
> > ----
> >  1 file changed, 35 insertions(+), 27 deletions(-)
> > 
> > diff --git a/meta/classes/deb-dl-dir.bbclass b/meta/classes/deb-dl-
> > dir.bbclass
> > index 7ebd057e..53ce4538 100644
> > --- a/meta/classes/deb-dl-dir.bbclass
> > +++ b/meta/classes/deb-dl-dir.bbclass
> > @@ -5,23 +5,6 @@
> >  
> >  inherit repository
> >  
> > -is_not_part_of_current_build() {
> > -    local package="$( dpkg-deb --show --showformat '${Package}'
> > "${1}" )"
> > -    local arch="$( dpkg-deb --show --showformat '${Architecture}'
> > "${1}" )"
> > -    local version="$( dpkg-deb --show --showformat '${Version}'
> > "${1}" )"
> > -    # Since we are parsing all the debs in DEBDIR, we can to some
> > extend
> > -    # try to eliminate some debs that are not part of the current
> > multiconfig
> > -    # build using the below method.
> > -    local output="$( grep -xhs ".* status installed
> > ${package}:${arch} ${version}" \
> > -            "${IMAGE_ROOTFS}"/var/log/dpkg.log \
> > -            "${SCHROOT_HOST_DIR}"/var/log/dpkg.log \
> > -            "${SCHROOT_TARGET_DIR}"/var/log/dpkg.log \
> > -            "${SCHROOT_HOST_DIR}"/tmp/dpkg_common.log \
> > -            "${SCHROOT_TARGET_DIR}"/tmp/dpkg_common.log | head -1
> > )"
> > -
> > -    [ -z "${output}" ]
> > -}
> > -
> >  debsrc_do_mounts() {
> >      sudo -s <<EOSUDO
> >      set -e
> > @@ -54,16 +37,41 @@ debsrc_download() {
> >      ( flock 9
> >      set -e
> >      printenv | grep -q BB_VERBOSE_LOGS && set -x
> > -    find "${rootfs}/var/cache/apt/archives/" -maxdepth 1 -type f -
> > iname '*\.deb' | while read package; do
> > -        is_not_part_of_current_build "${package}" && continue
> > -        local src="$( dpkg-deb --show --showformat
> > '${source:Package}' "${package}" )"
> > -        local version="$( dpkg-deb --show --showformat
> > '${source:Version}' "${package}" )"
> > -        local dscname="$(echo ${src}_${version} | sed -e 's/_[0-
> > 9]\+:/_/')"
> > -        local dscfile=$(find "${DEBSRCDIR}"/"${rootfs_distro}" -
> > name "${dscname}.dsc")
> > -        [ -n "$dscfile" ] && continue
> > -
> > -        sudo -E chroot --userspec=$( id -u ):$( id -g ) ${rootfs}
> > \
> > -            sh -c ' mkdir -p "/deb-src/${1}/${2}" && cd "/deb-
> > src/${1}/${2}" && apt-get -y --download-only --only-source source
> > "$2"="$3" ' download-src "${rootfs_distro}" "${src}" "${version}"
> > +
> > +    # Use apt-ftparchive to scan all .deb files found in the
> > download directory
> > +    # and produce an index that we can "parse" with awk. This is
> > much faster
> > +    # than parsing each .deb file individually using dpkg-deb.
> > Lines from the
> > +    # index we need are:
> > +    #
> > +    #    Package: <binary-name>
> > +    #    Version: <binary-version>
> > +    #    Source: <source-name> (<source-version>)
> > +    #
> > +    # If Source is omitted, then <source-name>=<binary-name> and
> > +    # if <source-version> is not specified then it is <binary-
> > version>.
> > +    # The awk script handles these optional fields. It looks for
> > Size: as a
> > +    # trigger to print the source,version tupple
> > +
> > +    apt-ftparchive --md5=no --sha1=no --sha256=no --sha512=no \
> > +                   -a "${DISTRO_ARCH}" packages \
> > +                   "${rootfs}/var/cache/apt/archives" \
> > +    | awk '/^Package:/ { s=$2; }
> > +           /^Version:/ { v=$2; next }
> > +           /^Source:/ { s=$2; if ($3 ~ /^\(/) v=substr($3, 2,
> > length($3)-2) }
> > +           /^Size:/ { print s, v}' \
> > +    | sort -u \
> > +    | while read src version; do
> > +        # Name of the .dsc file does not include Epoch, remove it
> > before checking
> > +        # if sources were already downloaded. Avoid using sed here
> > to reduce the
> > +        # number of processes being spawned by this function: we
> > assume that the
> > +        # version is correctly formatted and simply strip
> > everything up to the
> > +        # first colon
> > +        dscname="${src}_${version#*:}.dsc"
> > +        [ -f
> > "${DEBSRCDIR}"/"${rootfs_distro}"/"${src}"/"${dscname}" ] || {
> > +            # use apt-get source to download sources in DEBSRCDIR
> > +            sudo -E chroot --userspec=$( id -u ):$( id -g )
> > ${rootfs} \
> > +                sh -c ' mkdir -p "/deb-src/${1}/${2}" && cd "/deb-
> > src/${1}/${2}" && apt-get -y --download-only --only-source source
> > "$2"="$3" ' download-src "${rootfs_distro}" "${src}" "${version}"
> 
> Can we rewrap this horribly long line at that chance?
> 
> > +        }
> >      done
> >      ) 9>"${DEBSRCDIR}/${rootfs_distro}.lock"
> >  
> 
> Did you also consider using a python function for the content
> processing? I'm not predicting that this will be faster or nicer or
> whatever, just wondering if it might be while reading the above.
> 
> Jan
>
Niedermayr, BENEDIKT March 5, 2025, 5:22 p.m. UTC | #3
On 05.03.25 14:11, 'Cedric Hombourger' via isar-users wrote:
> Several calls to dpkg-deb are made for each single .deb file found in
> downloads to parse individual fields. This approach is terribly slow
> when a large amount of .deb files are found. Use apt-ftparchive to
> produce an index of packages that were found and a simple awk script
> to produce a (sorted) list of source package names and their versions.
> Also avoid using sed to remove Epoch from the version when we are
> trying to determine the name of the .dsc file: we instead use a simple
> POSIX parameter expansion to remove everything up to the first colon
> 
> Signed-off-by: Cedric Hombourger <cedric.hombourger@siemens.com>
> ---
>   meta/classes/deb-dl-dir.bbclass | 62 +++++++++++++++++++--------------
>   1 file changed, 35 insertions(+), 27 deletions(-)
> 
> diff --git a/meta/classes/deb-dl-dir.bbclass b/meta/classes/deb-dl-dir.bbclass
> index 7ebd057e..53ce4538 100644
> --- a/meta/classes/deb-dl-dir.bbclass
> +++ b/meta/classes/deb-dl-dir.bbclass
> @@ -5,23 +5,6 @@
>   
>   inherit repository
>   
> -is_not_part_of_current_build() {
> -    local package="$( dpkg-deb --show --showformat '${Package}' "${1}" )"
> -    local arch="$( dpkg-deb --show --showformat '${Architecture}' "${1}" )"
> -    local version="$( dpkg-deb --show --showformat '${Version}' "${1}" )"
> -    # Since we are parsing all the debs in DEBDIR, we can to some extend
> -    # try to eliminate some debs that are not part of the current multiconfig
> -    # build using the below method.
> -    local output="$( grep -xhs ".* status installed ${package}:${arch} ${version}" \
> -            "${IMAGE_ROOTFS}"/var/log/dpkg.log \
> -            "${SCHROOT_HOST_DIR}"/var/log/dpkg.log \
> -            "${SCHROOT_TARGET_DIR}"/var/log/dpkg.log \
> -            "${SCHROOT_HOST_DIR}"/tmp/dpkg_common.log \
> -            "${SCHROOT_TARGET_DIR}"/tmp/dpkg_common.log | head -1 )"
> -
> -    [ -z "${output}" ]
> -}
> -
>   debsrc_do_mounts() {
>       sudo -s <<EOSUDO
>       set -e
> @@ -54,16 +37,41 @@ debsrc_download() {
>       ( flock 9
>       set -e
>       printenv | grep -q BB_VERBOSE_LOGS && set -x
> -    find "${rootfs}/var/cache/apt/archives/" -maxdepth 1 -type f -iname '*\.deb' | while read package; do
> -        is_not_part_of_current_build "${package}" && continue
> -        local src="$( dpkg-deb --show --showformat '${source:Package}' "${package}" )"
> -        local version="$( dpkg-deb --show --showformat '${source:Version}' "${package}" )"
> -        local dscname="$(echo ${src}_${version} | sed -e 's/_[0-9]\+:/_/')"
> -        local dscfile=$(find "${DEBSRCDIR}"/"${rootfs_distro}" -name "${dscname}.dsc")
> -        [ -n "$dscfile" ] && continue
> -
> -        sudo -E chroot --userspec=$( id -u ):$( id -g ) ${rootfs} \
> -            sh -c ' mkdir -p "/deb-src/${1}/${2}" && cd "/deb-src/${1}/${2}" && apt-get -y --download-only --only-source source "$2"="$3" ' download-src "${rootfs_distro}" "${src}" "${version}"
Maybe a pointer to my previous patch [1] which addresses this as well 
but with a different motivation. Your patch would also fix a regression 
that has been introduced with mmdebstrap.

At least my patch is causing problems.

[1] https://groups.google.com/g/isar-users/c/IeORW6eiTxI

> +
> +    # Use apt-ftparchive to scan all .deb files found in the download directory
> +    # and produce an index that we can "parse" with awk. This is much faster
> +    # than parsing each .deb file individually using dpkg-deb. Lines from the
> +    # index we need are:
> +    #
> +    #    Package: <binary-name>
> +    #    Version: <binary-version>
> +    #    Source: <source-name> (<source-version>)
> +    #
> +    # If Source is omitted, then <source-name>=<binary-name> and
> +    # if <source-version> is not specified then it is <binary-version>.
> +    # The awk script handles these optional fields. It looks for Size: as a
> +    # trigger to print the source,version tupple
> +
> +    apt-ftparchive --md5=no --sha1=no --sha256=no --sha512=no \
> +                   -a "${DISTRO_ARCH}" packages \
> +                   "${rootfs}/var/cache/apt/archives" \
> +    | awk '/^Package:/ { s=$2; }
> +           /^Version:/ { v=$2; next }
> +           /^Source:/ { s=$2; if ($3 ~ /^\(/) v=substr($3, 2, length($3)-2) }
> +           /^Size:/ { print s, v}' \
> +    | sort -u \
> +    | while read src version; do
> +        # Name of the .dsc file does not include Epoch, remove it before checking
> +        # if sources were already downloaded. Avoid using sed here to reduce the
> +        # number of processes being spawned by this function: we assume that the
> +        # version is correctly formatted and simply strip everything up to the
> +        # first colon
> +        dscname="${src}_${version#*:}.dsc"
> +        [ -f "${DEBSRCDIR}"/"${rootfs_distro}"/"${src}"/"${dscname}" ] || {
> +            # use apt-get source to download sources in DEBSRCDIR
> +            sudo -E chroot --userspec=$( id -u ):$( id -g ) ${rootfs} \
> +                sh -c ' mkdir -p "/deb-src/${1}/${2}" && cd "/deb-src/${1}/${2}" && apt-get -y --download-only --only-source source "$2"="$3" ' download-src "${rootfs_distro}" "${src}" "${version}"
> +        }
>       done
>       ) 9>"${DEBSRCDIR}/${rootfs_distro}.lock"
>
Niedermayr, BENEDIKT March 5, 2025, 5:24 p.m. UTC | #4
On 05.03.25 18:22, 'Niedermayr, BENEDIKT' via isar-users wrote:
> On 05.03.25 14:11, 'Cedric Hombourger' via isar-users wrote:
>> Several calls to dpkg-deb are made for each single .deb file found in
>> downloads to parse individual fields. This approach is terribly slow
>> when a large amount of .deb files are found. Use apt-ftparchive to
>> produce an index of packages that were found and a simple awk script
>> to produce a (sorted) list of source package names and their versions.
>> Also avoid using sed to remove Epoch from the version when we are
>> trying to determine the name of the .dsc file: we instead use a simple
>> POSIX parameter expansion to remove everything up to the first colon
>>
>> Signed-off-by: Cedric Hombourger <cedric.hombourger@siemens.com>
>> ---
>>    meta/classes/deb-dl-dir.bbclass | 62 +++++++++++++++++++--------------
>>    1 file changed, 35 insertions(+), 27 deletions(-)
>>
>> diff --git a/meta/classes/deb-dl-dir.bbclass b/meta/classes/deb-dl-dir.bbclass
>> index 7ebd057e..53ce4538 100644
>> --- a/meta/classes/deb-dl-dir.bbclass
>> +++ b/meta/classes/deb-dl-dir.bbclass
>> @@ -5,23 +5,6 @@
>>    
>>    inherit repository
>>    
>> -is_not_part_of_current_build() {
>> -    local package="$( dpkg-deb --show --showformat '${Package}' "${1}" )"
>> -    local arch="$( dpkg-deb --show --showformat '${Architecture}' "${1}" )"
>> -    local version="$( dpkg-deb --show --showformat '${Version}' "${1}" )"
>> -    # Since we are parsing all the debs in DEBDIR, we can to some extend
>> -    # try to eliminate some debs that are not part of the current multiconfig
>> -    # build using the below method.
>> -    local output="$( grep -xhs ".* status installed ${package}:${arch} ${version}" \
>> -            "${IMAGE_ROOTFS}"/var/log/dpkg.log \
>> -            "${SCHROOT_HOST_DIR}"/var/log/dpkg.log \
>> -            "${SCHROOT_TARGET_DIR}"/var/log/dpkg.log \
>> -            "${SCHROOT_HOST_DIR}"/tmp/dpkg_common.log \
>> -            "${SCHROOT_TARGET_DIR}"/tmp/dpkg_common.log | head -1 )"
>> -
>> -    [ -z "${output}" ]
>> -}
>> -
>>    debsrc_do_mounts() {
>>        sudo -s <<EOSUDO
>>        set -e
>> @@ -54,16 +37,41 @@ debsrc_download() {
>>        ( flock 9
>>        set -e
>>        printenv | grep -q BB_VERBOSE_LOGS && set -x
>> -    find "${rootfs}/var/cache/apt/archives/" -maxdepth 1 -type f -iname '*\.deb' | while read package; do
>> -        is_not_part_of_current_build "${package}" && continue
>> -        local src="$( dpkg-deb --show --showformat '${source:Package}' "${package}" )"
>> -        local version="$( dpkg-deb --show --showformat '${source:Version}' "${package}" )"
>> -        local dscname="$(echo ${src}_${version} | sed -e 's/_[0-9]\+:/_/')"
>> -        local dscfile=$(find "${DEBSRCDIR}"/"${rootfs_distro}" -name "${dscname}.dsc")
>> -        [ -n "$dscfile" ] && continue
>> -
>> -        sudo -E chroot --userspec=$( id -u ):$( id -g ) ${rootfs} \
>> -            sh -c ' mkdir -p "/deb-src/${1}/${2}" && cd "/deb-src/${1}/${2}" && apt-get -y --download-only --only-source source "$2"="$3" ' download-src "${rootfs_distro}" "${src}" "${version}"
> Maybe a pointer to my previous patch [1] which addresses this as well
> but with a different motivation. Your patch would also fix a regression
> that has been introduced with mmdebstrap.
> 
> At least my patch is causing problems.
> 
> [1] https://groups.google.com/g/isar-users/c/IeORW6eiTxI

Ok just saw this [1], so you might be already aware of it.

[1] https://groups.google.com/g/isar-users/c/8QstIaudyts

Regards,
Benedikt
> 
>> +
>> +    # Use apt-ftparchive to scan all .deb files found in the download directory
>> +    # and produce an index that we can "parse" with awk. This is much faster
>> +    # than parsing each .deb file individually using dpkg-deb. Lines from the
>> +    # index we need are:
>> +    #
>> +    #    Package: <binary-name>
>> +    #    Version: <binary-version>
>> +    #    Source: <source-name> (<source-version>)
>> +    #
>> +    # If Source is omitted, then <source-name>=<binary-name> and
>> +    # if <source-version> is not specified then it is <binary-version>.
>> +    # The awk script handles these optional fields. It looks for Size: as a
>> +    # trigger to print the source,version tupple
>> +
>> +    apt-ftparchive --md5=no --sha1=no --sha256=no --sha512=no \
>> +                   -a "${DISTRO_ARCH}" packages \
>> +                   "${rootfs}/var/cache/apt/archives" \
>> +    | awk '/^Package:/ { s=$2; }
>> +           /^Version:/ { v=$2; next }
>> +           /^Source:/ { s=$2; if ($3 ~ /^\(/) v=substr($3, 2, length($3)-2) }
>> +           /^Size:/ { print s, v}' \
>> +    | sort -u \
>> +    | while read src version; do
>> +        # Name of the .dsc file does not include Epoch, remove it before checking
>> +        # if sources were already downloaded. Avoid using sed here to reduce the
>> +        # number of processes being spawned by this function: we assume that the
>> +        # version is correctly formatted and simply strip everything up to the
>> +        # first colon
>> +        dscname="${src}_${version#*:}.dsc"
>> +        [ -f "${DEBSRCDIR}"/"${rootfs_distro}"/"${src}"/"${dscname}" ] || {
>> +            # use apt-get source to download sources in DEBSRCDIR
>> +            sudo -E chroot --userspec=$( id -u ):$( id -g ) ${rootfs} \
>> +                sh -c ' mkdir -p "/deb-src/${1}/${2}" && cd "/deb-src/${1}/${2}" && apt-get -y --download-only --only-source source "$2"="$3" ' download-src "${rootfs_distro}" "${src}" "${version}"
>> +        }
>>        done
>>        ) 9>"${DEBSRCDIR}/${rootfs_distro}.lock"
>>    
>
Srinuvasan Arjunan March 10, 2025, 11:06 a.m. UTC | #5
On Wednesday, March 5, 2025 at 6:42:05 PM UTC+5:30 Cedric Hombourger wrote:

Several calls to dpkg-deb are made for each single .deb file found in 
downloads to parse individual fields. This approach is terribly slow 
when a large amount of .deb files are found. Use apt-ftparchive to 
produce an index of packages that were found and a simple awk script 
to produce a (sorted) list of source package names and their versions. 
Also avoid using sed to remove Epoch from the version when we are 
trying to determine the name of the .dsc file: we instead use a simple 
POSIX parameter expansion to remove everything up to the first colon 

Signed-off-by: Cedric Hombourger <cedric.h...@siemens.com> 
--- 
meta/classes/deb-dl-dir.bbclass | 62 +++++++++++++++++++-------------- 
1 file changed, 35 insertions(+), 27 deletions(-) 

diff --git a/meta/classes/deb-dl-dir.bbclass 
b/meta/classes/deb-dl-dir.bbclass 
index 7ebd057e..53ce4538 100644 
--- a/meta/classes/deb-dl-dir.bbclass 
+++ b/meta/classes/deb-dl-dir.bbclass 
@@ -5,23 +5,6 @@ 

inherit repository 

-is_not_part_of_current_build() { 
- local package="$( dpkg-deb --show --showformat '${Package}' "${1}" )" 
- local arch="$( dpkg-deb --show --showformat '${Architecture}' "${1}" )" 
- local version="$( dpkg-deb --show --showformat '${Version}' "${1}" )" 
- # Since we are parsing all the debs in DEBDIR, we can to some extend 
- # try to eliminate some debs that are not part of the current multiconfig 
- # build using the below method. 
- local output="$( grep -xhs ".* status installed ${package}:${arch} 
${version}" \ 
- "${IMAGE_ROOTFS}"/var/log/dpkg.log \ 
- "${SCHROOT_HOST_DIR}"/var/log/dpkg.log \ 
- "${SCHROOT_TARGET_DIR}"/var/log/dpkg.log \ 
- "${SCHROOT_HOST_DIR}"/tmp/dpkg_common.log \ 
- "${SCHROOT_TARGET_DIR}"/tmp/dpkg_common.log | head -1 )" 
- 
- [ -z "${output}" ] 
-} 
- 
debsrc_do_mounts() { 
sudo -s <<EOSUDO 
set -e 
@@ -54,16 +37,41 @@ debsrc_download() { 
( flock 9 
set -e 
printenv | grep -q BB_VERBOSE_LOGS && set -x 
- find "${rootfs}/var/cache/apt/archives/" -maxdepth 1 -type f -iname 
'*\.deb' | while read package; do 
- is_not_part_of_current_build "${package}" && continue 
- local src="$( dpkg-deb --show --showformat '${source:Package}' 
"${package}" )" 
- local version="$( dpkg-deb --show --showformat '${source:Version}' 
"${package}" )" 
- local dscname="$(echo ${src}_${version} | sed -e 's/_[0-9]\+:/_/')" 
- local dscfile=$(find "${DEBSRCDIR}"/"${rootfs_distro}" -name 
"${dscname}.dsc") 
- [ -n "$dscfile" ] && continue 
- 
- sudo -E chroot --userspec=$( id -u ):$( id -g ) ${rootfs} \ 
- sh -c ' mkdir -p "/deb-src/${1}/${2}" && cd "/deb-src/${1}/${2}" && 
apt-get -y --download-only --only-source source "$2"="$3" ' download-src 
"${rootfs_distro}" "${src}" "${version}" 
+ 
+ # Use apt-ftparchive to scan all .deb files found in the download 
directory 
+ # and produce an index that we can "parse" with awk. This is much faster 
+ # than parsing each .deb file individually using dpkg-deb. Lines from the 
+ # index we need are: 
+ # 
+ # Package: <binary-name> 
+ # Version: <binary-version> 
+ # Source: <source-name> (<source-version>) 
+ # 
+ # If Source is omitted, then <source-name>=<binary-name> and 
+ # if <source-version> is not specified then it is <binary-version>. 
+ # The awk script handles these optional fields. It looks for Size: as a 
+ # trigger to print the source,version tupple 
+ 
+ apt-ftparchive --md5=no --sha1=no --sha256=no --sha512=no \ 
+ -a "${DISTRO_ARCH}" packages \


  Hi Cedric,

  I took this patch for my deb-src-caching issue [1]; I am now able to
download deb-src for bootstrap and image related packages.
  The only missing part is the imager_install related packages; I am going to
send patches based on your patch.

  However, I found one issue with armhf base-apt builds in ISAR: the
help2man and texinfo deb-src packages are missing.
  This is because when we build the index using apt-ftparchive --md5=no --sha1=no
--sha256=no --sha512=no -a "${DISTRO_ARCH}",
  we use -a ${DISTRO_ARCH}, which in this case is armhf, but the help2man and
texinfo packages are only available for the amd64 arch (this might
  be due to the ISAR_CROSS_COMPILE configuration), not armhf. Hence the index does not
contain those packages, and for this reason we are not able to
  download the source packages for them.

   I would suggest removing the -a "${DISTRO_ARCH}" option; we get a
de-duplicated final list from sort -u anyway.
   I validated this without the -a option and it works as expected.

   [1]: https://groups.google.com/g/isar-users/c/8QstIaudyts

 Please share your thoughts.


+ "${rootfs}/var/cache/apt/archives" \ 
+ | awk '/^Package:/ { s=$2; } 
+ /^Version:/ { v=$2; next } 
+ /^Source:/ { s=$2; if ($3 ~ /^\(/) v=substr($3, 2, length($3)-2) } 
+ /^Size:/ { print s, v}' \ 
+ | sort -u \ 
+ | while read src version; do 
+ # Name of the .dsc file does not include Epoch, remove it before checking 
+ # if sources were already downloaded. Avoid using sed here to reduce the 
+ # number of processes being spawned by this function: we assume that the 
+ # version is correctly formatted and simply strip everything up to the 
+ # first colon 
+ dscname="${src}_${version#*:}.dsc" 
+ [ -f "${DEBSRCDIR}"/"${rootfs_distro}"/"${src}"/"${dscname}" ] || { 
+ # use apt-get source to download sources in DEBSRCDIR 
+ sudo -E chroot --userspec=$( id -u ):$( id -g ) ${rootfs} \ 
+ sh -c ' mkdir -p "/deb-src/${1}/${2}" && cd "/deb-src/${1}/${2}" && 
apt-get -y --download-only --only-source source "$2"="$3" ' download-src 
"${rootfs_distro}" "${src}" "${version}" 
+ } 
done 
) 9>"${DEBSRCDIR}/${rootfs_distro}.lock"
Uladzimir Bely March 27, 2025, 10:34 a.m. UTC | #6
On Sat, 2025-03-22 at 07:15 +0100, Cedric Hombourger wrote:
> Changes since v1:
>  * v1 had is_not_part_of_current_build() removed. It turns out that
>    we better check what .deb files get imported in the apt cache
>    of our rootfs to make sure that we only try to fetch sources for
>    packages we know. This is now achieved with apt-cache dumpavail
>    to obtain a list of known source packages.
> 
> Cedric Hombourger (1):
>   deb-dl-dir: remove excessive calls to dpkg-deb in debsrc_download
> 
>  meta/classes/deb-dl-dir.bbclass | 78 +++++++++++++++++++++----------
> --
>  1 file changed, 51 insertions(+), 27 deletions(-)

Applied to next, thanks.

Patch

diff --git a/meta/classes/deb-dl-dir.bbclass b/meta/classes/deb-dl-dir.bbclass
index 7ebd057e..53ce4538 100644
--- a/meta/classes/deb-dl-dir.bbclass
+++ b/meta/classes/deb-dl-dir.bbclass
@@ -5,23 +5,6 @@ 
 
 inherit repository
 
-is_not_part_of_current_build() {
-    local package="$( dpkg-deb --show --showformat '${Package}' "${1}" )"
-    local arch="$( dpkg-deb --show --showformat '${Architecture}' "${1}" )"
-    local version="$( dpkg-deb --show --showformat '${Version}' "${1}" )"
-    # Since we are parsing all the debs in DEBDIR, we can to some extend
-    # try to eliminate some debs that are not part of the current multiconfig
-    # build using the below method.
-    local output="$( grep -xhs ".* status installed ${package}:${arch} ${version}" \
-            "${IMAGE_ROOTFS}"/var/log/dpkg.log \
-            "${SCHROOT_HOST_DIR}"/var/log/dpkg.log \
-            "${SCHROOT_TARGET_DIR}"/var/log/dpkg.log \
-            "${SCHROOT_HOST_DIR}"/tmp/dpkg_common.log \
-            "${SCHROOT_TARGET_DIR}"/tmp/dpkg_common.log | head -1 )"
-
-    [ -z "${output}" ]
-}
-
 debsrc_do_mounts() {
     sudo -s <<EOSUDO
     set -e
@@ -54,16 +37,41 @@  debsrc_download() {
     ( flock 9
     set -e
     printenv | grep -q BB_VERBOSE_LOGS && set -x
-    find "${rootfs}/var/cache/apt/archives/" -maxdepth 1 -type f -iname '*\.deb' | while read package; do
-        is_not_part_of_current_build "${package}" && continue
-        local src="$( dpkg-deb --show --showformat '${source:Package}' "${package}" )"
-        local version="$( dpkg-deb --show --showformat '${source:Version}' "${package}" )"
-        local dscname="$(echo ${src}_${version} | sed -e 's/_[0-9]\+:/_/')"
-        local dscfile=$(find "${DEBSRCDIR}"/"${rootfs_distro}" -name "${dscname}.dsc")
-        [ -n "$dscfile" ] && continue
-
-        sudo -E chroot --userspec=$( id -u ):$( id -g ) ${rootfs} \
-            sh -c ' mkdir -p "/deb-src/${1}/${2}" && cd "/deb-src/${1}/${2}" && apt-get -y --download-only --only-source source "$2"="$3" ' download-src "${rootfs_distro}" "${src}" "${version}"
+
+    # Use apt-ftparchive to scan all .deb files found in the download directory
+    # and produce an index that we can "parse" with awk. This is much faster
+    # than parsing each .deb file individually using dpkg-deb. Lines from the
+    # index we need are:
+    #
+    #    Package: <binary-name>
+    #    Version: <binary-version>
+    #    Source: <source-name> (<source-version>)
+    #
+    # If Source is omitted, then <source-name>=<binary-name> and
+    # if <source-version> is not specified then it is <binary-version>.
+    # The awk script handles these optional fields. It looks for Size: as a
+    # trigger to print the source,version tupple
+
+    apt-ftparchive --md5=no --sha1=no --sha256=no --sha512=no \
+                   -a "${DISTRO_ARCH}" packages \
+                   "${rootfs}/var/cache/apt/archives" \
+    | awk '/^Package:/ { s=$2; }
+           /^Version:/ { v=$2; next }
+           /^Source:/ { s=$2; if ($3 ~ /^\(/) v=substr($3, 2, length($3)-2) }
+           /^Size:/ { print s, v}' \
+    | sort -u \
+    | while read src version; do
+        # Name of the .dsc file does not include Epoch, remove it before checking
+        # if sources were already downloaded. Avoid using sed here to reduce the
+        # number of processes being spawned by this function: we assume that the
+        # version is correctly formatted and simply strip everything up to the
+        # first colon
+        dscname="${src}_${version#*:}.dsc"
+        [ -f "${DEBSRCDIR}"/"${rootfs_distro}"/"${src}"/"${dscname}" ] || {
+            # use apt-get source to download sources in DEBSRCDIR
+            sudo -E chroot --userspec=$( id -u ):$( id -g ) ${rootfs} \
+                sh -c ' mkdir -p "/deb-src/${1}/${2}" && cd "/deb-src/${1}/${2}" && apt-get -y --download-only --only-source source "$2"="$3" ' download-src "${rootfs_distro}" "${src}" "${version}"
+        }
     done
     ) 9>"${DEBSRCDIR}/${rootfs_distro}.lock"