isar-sstate: reupload utilized files older than max-age

Message ID 20240723122703.1210290-1-tobias.schaffner@siemens.com
State Under Review
Headers show
Series isar-sstate: reupload utilized files older than max-age | expand

Commit Message

Tobias Schaffner July 23, 2024, 12:27 p.m. UTC
Currently, the Isar-sstate script deletes all files older than max-age
during a clean call, regardless of whether they are still in use. Given
that S3 buckets do not offer a means to update timestamps other than
through a reupload, this commit introduces a change to reupload all
files utilized by the current build if they are older than max-age
during an isar-sstate upload call.

Signed-off-by: Tobias Schaffner <tobias.schaffner@siemens.com>
---
 scripts/isar-sstate | 57 ++++++++++++++++++++++++++++++---------------
 1 file changed, 38 insertions(+), 19 deletions(-)

Comments

MOESSBAUER, Felix July 24, 2024, 12:38 p.m. UTC | #1
On Tue, 2024-07-23 at 14:27 +0200, Tobias Schaffner wrote:
> Currently, the Isar-sstate script deletes all files older than max-
> age
> during a clean call, regardless of whether they are still in use.
> Given
> that S3 buckets do not offer a means to update timestamps other than
> through a reupload, this commit introduces a change to reupload all
> files utilized by the current build if they are older than max-age
> during an isar-sstate upload call.

Hi, I'm wondering if it is sufficient to just re-upload the signature,
but not the file itself. Otherwise we "punish" good caching by a lot of
traffic between the build servers and S3.

CC'ing Adriaan.

Felix

> 
> Signed-off-by: Tobias Schaffner <tobias.schaffner@siemens.com>
> ---
>  scripts/isar-sstate | 57 ++++++++++++++++++++++++++++++-------------
> --
>  1 file changed, 38 insertions(+), 19 deletions(-)
> 
> diff --git a/scripts/isar-sstate b/scripts/isar-sstate
> index 4ea38bc8..a60f50dd 100755
> --- a/scripts/isar-sstate
> +++ b/scripts/isar-sstate
> @@ -32,6 +32,11 @@ and supports three remote backends (filesystem,
> http/webdav, AWS S3).
>  The `upload` command pushes the contents of a local sstate cache to
> the
>  remote location, uploading all files that don't already exist on the
> remote.
>  
> +`--max-age` specifies after which time artifacts in the cache should
> be
> +refreshed. Files older than this age will be reuploaded to update
> its timestamp.
> +This value should be chosen to be smaller than the clean max-age to
> ensure that
> +the artifacts are refreshed before they are cleaned.
> +
>  ### clean
>  
>  The `clean` command deletes old artifacts from the remote cache. It
> takes two
> @@ -179,6 +184,17 @@ StampsRegex = re.compile(
>     
> r"(.*/)?(?P<arch>[^/]+)/(?P<pn>[^/]+)/([^/]+)\.do_(?P<task>[^/]+)\.(?
> P<suffix>sigdata)\.(?P<hash>[0-9a-f]{64})"
>  )
>  
> +def convert_duration_string_to_seconds(x):
> +    seconds_per_unit = {'s': 1, 'm': 60, 'h': 3600, 'd': 86400, 'w':
> 604800}
> +    m = re.match(r'^(\d+)(w|d|h|m|s)?', x)
> +    if m is None:
> +        return None
> +    unit = m.group(2)
> +    if unit is None:
> +        print("WARNING: MAX_AGE without unit, assuming 'days'")
> +        unit = 'd'
> +    return int(m.group(1)) * seconds_per_unit[unit]
> +
>  class SstateTargetBase(object):
>      def __init__(self, path, cached=False):
>          """Constructor
> @@ -598,7 +614,7 @@ def arguments():
>          '-v', '--verbose', default=False, action='store_true')
>      parser.add_argument(
>          '--max-age', type=str, default='1d',
> -        help="clean: remove archive files older than MAX_AGE (a
> number followed by w|d|h|m|s)")
> +        help="clean/upload: remove/reupload archive files older than
> MAX_AGE (a number followed by w|d|h|m|s)")
>      parser.add_argument(
>          '--max-sig-age', type=str, default=None,
>          help="clean: remove siginfo files older than MAX_SIG_AGE
> (defaults to MAX_AGE)")
> @@ -632,7 +648,7 @@ def arguments():
>      return args
>  
>  
> -def sstate_upload(source, target, verbose, **kwargs):
> +def sstate_upload(source, target, verbose, max_age="1d", **kwargs):
>      if not os.path.isdir(source):
>          print(f"WARNING: source {source} does not exist. Not
> uploading.")
>          return 0
> @@ -640,23 +656,37 @@ def sstate_upload(source, target, verbose,
> **kwargs):
>          print(f"WARNING: target {target} does not exist and could
> not be created. Not uploading.")
>          return 0
>  
> +    print(f"INFO: scanning {target}")
> +    all_files = target.list_all()
> +
> +    def target_file_present(file_path):
> +        for file in all_files:
> +            if file.path == file_path:
> +                return file
> +
>      print(f"INFO: uploading {source} to {target}")
>      os.chdir(source)
> -    upload, exists = [], []
> +    upload, exists, update = [], [], []
>      for subdir, dirs, files in os.walk('.'):
>          target_dirs = subdir.split('/')[1:]
>          for f in files:
>              file_path = (('/'.join(target_dirs) + '/') if
> len(target_dirs) > 0 else '') + f
> -            if target.exists(file_path):
> +            target_file = target_file_present(file_path)
> +            if target_file:
>                  if verbose:
>                      print(f"[EXISTS] {file_path}")
>                  exists.append(file_path)
> +                if target_file.age >
> convert_duration_string_to_seconds(max_age):
> +                    update.append((file_path, target_dirs))
> +                    if verbose:
> +                        print(f"[UPDATE] {file_path}")
>              else:
>                  upload.append((file_path, target_dirs))
> -    upload_gb = (sum([os.path.getsize(f[0]) for f in upload]) /
> 1024.0 / 1024.0 / 1024.0)
> +    upload_gb = (sum([os.path.getsize(f[0]) for f in (upload +
> update)]) / 1024.0 / 1024.0 / 1024.0)
>      print(f"INFO: uploading {len(upload)} files ({upload_gb:.02f}
> GB)")
>      print(f"INFO: {len(exists)} files already present on target")
> -    for file_path, target_dirs in upload:
> +    print(f"INFO: {len(update)} files will be refreshed")
> +    for file_path, target_dirs in upload + update:
>          if verbose:
>              print(f"[UPLOAD] {file_path}")
>          target.mkdir('/'.join(target_dirs))
> @@ -665,24 +695,13 @@ def sstate_upload(source, target, verbose,
> **kwargs):
>  
>  
>  def sstate_clean(target, max_age, max_sig_age, verbose, **kwargs):
> -    def convert_to_seconds(x):
> -        seconds_per_unit = {'s': 1, 'm': 60, 'h': 3600, 'd': 86400,
> 'w': 604800}
> -        m = re.match(r'^(\d+)(w|d|h|m|s)?', x)
> -        if m is None:
> -            return None
> -        unit = m.group(2)
> -        if unit is None:
> -            print("WARNING: MAX_AGE without unit, assuming 'days'")
> -            unit = 'd'
> -        return int(m.group(1)) * seconds_per_unit[unit]
> -
> -    max_age_seconds = convert_to_seconds(max_age)
> +    max_age_seconds = convert_duration_string_to_seconds(max_age)
>      if max_age_seconds is None:
>          print(f"ERROR: cannot parse MAX_AGE '{max_age}', needs to be
> a number followed by w|d|h|m|s")
>          return 1
>      if max_sig_age is None:
>          max_sig_age = max_age
> -    max_sig_age_seconds = max(max_age_seconds,
> convert_to_seconds(max_sig_age))
> +    max_sig_age_seconds = max(max_age_seconds,
> convert_duration_string_to_seconds(max_sig_age))
>  
>      if not target.exists():
>          print(f"WARNING: cannot access target {target}. Nothing to
> clean.")
Tobias Schaffner July 25, 2024, 7:18 a.m. UTC | #2
On 24.07.24 14:38, Moessbauer, Felix (T CED OES-DE) wrote:
> On Tue, 2024-07-23 at 14:27 +0200, Tobias Schaffner wrote:
>> Currently, the Isar-sstate script deletes all files older than max-
>> age
>> during a clean call, regardless of whether they are still in use.
>> Given
>> that S3 buckets do not offer a means to update timestamps other than
>> through a reupload, this commit introduces a change to reupload all
>> files utilized by the current build if they are older than max-age
>> during an isar-sstate upload call.
> 
> Hi, I'm wondering if it is sufficient to just re-upload the signature,
> but not the file itself. Otherwise we "punish" good caching by a lot of
> traffic between the build servers and S3.

It depends on your CI structure, but this does not necessarily introduce 
more traffic. The idea is to do a re-upload when the file would be cleaned.

At the moment a common pattern for isar-sstate usage is:
clean -> download needed artifacts that are available -> rebuild 
artifact x that is still needed but was cleaned -> upload x

This change allows you to:
download needed artifacts that are available -> reupload x that would be 
cleaned -> clean

In both cases x will have to be uploaded.

Best,
Tobias

> CC'ing Adriaan.
> 
> Felix
> 
>>
>> Signed-off-by: Tobias Schaffner <tobias.schaffner@siemens.com>
>> ---
>>   scripts/isar-sstate | 57 ++++++++++++++++++++++++++++++-------------
>> --
>>   1 file changed, 38 insertions(+), 19 deletions(-)
>>
>> diff --git a/scripts/isar-sstate b/scripts/isar-sstate
>> index 4ea38bc8..a60f50dd 100755
>> --- a/scripts/isar-sstate
>> +++ b/scripts/isar-sstate
>> @@ -32,6 +32,11 @@ and supports three remote backends (filesystem,
>> http/webdav, AWS S3).
>>   The `upload` command pushes the contents of a local sstate cache to
>> the
>>   remote location, uploading all files that don't already exist on the
>> remote.
>>   
>> +`--max-age` specifies after which time artifacts in the cache should
>> be
>> +refreshed. Files older than this age will be reuploaded to update
>> its timestamp.
>> +This value should be chosen to be smaller than the clean max-age to
>> ensure that
>> +the artifacts are refreshed before they are cleaned.
>> +
>>   ### clean
>>   
>>   The `clean` command deletes old artifacts from the remote cache. It
>> takes two
>> @@ -179,6 +184,17 @@ StampsRegex = re.compile(
>>      
>> r"(.*/)?(?P<arch>[^/]+)/(?P<pn>[^/]+)/([^/]+)\.do_(?P<task>[^/]+)\.(?
>> P<suffix>sigdata)\.(?P<hash>[0-9a-f]{64})"
>>   )
>>   
>> +def convert_duration_string_to_seconds(x):
>> +    seconds_per_unit = {'s': 1, 'm': 60, 'h': 3600, 'd': 86400, 'w':
>> 604800}
>> +    m = re.match(r'^(\d+)(w|d|h|m|s)?', x)
>> +    if m is None:
>> +        return None
>> +    unit = m.group(2)
>> +    if unit is None:
>> +        print("WARNING: MAX_AGE without unit, assuming 'days'")
>> +        unit = 'd'
>> +    return int(m.group(1)) * seconds_per_unit[unit]
>> +
>>   class SstateTargetBase(object):
>>       def __init__(self, path, cached=False):
>>           """Constructor
>> @@ -598,7 +614,7 @@ def arguments():
>>           '-v', '--verbose', default=False, action='store_true')
>>       parser.add_argument(
>>           '--max-age', type=str, default='1d',
>> -        help="clean: remove archive files older than MAX_AGE (a
>> number followed by w|d|h|m|s)")
>> +        help="clean/upload: remove/reupload archive files older than
>> MAX_AGE (a number followed by w|d|h|m|s)")
>>       parser.add_argument(
>>           '--max-sig-age', type=str, default=None,
>>           help="clean: remove siginfo files older than MAX_SIG_AGE
>> (defaults to MAX_AGE)")
>> @@ -632,7 +648,7 @@ def arguments():
>>       return args
>>   
>>   
>> -def sstate_upload(source, target, verbose, **kwargs):
>> +def sstate_upload(source, target, verbose, max_age="1d", **kwargs):
>>       if not os.path.isdir(source):
>>           print(f"WARNING: source {source} does not exist. Not
>> uploading.")
>>           return 0
>> @@ -640,23 +656,37 @@ def sstate_upload(source, target, verbose,
>> **kwargs):
>>           print(f"WARNING: target {target} does not exist and could
>> not be created. Not uploading.")
>>           return 0
>>   
>> +    print(f"INFO: scanning {target}")
>> +    all_files = target.list_all()
>> +
>> +    def target_file_present(file_path):
>> +        for file in all_files:
>> +            if file.path == file_path:
>> +                return file
>> +
>>       print(f"INFO: uploading {source} to {target}")
>>       os.chdir(source)
>> -    upload, exists = [], []
>> +    upload, exists, update = [], [], []
>>       for subdir, dirs, files in os.walk('.'):
>>           target_dirs = subdir.split('/')[1:]
>>           for f in files:
>>               file_path = (('/'.join(target_dirs) + '/') if
>> len(target_dirs) > 0 else '') + f
>> -            if target.exists(file_path):
>> +            target_file = target_file_present(file_path)
>> +            if target_file:
>>                   if verbose:
>>                       print(f"[EXISTS] {file_path}")
>>                   exists.append(file_path)
>> +                if target_file.age >
>> convert_duration_string_to_seconds(max_age):
>> +                    update.append((file_path, target_dirs))
>> +                    if verbose:
>> +                        print(f"[UPDATE] {file_path}")
>>               else:
>>                   upload.append((file_path, target_dirs))
>> -    upload_gb = (sum([os.path.getsize(f[0]) for f in upload]) /
>> 1024.0 / 1024.0 / 1024.0)
>> +    upload_gb = (sum([os.path.getsize(f[0]) for f in (upload +
>> update)]) / 1024.0 / 1024.0 / 1024.0)
>>       print(f"INFO: uploading {len(upload)} files ({upload_gb:.02f}
>> GB)")
>>       print(f"INFO: {len(exists)} files already present on target")
>> -    for file_path, target_dirs in upload:
>> +    print(f"INFO: {len(update)} files will be refreshed")
>> +    for file_path, target_dirs in upload + update:
>>           if verbose:
>>               print(f"[UPLOAD] {file_path}")
>>           target.mkdir('/'.join(target_dirs))
>> @@ -665,24 +695,13 @@ def sstate_upload(source, target, verbose,
>> **kwargs):
>>   
>>   
>>   def sstate_clean(target, max_age, max_sig_age, verbose, **kwargs):
>> -    def convert_to_seconds(x):
>> -        seconds_per_unit = {'s': 1, 'm': 60, 'h': 3600, 'd': 86400,
>> 'w': 604800}
>> -        m = re.match(r'^(\d+)(w|d|h|m|s)?', x)
>> -        if m is None:
>> -            return None
>> -        unit = m.group(2)
>> -        if unit is None:
>> -            print("WARNING: MAX_AGE without unit, assuming 'days'")
>> -            unit = 'd'
>> -        return int(m.group(1)) * seconds_per_unit[unit]
>> -
>> -    max_age_seconds = convert_to_seconds(max_age)
>> +    max_age_seconds = convert_duration_string_to_seconds(max_age)
>>       if max_age_seconds is None:
>>           print(f"ERROR: cannot parse MAX_AGE '{max_age}', needs to be
>> a number followed by w|d|h|m|s")
>>           return 1
>>       if max_sig_age is None:
>>           max_sig_age = max_age
>> -    max_sig_age_seconds = max(max_age_seconds,
>> convert_to_seconds(max_sig_age))
>> +    max_sig_age_seconds = max(max_age_seconds,
>> convert_duration_string_to_seconds(max_sig_age))
>>   
>>       if not target.exists():
>>           print(f"WARNING: cannot access target {target}. Nothing to
>> clean.")
>
MOESSBAUER, Felix July 25, 2024, 8:20 a.m. UTC | #3
On Thu, 2024-07-25 at 09:18 +0200, Tobias Schaffner wrote:
> On 24.07.24 14:38, Moessbauer, Felix (T CED OES-DE) wrote:
> > On Tue, 2024-07-23 at 14:27 +0200, Tobias Schaffner wrote:
> > > Currently, the Isar-sstate script deletes all files older than
> > > max-
> > > age
> > > during a clean call, regardless of whether they are still in use.
> > > Given
> > > that S3 buckets do not offer a means to update timestamps other
> > > than
> > > through a reupload, this commit introduces a change to reupload
> > > all
> > > files utilized by the current build if they are older than max-
> > > age
> > > during an isar-sstate upload call.
> > 
> > Hi, I'm wondering if it is sufficient to just re-upload the
> > signature,
> > but not the file itself. Otherwise we "punish" good caching by a
> > lot of
> > traffic between the build servers and S3.
> 
> It depends on your CI structure but this does not necessarily
> introduce 
> more traffic. The Idea is to do a re-upload when the file would be
> cleaned.

Ok, got it. So the new logic behaves similarly to an LRU cache.

> 
> A the moment a common pattern for isar-sstate usage is:
> clean -> download needed artifacts that are available -> rebuild 
> artifact x that is still needed but was cleaned -> upload x
> 
> This change allows you to:
> download needed artifacts that are available -> reupload x that would
> be 
> cleaned -> clean

This sounds reasonable. We can already apply this patch to a couple of
our CI systems to see how it behaves in practice.

Felix

> 
> In both cases x will have to be uploaded.
> 
> Best,
> Tobias
> 
> > CC'ing Adriaan.
> > 
> > Felix
> > 
> > > 
> > > Signed-off-by: Tobias Schaffner <tobias.schaffner@siemens.com>
> > > ---
> > >   scripts/isar-sstate | 57 ++++++++++++++++++++++++++++++--------
> > > -----
> > > --
> > >   1 file changed, 38 insertions(+), 19 deletions(-)
> > > 
> > > diff --git a/scripts/isar-sstate b/scripts/isar-sstate
> > > index 4ea38bc8..a60f50dd 100755
> > > --- a/scripts/isar-sstate
> > > +++ b/scripts/isar-sstate
> > > @@ -32,6 +32,11 @@ and supports three remote backends
> > > (filesystem,
> > > http/webdav, AWS S3).
> > >   The `upload` command pushes the contents of a local sstate
> > > cache to
> > > the
> > >   remote location, uploading all files that don't already exist
> > > on the
> > > remote.
> > >   
> > > +`--max-age` specifies after which time artifacts in the cache
> > > should
> > > be
> > > +refreshed. Files older than this age will be reuploaded to
> > > update
> > > its timestamp.
> > > +This value should be chosen to be smaller than the clean max-age
> > > to
> > > ensure that
> > > +the artifacts are refreshed before they are cleaned.
> > > +
> > >   ### clean
> > >   
> > >   The `clean` command deletes old artifacts from the remote
> > > cache. It
> > > takes two
> > > @@ -179,6 +184,17 @@ StampsRegex = re.compile(
> > >      
> > > r"(.*/)?(?P<arch>[^/]+)/(?P<pn>[^/]+)/([^/]+)\.do_(?P<task>[^/]+)
> > > \.(?
> > > P<suffix>sigdata)\.(?P<hash>[0-9a-f]{64})"
> > >   )
> > >   
> > > +def convert_duration_string_to_seconds(x):
> > > +    seconds_per_unit = {'s': 1, 'm': 60, 'h': 3600, 'd': 86400,
> > > 'w':
> > > 604800}
> > > +    m = re.match(r'^(\d+)(w|d|h|m|s)?', x)
> > > +    if m is None:
> > > +        return None
> > > +    unit = m.group(2)
> > > +    if unit is None:
> > > +        print("WARNING: MAX_AGE without unit, assuming 'days'")
> > > +        unit = 'd'
> > > +    return int(m.group(1)) * seconds_per_unit[unit]
> > > +
> > >   class SstateTargetBase(object):
> > >       def __init__(self, path, cached=False):
> > >           """Constructor
> > > @@ -598,7 +614,7 @@ def arguments():
> > >           '-v', '--verbose', default=False, action='store_true')
> > >       parser.add_argument(
> > >           '--max-age', type=str, default='1d',
> > > -        help="clean: remove archive files older than MAX_AGE (a
> > > number followed by w|d|h|m|s)")
> > > +        help="clean/upload: remove/reupload archive files older
> > > than
> > > MAX_AGE (a number followed by w|d|h|m|s)")
> > >       parser.add_argument(
> > >           '--max-sig-age', type=str, default=None,
> > >           help="clean: remove siginfo files older than
> > > MAX_SIG_AGE
> > > (defaults to MAX_AGE)")
> > > @@ -632,7 +648,7 @@ def arguments():
> > >       return args
> > >   
> > >   
> > > -def sstate_upload(source, target, verbose, **kwargs):
> > > +def sstate_upload(source, target, verbose, max_age="1d",
> > > **kwargs):
> > >       if not os.path.isdir(source):
> > >           print(f"WARNING: source {source} does not exist. Not
> > > uploading.")
> > >           return 0
> > > @@ -640,23 +656,37 @@ def sstate_upload(source, target, verbose,
> > > **kwargs):
> > >           print(f"WARNING: target {target} does not exist and
> > > could
> > > not be created. Not uploading.")
> > >           return 0
> > >   
> > > +    print(f"INFO: scanning {target}")
> > > +    all_files = target.list_all()
> > > +
> > > +    def target_file_present(file_path):
> > > +        for file in all_files:
> > > +            if file.path == file_path:
> > > +                return file
> > > +
> > >       print(f"INFO: uploading {source} to {target}")
> > >       os.chdir(source)
> > > -    upload, exists = [], []
> > > +    upload, exists, update = [], [], []
> > >       for subdir, dirs, files in os.walk('.'):
> > >           target_dirs = subdir.split('/')[1:]
> > >           for f in files:
> > >               file_path = (('/'.join(target_dirs) + '/') if
> > > len(target_dirs) > 0 else '') + f
> > > -            if target.exists(file_path):
> > > +            target_file = target_file_present(file_path)
> > > +            if target_file:
> > >                   if verbose:
> > >                       print(f"[EXISTS] {file_path}")
> > >                   exists.append(file_path)
> > > +                if target_file.age >
> > > convert_duration_string_to_seconds(max_age):
> > > +                    update.append((file_path, target_dirs))
> > > +                    if verbose:
> > > +                        print(f"[UPDATE] {file_path}")
> > >               else:
> > >                   upload.append((file_path, target_dirs))
> > > -    upload_gb = (sum([os.path.getsize(f[0]) for f in upload]) /
> > > 1024.0 / 1024.0 / 1024.0)
> > > +    upload_gb = (sum([os.path.getsize(f[0]) for f in (upload +
> > > update)]) / 1024.0 / 1024.0 / 1024.0)
> > >       print(f"INFO: uploading {len(upload)} files
> > > ({upload_gb:.02f}
> > > GB)")
> > >       print(f"INFO: {len(exists)} files already present on
> > > target")
> > > -    for file_path, target_dirs in upload:
> > > +    print(f"INFO: {len(update)} files will be refreshed")
> > > +    for file_path, target_dirs in upload + update:
> > >           if verbose:
> > >               print(f"[UPLOAD] {file_path}")
> > >           target.mkdir('/'.join(target_dirs))
> > > @@ -665,24 +695,13 @@ def sstate_upload(source, target, verbose,
> > > **kwargs):
> > >   
> > >   
> > >   def sstate_clean(target, max_age, max_sig_age, verbose,
> > > **kwargs):
> > > -    def convert_to_seconds(x):
> > > -        seconds_per_unit = {'s': 1, 'm': 60, 'h': 3600, 'd':
> > > 86400,
> > > 'w': 604800}
> > > -        m = re.match(r'^(\d+)(w|d|h|m|s)?', x)
> > > -        if m is None:
> > > -            return None
> > > -        unit = m.group(2)
> > > -        if unit is None:
> > > -            print("WARNING: MAX_AGE without unit, assuming
> > > 'days'")
> > > -            unit = 'd'
> > > -        return int(m.group(1)) * seconds_per_unit[unit]
> > > -
> > > -    max_age_seconds = convert_to_seconds(max_age)
> > > +    max_age_seconds =
> > > convert_duration_string_to_seconds(max_age)
> > >       if max_age_seconds is None:
> > >           print(f"ERROR: cannot parse MAX_AGE '{max_age}', needs
> > > to be
> > > a number followed by w|d|h|m|s")
> > >           return 1
> > >       if max_sig_age is None:
> > >           max_sig_age = max_age
> > > -    max_sig_age_seconds = max(max_age_seconds,
> > > convert_to_seconds(max_sig_age))
> > > +    max_sig_age_seconds = max(max_age_seconds,
> > > convert_duration_string_to_seconds(max_sig_age))
> > >   
> > >       if not target.exists():
> > >           print(f"WARNING: cannot access target {target}. Nothing
> > > to
> > > clean.")
> >
Schmidt, Adriaan July 25, 2024, 9:34 a.m. UTC | #4
Tobias Schaffner,Dienstag, 23. Juli 2024 14:27:
> Currently, the Isar-sstate script deletes all files older than max-age
> during a clean call, regardless of whether they are still in use. Given

Keep in mind that this is deliberate! The problem is that we're building
against a changing apt mirror, so we have outdated package lists,
and our cache artifacts become useless.

Now, if we're building against snapshots, that problem goes away, and
then it would be good to clean considering latest-use time instead of
upload time.
In that case I guess all we need is a switch to the "upload" command,
that makes it "refresh" the files if they already exist, either by re-uploading,
or with some other tricks (I found some hints that copying the file in S3,
and then copying it back might reset the timestamp... would have to try
that out).

Adriaan

> that S3 buckets do not offer a means to update timestamps other than
> through a reupload, this commit introduces a change to reupload all
> files utilized by the current build if they are older than max-age
> during an isar-sstate upload call.
> 
> Signed-off-by: Tobias Schaffner <tobias.schaffner@siemens.com>
> ---
>  scripts/isar-sstate | 57 ++++++++++++++++++++++++++++++---------------
>  1 file changed, 38 insertions(+), 19 deletions(-)
> 
> diff --git a/scripts/isar-sstate b/scripts/isar-sstate
> index 4ea38bc8..a60f50dd 100755
> --- a/scripts/isar-sstate
> +++ b/scripts/isar-sstate
> @@ -32,6 +32,11 @@ and supports three remote backends (filesystem,
> http/webdav, AWS S3).
>  The `upload` command pushes the contents of a local sstate cache to the
>  remote location, uploading all files that don't already exist on the remote.
> 
> +`--max-age` specifies after which time artifacts in the cache should be
> +refreshed. Files older than this age will be reuploaded to update its timestamp.
> +This value should be chosen to be smaller than the clean max-age to ensure that
> +the artifacts are refreshed before they are cleaned.
> +
>  ### clean
> 
>  The `clean` command deletes old artifacts from the remote cache. It takes two
> @@ -179,6 +184,17 @@ StampsRegex = re.compile(
> 
> r"(.*/)?(?P<arch>[^/]+)/(?P<pn>[^/]+)/([^/]+)\.do_(?P<task>[^/]+)\.(?P<suffix>sigdata)
> \.(?P<hash>[0-9a-f]{64})"
>  )
> 
> +def convert_duration_string_to_seconds(x):
> +    seconds_per_unit = {'s': 1, 'm': 60, 'h': 3600, 'd': 86400, 'w': 604800}
> +    m = re.match(r'^(\d+)(w|d|h|m|s)?', x)
> +    if m is None:
> +        return None
> +    unit = m.group(2)
> +    if unit is None:
> +        print("WARNING: MAX_AGE without unit, assuming 'days'")
> +        unit = 'd'
> +    return int(m.group(1)) * seconds_per_unit[unit]
> +
>  class SstateTargetBase(object):
>      def __init__(self, path, cached=False):
>          """Constructor
> @@ -598,7 +614,7 @@ def arguments():
>          '-v', '--verbose', default=False, action='store_true')
>      parser.add_argument(
>          '--max-age', type=str, default='1d',
> -        help="clean: remove archive files older than MAX_AGE (a number followed by
> w|d|h|m|s)")
> +        help="clean/upload: remove/reupload archive files older than MAX_AGE (a
> number followed by w|d|h|m|s)")
>      parser.add_argument(
>          '--max-sig-age', type=str, default=None,
>          help="clean: remove siginfo files older than MAX_SIG_AGE (defaults to
> MAX_AGE)")
> @@ -632,7 +648,7 @@ def arguments():
>      return args
> 
> 
> -def sstate_upload(source, target, verbose, **kwargs):
> +def sstate_upload(source, target, verbose, max_age="1d", **kwargs):
>      if not os.path.isdir(source):
>          print(f"WARNING: source {source} does not exist. Not uploading.")
>          return 0
> @@ -640,23 +656,37 @@ def sstate_upload(source, target, verbose, **kwargs):
>          print(f"WARNING: target {target} does not exist and could not be created. Not
> uploading.")
>          return 0
> 
> +    print(f"INFO: scanning {target}")
> +    all_files = target.list_all()
> +
> +    def target_file_present(file_path):
> +        for file in all_files:
> +            if file.path == file_path:
> +                return file
> +
>      print(f"INFO: uploading {source} to {target}")
>      os.chdir(source)
> -    upload, exists = [], []
> +    upload, exists, update = [], [], []
>      for subdir, dirs, files in os.walk('.'):
>          target_dirs = subdir.split('/')[1:]
>          for f in files:
>              file_path = (('/'.join(target_dirs) + '/') if len(target_dirs) > 0 else '') + f
> -            if target.exists(file_path):
> +            target_file = target_file_present(file_path)
> +            if target_file:
>                  if verbose:
>                      print(f"[EXISTS] {file_path}")
>                  exists.append(file_path)
> +                if target_file.age > convert_duration_string_to_seconds(max_age):
> +                    update.append((file_path, target_dirs))
> +                    if verbose:
> +                        print(f"[UPDATE] {file_path}")
>              else:
>                  upload.append((file_path, target_dirs))
> -    upload_gb = (sum([os.path.getsize(f[0]) for f in upload]) / 1024.0 / 1024.0 /
> 1024.0)
> +    upload_gb = (sum([os.path.getsize(f[0]) for f in (upload + update)]) / 1024.0 /
> 1024.0 / 1024.0)
>      print(f"INFO: uploading {len(upload)} files ({upload_gb:.02f} GB)")
>      print(f"INFO: {len(exists)} files already present on target")
> -    for file_path, target_dirs in upload:
> +    print(f"INFO: {len(update)} files will be refreshed")
> +    for file_path, target_dirs in upload + update:
>          if verbose:
>              print(f"[UPLOAD] {file_path}")
>          target.mkdir('/'.join(target_dirs))
> @@ -665,24 +695,13 @@ def sstate_upload(source, target, verbose, **kwargs):
> 
> 
>  def sstate_clean(target, max_age, max_sig_age, verbose, **kwargs):
> -    def convert_to_seconds(x):
> -        seconds_per_unit = {'s': 1, 'm': 60, 'h': 3600, 'd': 86400, 'w': 604800}
> -        m = re.match(r'^(\d+)(w|d|h|m|s)?', x)
> -        if m is None:
> -            return None
> -        unit = m.group(2)
> -        if unit is None:
> -            print("WARNING: MAX_AGE without unit, assuming 'days'")
> -            unit = 'd'
> -        return int(m.group(1)) * seconds_per_unit[unit]
> -
> -    max_age_seconds = convert_to_seconds(max_age)
> +    max_age_seconds = convert_duration_string_to_seconds(max_age)
>      if max_age_seconds is None:
>          print(f"ERROR: cannot parse MAX_AGE '{max_age}', needs to be a number
> followed by w|d|h|m|s")
>          return 1
>      if max_sig_age is None:
>          max_sig_age = max_age
> -    max_sig_age_seconds = max(max_age_seconds,
> convert_to_seconds(max_sig_age))
> +    max_sig_age_seconds = max(max_age_seconds,
> convert_duration_string_to_seconds(max_sig_age))
> 
>      if not target.exists():
>          print(f"WARNING: cannot access target {target}. Nothing to clean.")
> --
> 2.40.1
> 
> --
> You received this message because you are subscribed to the Google Groups "isar-
> users" group.
> To unsubscribe from this group and stop receiving emails from it, send an email to
> isar-users+unsubscribe@googlegroups.com.
> To view this discussion on the web visit https://groups.google.com/d/msgid/isar-
> users/20240723122703.1210290-1-tobias.schaffner%40siemens.com.

Patch

diff --git a/scripts/isar-sstate b/scripts/isar-sstate
index 4ea38bc8..a60f50dd 100755
--- a/scripts/isar-sstate
+++ b/scripts/isar-sstate
@@ -32,6 +32,11 @@  and supports three remote backends (filesystem, http/webdav, AWS S3).
 The `upload` command pushes the contents of a local sstate cache to the
 remote location, uploading all files that don't already exist on the remote.
 
+`--max-age` specifies after which time artifacts in the cache should be
+refreshed. Files older than this age will be reuploaded to update its timestamp.
+This value should be chosen to be smaller than the clean max-age to ensure that
+the artifacts are refreshed before they are cleaned.
+
 ### clean
 
 The `clean` command deletes old artifacts from the remote cache. It takes two
@@ -179,6 +184,17 @@  StampsRegex = re.compile(
     r"(.*/)?(?P<arch>[^/]+)/(?P<pn>[^/]+)/([^/]+)\.do_(?P<task>[^/]+)\.(?P<suffix>sigdata)\.(?P<hash>[0-9a-f]{64})"
 )
 
+def convert_duration_string_to_seconds(x):
+    seconds_per_unit = {'s': 1, 'm': 60, 'h': 3600, 'd': 86400, 'w': 604800}
+    m = re.match(r'^(\d+)(w|d|h|m|s)?', x)
+    if m is None:
+        return None
+    unit = m.group(2)
+    if unit is None:
+        print("WARNING: MAX_AGE without unit, assuming 'days'")
+        unit = 'd'
+    return int(m.group(1)) * seconds_per_unit[unit]
+
 class SstateTargetBase(object):
     def __init__(self, path, cached=False):
         """Constructor
@@ -598,7 +614,7 @@  def arguments():
         '-v', '--verbose', default=False, action='store_true')
     parser.add_argument(
         '--max-age', type=str, default='1d',
-        help="clean: remove archive files older than MAX_AGE (a number followed by w|d|h|m|s)")
+        help="clean/upload: remove/reupload archive files older than MAX_AGE (a number followed by w|d|h|m|s)")
     parser.add_argument(
         '--max-sig-age', type=str, default=None,
         help="clean: remove siginfo files older than MAX_SIG_AGE (defaults to MAX_AGE)")
@@ -632,7 +648,7 @@  def arguments():
     return args
 
 
-def sstate_upload(source, target, verbose, **kwargs):
+def sstate_upload(source, target, verbose, max_age="1d", **kwargs):
     if not os.path.isdir(source):
         print(f"WARNING: source {source} does not exist. Not uploading.")
         return 0
@@ -640,23 +656,37 @@  def sstate_upload(source, target, verbose, **kwargs):
         print(f"WARNING: target {target} does not exist and could not be created. Not uploading.")
         return 0
 
+    print(f"INFO: scanning {target}")
+    all_files = target.list_all()
+
+    def target_file_present(file_path):
+        for file in all_files:
+            if file.path == file_path:
+                return file
+
     print(f"INFO: uploading {source} to {target}")
     os.chdir(source)
-    upload, exists = [], []
+    upload, exists, update = [], [], []
     for subdir, dirs, files in os.walk('.'):
         target_dirs = subdir.split('/')[1:]
         for f in files:
             file_path = (('/'.join(target_dirs) + '/') if len(target_dirs) > 0 else '') + f
-            if target.exists(file_path):
+            target_file = target_file_present(file_path)
+            if target_file:
                 if verbose:
                     print(f"[EXISTS] {file_path}")
                 exists.append(file_path)
+                if target_file.age > convert_duration_string_to_seconds(max_age):
+                    update.append((file_path, target_dirs))
+                    if verbose:
+                        print(f"[UPDATE] {file_path}")
             else:
                 upload.append((file_path, target_dirs))
-    upload_gb = (sum([os.path.getsize(f[0]) for f in upload]) / 1024.0 / 1024.0 / 1024.0)
+    upload_gb = (sum([os.path.getsize(f[0]) for f in (upload + update)]) / 1024.0 / 1024.0 / 1024.0)
     print(f"INFO: uploading {len(upload)} files ({upload_gb:.02f} GB)")
     print(f"INFO: {len(exists)} files already present on target")
-    for file_path, target_dirs in upload:
+    print(f"INFO: {len(update)} files will be refreshed")
+    for file_path, target_dirs in upload + update:
         if verbose:
             print(f"[UPLOAD] {file_path}")
         target.mkdir('/'.join(target_dirs))
@@ -665,24 +695,13 @@  def sstate_upload(source, target, verbose, **kwargs):
 
 
 def sstate_clean(target, max_age, max_sig_age, verbose, **kwargs):
-    def convert_to_seconds(x):
-        seconds_per_unit = {'s': 1, 'm': 60, 'h': 3600, 'd': 86400, 'w': 604800}
-        m = re.match(r'^(\d+)(w|d|h|m|s)?', x)
-        if m is None:
-            return None
-        unit = m.group(2)
-        if unit is None:
-            print("WARNING: MAX_AGE without unit, assuming 'days'")
-            unit = 'd'
-        return int(m.group(1)) * seconds_per_unit[unit]
-
-    max_age_seconds = convert_to_seconds(max_age)
+    max_age_seconds = convert_duration_string_to_seconds(max_age)
     if max_age_seconds is None:
         print(f"ERROR: cannot parse MAX_AGE '{max_age}', needs to be a number followed by w|d|h|m|s")
         return 1
     if max_sig_age is None:
         max_sig_age = max_age
-    max_sig_age_seconds = max(max_age_seconds, convert_to_seconds(max_sig_age))
+    max_sig_age_seconds = max(max_age_seconds, convert_duration_string_to_seconds(max_sig_age))
 
     if not target.exists():
         print(f"WARNING: cannot access target {target}. Nothing to clean.")