Source code for pypersist.diskcache

"""Persistent memoisation backend that saves results in the local file system

The `persist` decorator takes a `cache` argument, which details what sort of
backend to use for the cache.  If this string begins with "file://", or if no
`cache` is specified, then a *disk cache* is used, which saves computed results
to a directory in the local file system.  This internal work is done by the
classes defined below.

"""

from .commoncache import HashCollisionError

from os import makedirs, remove, listdir
from os.path import exists, join
from time import sleep

from sys import version_info

PYTHON_VERSION = version_info[0]  # major version number
if PYTHON_VERSION >= 3:
    from collections.abc import MutableMapping, Iterator
else:
    from collections import MutableMapping, Iterator


class Cache:
    """Dictionary-like object for saving function outputs to disk

    This cache, which can be used by the `persist` decorator in `persist.py`,
    stores computed values on disk in a specified directory so that they can
    be restored later using a key.

    Like a dictionary, a key-value pair can be added using `cache[key] = val`,
    looked up using `cache[key]`, and removed using `del cache[key]`.  The
    number of values stored can be found using `len(cache)`.  (An illustrative
    usage sketch follows this class definition.)

    A disk cache might not store its keys, and therefore we cannot iterate
    through its keys as we can with a dictionary.  However, see
    `CacheWithKeys`.

    Parameters
    ----------
    func : persist_wrapper
        Memoised function whose results this is caching.  Options which are
        not specific to local disk storage, such as the key, hash, and pickle
        functions, are taken from this.
    dir : str
        Directory into which to save results.  The same directory can be used
        for several different functions, since a subdirectory will be created
        for each function based on its `funcname`.

    """

    def __init__(self, func, dir):
        self._func = func
        self._dir = join(dir, self._func._funcname)
        if not exists(self._dir):
            makedirs(self._dir)

    def __getitem__(self, key):
        lockfname = self._key_to_fname(key, LOCK)
        while exists(lockfname):
            sleep(0.1)  # wait before reading
        fname = self._key_to_fname(key, OUT)
        # Print that we are reading the cache, depending on verbosity.
        if self._func._verbosity == 3:
            print("Getting cache.")
        elif self._func._verbosity > 3:
            print(
                "Getting key {key} from {fname}.".format(key=key, fname=fname)
            )
        if self._func._unhash:
            storedkey = self._fname_to_key(fname)
            if storedkey != key:
                # Print the error if verbosity is high enough.
                if self._func._verbosity > 0:
                    print(
                        "Key ({key}) does not match stored key ({storedkey}).".format(
                            key=key, storedkey=storedkey
                        )
                    )
                raise HashCollisionError(storedkey, key)
        if exists(fname):
            if self._func._storekey:
                keyfname = self._key_to_fname(key, KEY)
                assert exists(keyfname)
                keyfile = open(keyfname, "r")
                keystring = keyfile.read()
                keyfile.close()
                storedkey = self._func._unpickle(keystring)
                if storedkey != key:
                    raise HashCollisionError(storedkey, key)
            file = open(fname, "r")
            val = self._func._unpickle(file.read())
            file.close()
        else:
            # Not found in cache (not a problem, just a new value)
            if self._func._verbosity >= 3:
                print(
                    "No entry for {key} as {fname} does not exist.".format(
                        key=key, fname=fname
                    )
                )
            raise KeyError(key)
        # Print that we are done reading the cache.
        if self._func._verbosity > 2:
            print("Done reading cache.")
        return val

    def __setitem__(self, key, val):
        to_write = []  # list of (filename, string) pairs
        # .out file
        outfname = self._key_to_fname(key, OUT)
        outstring = self._func._pickle(val)
        to_write.append((outfname, outstring))
        # .key file
        if self._func._storekey:
            keyfname = self._key_to_fname(key, KEY)
            keystring = self._func._pickle(key)
            to_write.append((keyfname, keystring))
        # .meta file
        if self._func._metadata:
            metafname = self._key_to_fname(key, META)
            metastring = self._func._metadata()
            to_write.append((metafname, metastring))
        # Get a lock on this result.
        lockfname = self._key_to_fname(key, LOCK)
        if exists(lockfname) or exists(outfname):
            return  # another thread got here first - abort!
        open(lockfname, "w").close()
        # Print what we are writing, depending on verbosity.
        if self._func._verbosity == 2:
            print("Writing to files.")
        # Do the file operations.
        for (fname, string) in to_write:
            if self._func._verbosity == 3:
                print("Writing to {fname}".format(fname=fname))
            elif self._func._verbosity > 3:
                print(
                    "Writing {string} to {fname}".format(
                        string=string, fname=fname
                    )
                )
            file = open(fname, "w")
            file.write(string)
            file.close()
            if self._func._verbosity > 2:
                print("Done writing {fname}.".format(fname=fname))
        # Print that all files have been written.
        if self._func._verbosity > 2:
            print("Done writing all files.")
        # Unlock this result.
        remove(lockfname)

    def __delitem__(self, key):
        lockfname = self._key_to_fname(key, LOCK)
        while exists(lockfname):
            sleep(0.1)  # wait before deleting
        # Print that we are deleting, depending on verbosity.
        if self._func._verbosity == 3:
            print("Deleting cache item.")
        for ext in [OUT, KEY, META]:
            fname = self._key_to_fname(key, ext)
            # Print which file we are deleting, depending on verbosity.
            if self._func._verbosity > 3:
                print(
                    "Deleting cache item {key} in file {fname}.".format(
                        fname=fname, key=key
                    )
                )
            if exists(fname):
                remove(fname)
            elif ext == OUT:
                raise KeyError(key)
            # Print that this file has been dealt with.
            if self._func._verbosity > 3:
                print("File {fname} deleted.".format(fname=fname))
        # Print that we are done deleting.
        if self._func._verbosity >= 3:
            print("Done deleting cache item.")

    def __len__(self):
        # Number of files ending with ".out"
        return sum(fname.endswith(OUT) for fname in listdir(self._dir))

    def clear(self):
        """Delete all the results stored in this cache"""
        # Print that we are clearing the cache, depending on verbosity.
        if self._func._verbosity > 1:
            print("Clearing cache.")
        for f in listdir(self._dir):
            path = join(self._dir, f)
            # TODO: safety checks?
            remove(path)
        # Print that we are done clearing the cache, depending on verbosity.
        if self._func._verbosity > 2:
            print("Cache cleared.")

    def _key_to_fname(self, key, ext):
        h = self._func._hash(key)
        return join(self._dir, h + ext)

    def _fname_to_key(self, fname):
        if fname.startswith(self._dir):
            fname = fname[len(self._dir + "/") :]  # remove directory
        h = fname[: fname.rfind(".")]  # remove extension
        return self._func._unhash(h)
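

# Illustrative sketch of the mapping interface above (names are made up:
# `wrapper` stands for a memoised function produced by the `persist`
# decorator, and `key` for one of its keys).  Each key is hashed to a
# filename inside the cache directory.
#
#     cache = Cache(wrapper, "results")  # files live in results/<funcname>/
#     cache[key] = 42                    # pickles 42 into <hash(key)>.out
#     cache[key]                         # -> 42, unpickled from that file
#     len(cache)                         # -> number of *.out files
#     del cache[key]                     # removes the .out/.key/.meta files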


class CacheWithKeys(Cache, MutableMapping):
    """Mutable mapping for saving function outputs to disk

    This subclass of `Cache` can be used in place of `Cache` whenever
    `storekey` is True or `unhash` is set, to implement the `MutableMapping`
    abstract base class.  This allows the cache to be used exactly like a
    dictionary, including the ability to iterate through all keys in the
    cache.

    """

    def __iter__(self):
        return self.KeysIter(self)

    class KeysIter(Iterator):
        """Iterator class for the keys of a `CacheWithKeys` object"""

        def __init__(self, cache):
            self._cache = cache
            self._pos = 0
            self._files = [
                fname
                for fname in listdir(self._cache._dir)
                if fname.endswith(OUT)
            ]

        def __next__(self):
            if self._pos >= len(self._files):
                raise StopIteration
            fname = self._files[self._pos]
            if self._cache._func._unhash:
                # Unhash the key from the filename
                key = self._cache._fname_to_key(fname)
            else:
                assert self._cache._func._storekey
                # Read the key from the corresponding .key file
                path = join(self._cache._dir, fname)
                path = path[: -len(OUT)] + KEY
                file = open(path, "r")
                string = file.read()
                key = self._cache._func._unpickle(string)
                file.close()
            self._pos += 1
            return key

        next = __next__  # for Python 2 compatibility


# Filename extensions
OUT = ".out"
KEY = ".key"
META = ".meta"
LOCK = ".lock"
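
# Illustrative sketch of the resulting on-disk layout (the function name and
# hash below are made up).  Each cached result is a group of files sharing
# one hashed stem:
#
#     results/double/
#         1a2b3c4d.out   # pickled return value (always written)
#         1a2b3c4d.key   # pickled key, only if `storekey` is set
#         1a2b3c4d.meta  # metadata string, only if `metadata` is set
#         1a2b3c4d.lock  # transient marker held while the result is written
#
# When `storekey` is True or `unhash` is set, `CacheWithKeys` can recover the
# keys from these files, so the cache supports ordinary mapping iteration:
#
#     for key in cache:  # cache being a CacheWithKeys instance
#         print(key, cache[key])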