1"""A dumb and slow but simple dbm clone.
2
3For database spam, spam.dir contains the index (a text file),
4spam.bak *may* contain a backup of the index (also a text file),
5while spam.dat contains the data (a binary file).
6
7XXX TO DO:
8
9- seems to contain a bug when updating...
10
11- reclaim free space (currently, space once occupied by deleted or expanded
12items is never reused)
13
14- support concurrent access (currently, if two processes take turns making
15updates, they can mess up the index)
16
17- support efficient access to large databases (currently, the whole index
18is read when the database is opened, and some updates rewrite the whole index)
19
20- support opening for read-only (flag = 'm')
21
22"""
23
24import ast as _ast
25import io as _io
26import os as _os
27import collections.abc
28
# Names exported by "from <module> import *".
__all__ = ["error", "open"]

# Values in the data file start at offsets that are multiples of this
# many bytes (see _Database._addval()).
_BLOCKSIZE = 512

# Exception raised by this module, e.g. when a closed database is used;
# it is simply an alias of OSError.
error = OSError
34
class _Database(collections.abc.MutableMapping):
    """Mapping persisted in a directory (index) file and a data file.

    For a database named ``spam``:

    - ``spam.dat`` holds the values as raw bytes, each one starting at
      a _BLOCKSIZE-aligned offset;
    - ``spam.dir`` is a text index with one ``key, (pos, siz)`` line
      per key;
    - ``spam.bak`` is the previous index file, kept by _commit().

    Keys are always held as bytes: str keys are encoded as UTF-8 and
    bytearray keys are copied into bytes.  Values are returned as bytes.
    """

    # The on-disk directory and data files can remain in mutually
    # inconsistent states for an arbitrarily long time (see comments
    # at the end of __setitem__).  This is only repaired when _commit()
    # gets called.  One place _commit() gets called is from __del__(),
    # and if that occurs at program shutdown time, module globals may
    # already have gotten rebound to None.  Since it's crucial that
    # _commit() finish successfully, we can't ignore shutdown races
    # here, and _commit() must not reference any globals.
    _os = _os       # for _commit()
    _io = _io       # for _commit()

    # Class-level defaults so that close() / __del__() are harmless when
    # __init__() raised before the instance attributes were assigned.
    _index = None
    _modified = False

    def __init__(self, filebasename, mode, flag='c'):
        """Open the database called filebasename, creating it if needed.

        filebasename is the path of the database without any extension;
        it may be anything os.fsdecode() accepts (str, bytes and, where
        supported, path-like objects).  mode is the permission mode
        applied with chmod to the files this object creates or rewrites.
        flag is one of 'r', 'w', 'c' or 'n': 'n' removes any existing
        files first, and 'r' marks the database read-only (modifying it
        then emits a DeprecationWarning but is still carried out).
        """
        self._mode = mode
        self._readonly = (flag == 'r')

        # Normalise bytes / path-like names to str so that the
        # extensions below can be appended; str passes through unchanged.
        filebasename = _os.fsdecode(filebasename)

        # The directory file is a text file.  Each line looks like
        #    "%r, (%d, %d)\n" % (key, pos, siz)
        # where key is the string key, pos is the offset into the dat
        # file of the associated value's first byte, and siz is the number
        # of bytes in the associated value.
        self._dirfile = filebasename + '.dir'

        # The data file is a binary file pointed into by the directory
        # file, and holds the values associated with keys.  Each value
        # begins at a _BLOCKSIZE-aligned byte offset, and is a raw
        # binary 8-bit string value.
        self._datfile = filebasename + '.dat'
        self._bakfile = filebasename + '.bak'

        # The index is an in-memory dict, mirroring the directory file.
        self._index = None  # maps keys to (pos, siz) pairs

        # Handle the creation
        self._create(flag)
        self._update(flag)

    @staticmethod
    def _as_bytes_key(key):
        """Return key in the form used by the in-memory index.

        str is encoded as UTF-8 and bytearray is copied into bytes
        (a bytearray is unhashable, so it cannot be looked up in the
        index dict directly).  Any other object is returned unchanged.
        """
        if isinstance(key, str):
            return key.encode('utf-8')
        if isinstance(key, bytearray):
            return bytes(key)
        return key

    def _create(self, flag):
        """Make sure the data file exists, wiping the database for 'n'."""
        if flag == 'n':
            for filename in (self._datfile, self._bakfile, self._dirfile):
                try:
                    _os.remove(filename)
                except OSError:
                    pass
        # Mod by Jack: create data file if needed
        try:
            f = _io.open(self._datfile, 'r', encoding="Latin-1")
        except OSError:
            if flag not in ('c', 'n'):
                # stacklevel=4 relies on the call chain
                # open() -> __init__() -> _create().
                import warnings
                warnings.warn("The database file is missing, the "
                              "semantics of the 'c' flag will be used.",
                              DeprecationWarning, stacklevel=4)
            with _io.open(self._datfile, 'w', encoding="Latin-1") as f:
                self._chmod(self._datfile)
        else:
            f.close()

    # Read directory file into the in-memory index dict.
    def _update(self, flag):
        """Rebuild self._index from the directory file, if there is one.

        When the directory file cannot be opened the index starts out
        empty and, unless the database is read-only, is flagged as
        modified so that _commit() will write a directory file.
        """
        self._index = {}
        try:
            f = _io.open(self._dirfile, 'r', encoding="Latin-1")
        except OSError:
            self._modified = not self._readonly
            if flag not in ('c', 'n'):
                # stacklevel=4 relies on the call chain
                # open() -> __init__() -> _update().
                import warnings
                warnings.warn("The index file is missing, the "
                              "semantics of the 'c' flag will be used.",
                              DeprecationWarning, stacklevel=4)
        else:
            self._modified = False
            with f:
                for line in f:
                    line = line.rstrip()
                    key, pos_and_siz_pair = _ast.literal_eval(line)
                    # Keys are written Latin-1 decoded (see _commit());
                    # encoding restores the original bytes.
                    key = key.encode('Latin-1')
                    self._index[key] = pos_and_siz_pair

    # Write the index dict to the directory file.  The original directory
    # file (if any) is renamed with a .bak extension first.  If a .bak
    # file currently exists, it's deleted.
    def _commit(self):
        # CAUTION:  It's vital that _commit() succeed, and _commit() can
        # be called from __del__().  Therefore we must never reference a
        # global in this routine.
        if self._index is None or not self._modified:
            return  # nothing to do

        try:
            self._os.unlink(self._bakfile)
        except OSError:
            pass

        try:
            self._os.rename(self._dirfile, self._bakfile)
        except OSError:
            pass

        with self._io.open(self._dirfile, 'w', encoding="Latin-1") as f:
            self._chmod(self._dirfile)
            for key, pos_and_siz_pair in self._index.items():
                # Use Latin-1 since it has no qualms with any value in any
                # position; UTF-8, though, does care sometimes.
                entry = "%r, %r\n" % (key.decode('Latin-1'), pos_and_siz_pair)
                f.write(entry)

    sync = _commit

    def _verify_open(self):
        """Raise error if the database has already been closed."""
        if self._index is None:
            raise error('DBM object has already been closed')

    def __getitem__(self, key):
        """Return the bytes stored under key; raise KeyError if absent."""
        key = self._as_bytes_key(key)
        self._verify_open()
        pos, siz = self._index[key]     # may raise KeyError
        with _io.open(self._datfile, 'rb') as f:
            f.seek(pos)
            dat = f.read(siz)
        return dat

    # Append val to the data file, starting at a _BLOCKSIZE-aligned
    # offset.  The data file is first padded with NUL bytes (if needed)
    # to get to an aligned offset.  Return pair
    #     (starting offset of val, len(val))
    def _addval(self, val):
        with _io.open(self._datfile, 'rb+') as f:
            f.seek(0, 2)
            pos = int(f.tell())
            npos = ((pos + _BLOCKSIZE - 1) // _BLOCKSIZE) * _BLOCKSIZE
            f.write(b'\0'*(npos-pos))
            pos = npos
            f.write(val)
        return (pos, len(val))

    # Write val to the data file, starting at offset pos.  The caller
    # is responsible for ensuring that there's enough room starting at
    # pos to hold val, without overwriting some other value.  Return
    # pair (pos, len(val)).
    def _setval(self, pos, val):
        with _io.open(self._datfile, 'rb+') as f:
            f.seek(pos)
            f.write(val)
        return (pos, len(val))

    # key is a new key whose associated value starts in the data file
    # at offset pos and with length siz.  Add an index record to
    # the in-memory index dict, and append one to the directory file.
    def _addkey(self, key, pos_and_siz_pair):
        self._index[key] = pos_and_siz_pair
        with _io.open(self._dirfile, 'a', encoding="Latin-1") as f:
            self._chmod(self._dirfile)
            f.write("%r, %r\n" % (key.decode("Latin-1"), pos_and_siz_pair))

    def __setitem__(self, key, val):
        """Store val under key.

        key and val must each be str, bytes or bytearray, otherwise
        TypeError is raised; str is encoded as UTF-8.
        """
        if self._readonly:
            import warnings
            warnings.warn('The database is opened for reading only',
                          DeprecationWarning, stacklevel=2)
        # After normalisation every acceptable key is a bytes object.
        key = self._as_bytes_key(key)
        if not isinstance(key, bytes):
            raise TypeError("keys must be bytes or strings")
        if isinstance(val, str):
            val = val.encode('utf-8')
        elif not isinstance(val, (bytes, bytearray)):
            raise TypeError("values must be bytes or strings")
        self._verify_open()
        self._modified = True
        if key not in self._index:
            self._addkey(key, self._addval(val))
        else:
            # See whether the new value is small enough to fit in the
            # (padded) space currently occupied by the old value.
            pos, siz = self._index[key]
            oldblocks = (siz + _BLOCKSIZE - 1) // _BLOCKSIZE
            newblocks = (len(val) + _BLOCKSIZE - 1) // _BLOCKSIZE
            if newblocks <= oldblocks:
                self._index[key] = self._setval(pos, val)
            else:
                # The new value doesn't fit in the (padded) space used
                # by the old value.  The blocks used by the old value are
                # forever lost.
                self._index[key] = self._addval(val)

            # Note that _index may be out of synch with the directory
            # file now:  _setval() and _addval() don't update the directory
            # file.  This also means that the on-disk directory and data
            # files are in a mutually inconsistent state, and they'll
            # remain that way until _commit() is called.  Note that this
            # is a disaster (for the database) if the program crashes
            # (so that _commit() never gets called).

    def __delitem__(self, key):
        """Remove key from the index; raise KeyError if it is absent."""
        if self._readonly:
            import warnings
            warnings.warn('The database is opened for reading only',
                          DeprecationWarning, stacklevel=2)
        key = self._as_bytes_key(key)
        self._verify_open()
        # The blocks used by the associated value are lost.
        del self._index[key]            # may raise KeyError
        # Only flag the index as dirty once the deletion has actually
        # happened, so a failed delete does not force a rewrite later.
        self._modified = True
        # XXX It's unclear why we do a _commit() here (the code always
        # XXX has, so I'm not changing it).  __setitem__ doesn't try to
        # XXX keep the directory file in synch.  Why should we?  Or
        # XXX why shouldn't __setitem__?
        self._commit()

    def keys(self):
        """Return a list of all keys (as bytes)."""
        try:
            return list(self._index)
        except TypeError:
            raise error('DBM object has already been closed') from None

    def items(self):
        """Return a list of (key, value) pairs, reading every value."""
        self._verify_open()
        return [(key, self[key]) for key in self._index]

    def __contains__(self, key):
        """Return True if key (str, bytes or bytearray) is in the index."""
        key = self._as_bytes_key(key)
        try:
            return key in self._index
        except TypeError:
            # Either the database is closed (_index is None) or the key
            # is unhashable; only the former is reported as error.
            if self._index is None:
                raise error('DBM object has already been closed') from None
            else:
                raise

    def iterkeys(self):
        """Return an iterator over the keys."""
        try:
            return iter(self._index)
        except TypeError:
            raise error('DBM object has already been closed') from None
    __iter__ = iterkeys

    def __len__(self):
        """Return the number of keys."""
        try:
            return len(self._index)
        except TypeError:
            raise error('DBM object has already been closed') from None

    def close(self):
        """Commit pending index changes and mark the database closed."""
        try:
            self._commit()
        finally:
            self._index = self._datfile = self._dirfile = self._bakfile = None

    __del__ = close

    def _chmod(self, file):
        """Apply the stored mode to file where os.chmod is available."""
        if hasattr(self._os, 'chmod'):
            self._os.chmod(file, self._mode)

    def __enter__(self):
        return self

    def __exit__(self, *args):
        self.close()
299
300
def open(file, flag='c', mode=0o666):
    """Open the database named file and return the database object.

    flag is expected to be one of 'r', 'w', 'c' or 'n'; any other value
    triggers a DeprecationWarning.  The flag is handed on unchanged to
    the database object, which will be created if it does not exist.

    mode is the UNIX permission mode used for the database files.  It
    defaults to octal 0o666 and, where os.umask() is available, has the
    bits of the prevailing umask cleared before use.

    """

    # The umask can only be read by setting it, so set it to 0 and put
    # the previous value straight back.
    try:
        current_umask = _os.umask(0)
        _os.umask(current_umask)
    except AttributeError:
        # No umask support: use the requested mode untouched.
        effective_mode = mode
    else:
        # Clear every permission bit that the umask masks out.
        effective_mode = mode & (~current_umask)

    accepted_flags = ('r', 'w', 'c', 'n')
    if flag not in accepted_flags:
        import warnings
        warnings.warn("Flag must be one of 'r', 'w', 'c', or 'n'",
                      DeprecationWarning, stacklevel=2)

    return _Database(file, effective_mode, flag=flag)
330