"SfR Fresh" - the SfR Freeware/Shareware Archive

Member "spambayes-1.0.4/spambayes/cdb.py" of archive spambayes-1.0.4.zip:


As a special service "SfR Fresh" has tried to format the requested source page into HTML format using source code syntax highlighting with prefixed line numbers. Alternatively you can here view or download the uninterpreted source code file. That can be also achieved for any archive member file by clicking within an archive contents listing on the first character of the file(path) respectively on the according byte size field.
    1 #! /usr/bin/env python
    2 """
    3 Dan Bernstein's CDB implemented in Python
    4 
    5 see http://cr.yp.to/cdb.html
    6 
    7 """
    8 
    9 from __future__ import generators
   10 
   11 import os
   12 import struct
   13 import mmap
   14 
   15 def uint32_unpack(buf):
   16     return struct.unpack('<L', buf)[0]
   17 
   18 def uint32_pack(n):
   19     return struct.pack('<L', n)
   20 
   21 CDB_HASHSTART = 5381
   22 
   23 def cdb_hash(buf):
   24     h = CDB_HASHSTART
   25     for c in buf:
   26         h = (h + (h << 5)) & 0xffffffffL
   27         h ^= ord(c)
   28     return h
   29 
   30 class Cdb(object):
   31 
   32     def __init__(self, fp):
   33         self.fp = fp
   34         fd = fp.fileno()
   35         self.size = os.fstat(fd).st_size
   36         self.map = mmap.mmap(fd, self.size, access=mmap.ACCESS_READ)
   37         self.eod = uint32_unpack(self.map[:4])
   38         self.findstart()
   39         self.loop = 0 # number of hash slots searched under this key
   40         # initialized if loop is nonzero
   41         self.khash = 0
   42         self.hpos = 0
   43         self.hslots = 0
   44         # initialized if findnext() returns 1
   45         self.dpos = 0
   46         self.dlen = 0
   47 
   48     def close(self):
   49         self.map.close()
   50 
   51     def __iter__(self, fn=None):
   52         len = 2048
   53         while len < self.eod:
   54             klen, vlen = struct.unpack("<LL", self.map[len:len+8])
   55             len += 8
   56             key = self.map[len:len+klen]
   57             len += klen
   58             val = self.map[len:len+vlen]
   59             len += vlen
   60             if fn:
   61                 yield fn(key, val)
   62             else:
   63                 yield (key, val)
   64 
   65     def iteritems(self):
   66         return self.__iter__()
   67 
   68     def iterkeys(self):
   69         return self.__iter__(lambda k,v: k)
   70 
   71     def itervalues(self):
   72         return self.__iter__(lambda k,v: v)
   73 
   74     def items(self):
   75         ret = []
   76         for i in self.iteritems():
   77             ret.append(i)
   78         return ret
   79 
   80     def keys(self):
   81         ret = []
   82         for i in self.iterkeys():
   83             ret.append(i)
   84         return ret
   85 
   86     def values(self):
   87         ret = []
   88         for i in self.itervalues():
   89             ret.append(i)
   90         return ret
   91 
   92     def findstart(self):
   93         self.loop = 0
   94 
   95     def read(self, n, pos):
   96         # XXX add code for platforms without mmap
   97         return self.map[pos:pos+n]
   98 
   99     def match(self, key, pos):
  100         if key == self.read(len(key), pos):
  101             return 1
  102         else:
  103             return 0
  104 
  105     def findnext(self, key):
  106         if not self.loop:
  107             u = cdb_hash(key)
  108             buf = self.read(8, u << 3 & 2047)
  109             self.hslots = uint32_unpack(buf[4:])
  110             if not self.hslots:
  111                 raise KeyError
  112             self.hpos = uint32_unpack(buf[:4])
  113             self.khash = u
  114             u >>= 8
  115             u %= self.hslots
  116             u <<= 3
  117             self.kpos = self.hpos + u
  118 
  119         while self.loop < self.hslots:
  120             buf = self.read(8, self.kpos)
  121             pos = uint32_unpack(buf[4:])
  122             if not pos:
  123                 raise KeyError
  124             self.loop += 1
  125             self.kpos += 8
  126             if self.kpos == self.hpos + (self.hslots << 3):
  127                 self.kpos = self.hpos
  128             u = uint32_unpack(buf[:4])
  129             if u == self.khash:
  130                 buf = self.read(8, pos)
  131                 u = uint32_unpack(buf[:4])
  132                 if u == len(key):
  133                     if self.match(key, pos + 8):
  134                         dlen = uint32_unpack(buf[4:])
  135                         dpos = pos + 8 + len(key)
  136                         return self.read(dlen, dpos)
  137         raise KeyError
  138 
  139     def __getitem__(self, key):
  140         self.findstart()
  141         return self.findnext(key)
  142 
  143     def get(self, key, default=None):
  144         self.findstart()
  145         try:
  146             return self.findnext(key)
  147         except KeyError:
  148             return default
  149 
  150 def cdb_dump(infile):
  151     """dump a database in djb's cdbdump format"""
  152     db = Cdb(infile)
  153     for key,value in db.iteritems():
  154         print "+%d,%d:%s->%s" % (len(key), len(value), key, value)
  155     print
  156 
  157 def cdb_make(outfile, items):
  158     pos = 2048
  159     tables = {} # { h & 255 : [(h, p)] }
  160 
  161     # write keys and data
  162     outfile.seek(pos)
  163     for key, value in items:
  164         outfile.write(uint32_pack(len(key)) + uint32_pack(len(value)))
  165         h = cdb_hash(key)
  166         outfile.write(key)
  167         outfile.write(value)
  168         tables.setdefault(h & 255, []).append((h, pos))
  169         pos += 8 + len(key) + len(value)
  170 
  171     final = ''
  172     # write hash tables
  173     for i in range(256):
  174         entries = tables.get(i, [])
  175         nslots = 2*len(entries)
  176         final += uint32_pack(pos) + uint32_pack(nslots)
  177         null = (0, 0)
  178         table = [null] * nslots
  179         for h, p in entries:
  180             n = (h >> 8) % nslots
  181             while table[n] is not null:
  182                 n = (n + 1) % nslots
  183             table[n] = (h, p)
  184         for h, p in table:
  185             outfile.write(uint32_pack(h) + uint32_pack(p))
  186             pos += 8
  187 
  188     # write header (pointers to tables and their lengths)
  189     outfile.flush()
  190     outfile.seek(0)
  191     outfile.write(final)
  192 
  193 
  194 def test():
  195     #db = Cdb(open("t"))
  196     #print db['one']
  197     #print db['two']
  198     #print db['foo']
  199     #print db['us']
  200     #print db.get('ec')
  201     #print db.get('notthere')
  202     db = open('test.cdb', 'wb')
  203     cdb_make(db,
  204              [('one', 'Hello'),
  205               ('two', 'Goodbye'),
  206               ('foo', 'Bar'),
  207               ('us', 'United States'),
  208               ])
  209     db.close()
  210     db = Cdb(open("test.cdb", 'rb'))
  211     print db['one']
  212     print db['two']
  213     print db['foo']
  214     print db['us']
  215     print db.get('ec')
  216     print db.get('notthere')
  217 
  218 if __name__ == '__main__':
  219     test()