Sunday, October 19, 2008

Pure Python Dictionary Implementation

For those curious about how CPython's dict implementation works, I've written a Python implementation using the same algorithms. Aside from the education value, it's pretty useless because it doesn't support None as a value and is extremely slow. You can get the source in a Bazaar repo: http://code.python.org/python/users/benjamin.peterson/pydict/

"""
A Python dict implementation.
"""

import collections

MINSIZE = 8
PERTURB_SHIFT = 5
dummy = "<dummy key>"


class Entry(object):
"""
A hash table entry.

Attributes:
* key - The key for this entry.
* hash - The has of the key.
* value - The value associated with the key.
"""

__slots__ = ("key", "value", "hash")

def __init__(self):
self.key = None
self.value = None
self.hash = 0

def __repr__(self):
return "<Entry: key={0} value={1}>".format(self.key, self.value)



class Dict(object):
"""
A mapping interface implemented as a hash table.

Attributes:
* used - The number of entires used in the table.
* filled - used + number of entries with a dummy key.
* table - List of entries; contains the actual dict data.
* mask - Length of table - 1. Used to fetch values.
"""

__slots__ = ("filled", "used", "mask", "table")


def __init__(self, arg=None, **kwargs):
self.clear()
self._update(arg, kwargs)

@classmethod
def fromkeys(cls, keys, value=0):
"""
Return a new dictionary from a sequence of keys.
"""
d = cls()
for key in keys:
d[key] = value
return d

def clear(self):
"""
Clear the dictionary of all data.
"""
self.filled = 0
self.used = 0
self.mask = MINSIZE - 1
self.table = []
# Initialize the table to a clean slate of entries.
for i in range(MINSIZE):
self.table.append(Entry())

def pop(self, *args):
"""
Remove and return the value for a key.
"""
have_default = len(args) == 2
try:
v = self[args[0]]
except KeyError:
if have_default:
return args[1]
raise
else:
del self[args[0]]
return v

def popitem(self):
"""
Remove and return any key-value pair from the dictionary.
"""
if self.used == 0:
raise KeyError("empty dictionary")
entry0 = self.table[0]
entry = entry0
i = 0
if entry0.value is None:
# The first entry in the table's hash is abused to hold the index to
# the next place to look for a value to pop.
i = entry0.hash
if i > self.mask or i < i:
i = 1
entry = self.table[i]
while entry.value is None:
i += 1
if i > self.mask:
i = 1
entry = self.table[i]
res = entry.key, entry.value
self._del(entry)
# Set the next place to start.
entry0.hash = i + 1
return res

def setdefault(self, key, default=0):
"""
If key is in the dictionary, return it. Otherwise, set it to the default
value.
"""
val = self._lookup(key).value
if val is None:
self[key] = default
return default
return val

def _lookup(self, key):
"""
Find the entry for a key.
"""
key_hash = hash(key)
i = key_hash & self.mask
entry = self.table[i]
if entry.key is None or entry is key:
return entry
free = None
if entry.key is dummy:
free = entry
elif entry.hash == key_hash and key == entry.key:
return entry

perturb = key_hash
while True:
i = (i << 2) + i + perturb + 1;
entry = self.table[i & self.mask]
if entry.key is None:
return entry if free is None else free
if entry.key is key or \
(entry.hash == key_hash and key == entry.key):
return entry
elif entry.key is dummy and free is None:
free = dummy
perturb >>= PERTURB_SHIFT

assert False, "not reached"

def _resize(self, minused):
"""
Resize the dictionary to at least minused.
"""
newsize = MINSIZE
# Find the smalled value for newsize.
while newsize <= minused and newsize > 0:
newsize <<= 1
oldtable = self.table
# Create a new table newsize long.
newtable = []
while len(newtable) < newsize:
newtable.append(Entry())
# Replace the old table.
self.table = newtable
self.used = 0
self.filled = 0
# Copy the old data into the new table.
for entry in oldtable:
if entry.value is not None:
self._insert_into_clean(entry)
elif entry.key is dummy:
entry.key = None
self.mask = newsize - 1

def _insert_into_clean(self, entry):
"""
Insert an item in a clean dict. This is a helper for resizing.
"""
i = entry.hash & self.mask
new_entry = self.table[i]
perturb = entry.hash
while new_entry.key is not None:
i = (i << 2) + i + perturb + 1
new_entry = self.table[i & self.mask]
perturb >>= PERTURB_SHIFT
new_entry.key = entry.key
new_entry.value = entry.value
new_entry.hash = entry.hash
self.used += 1
self.filled += 1

def _insert(self, key, value):
"""
Add a new value to the dictionary or replace an old one.
"""
entry = self._lookup(key)
if entry.value is None:
self.used += 1
if entry.key is not dummy:
self.filled += 1
entry.key = key
entry.hash = hash(key)
entry.value = value

def _del(self, entry):
"""
Mark an entry as free with the dummy key.
"""
entry.key = dummy
entry.value = None
self.used -= 1

def __getitem__(self, key):
value = self._lookup(key).value
if value is None:
# Check if we're a subclass.
if type(self) is not Dict:
# Try to call the __missing__ method.
missing = getattr(self, "__missing__")
if missing is not None:
return missing(key)
raise KeyError("no such key: {0!r}".format(key))
return value

def __setitem__(self, key, what):
# None is used as a marker for empty entries, so it can't be in a
# dictionary.
assert what is not None and key is not None, \
"key and value must not be None"
old_used = self.used
self._insert(key, what)
# Maybe resize the dict.
if not (self.used > old_used and
self.filled*3 >= (self.mask + 1)*2):
return
# Large dictionaries (< 5000) are only doubled in size.
factor = 2 if self.used > 5000 else 4
self._resize(factor*self.used)

def __delitem__(self, key):
entry = self._lookup(key)
if entry.value is None:
raise KeyError("no such key: {0!r}".format(key))
self._del(entry)

def __contains__(self, key):
"""
Check if a key is in the dictionary.
"""
return self._lookup(key).value is not None

def __eq__(self, other):
if not isinstance(other, Dict):
try:
# Try to coerce the other to a Dict, so we can compare it.
other = Dict(other)
except TypeError:
return NotImplemented
if self.used != other.used:
# They're not the same size.
return False
# Look through the table and compare every entry, breaking out early if
# we find a difference.
for entry in self.table:
if entry.value is not None:
try:
bval = other[entry.key]
except KeyError:
return False
if not bval == entry.value:
return False
return True

def __ne__(self, other):
return not self == other

def keys(self):
"""
Return a list of keys in the dictionary.
"""
return [entry.key for entry in self.table if entry.value is not None]

def values(self):
"""
Return a list of values in the dictionary.
"""
return [entry.value for entry in self.table if entry.value is not None]

def items(self):
"""
Return a list of key-value pairs.
"""
return [(entry.key, entry.value) for entry in self.table
if entry.value is not None]

def __iter__(self):
return DictKeysIterator(self)

def itervalues(self):
"""
Return an iterator over the values in the dictionary.
"""
return DictValuesIterator(self)

def iterkeys(self):
"""
Return an iterator over the keys in the dictionary.
"""
return DictKeysIterator(self)

def iteritems(self):
"""
Return an iterator over key-value pairs.
"""
return DictItemsIterator(self)

def _merge(self, mapping):
"""
Update the dictionary from a mapping.
"""
for key in mapping.keys():
self[key] = mapping[key]

def _from_sequence(self, seq):
for double in seq:
if len(double) != 2:
raise ValueError("{0!r} doesn't have a length of 2".format(
double))
self[double[0]] = double[1]

def _update(self, arg, kwargs):
if arg:
if isinstance(arg, collections.Mapping):
self._merge(arg)
else:
self._from_sequence(arg)
if kwargs:
self._merge(kwargs)

def update(self, arg=None, **kwargs):
"""
Update the dictionary from a mapping or sequence containing key-value
pairs. Any existing values are overwritten.
"""
self._update(arg, kwargs)

def get(self, key, default=0):
"""
Return the value for key if it exists otherwise the default.
"""
try:
return self[key]
except KeyError:
return default

def __len__(self):
return self.used

def __repr__(self):
r = ["{0!r} : {1!r}".format(k, v) for k, v in self.iteritems()]
return "Dict({" + ", ".join(r) + "})"

collections.Mapping.register(Dict)


class DictIterator(object):

def __init__(self, d):
self.d = d
self.used = self.d.used
self.len = self.d.used
self.pos = 0

def __iter__(self):
return self

def next(self):
# Check if the dictionary has been mutated under us.
if self.used != self.d.used:
# Make this state permanent.
self.used = -1
raise RuntimeError("dictionary size changed during interation")
i = self.pos
while i <= self.d.mask and self.d.table[i].value is None:
i += 1
self.pos = i + 1
if i > self.d.mask:
# We're done.
raise StopIteration
self.len -= 1
return self._extract(self.d.table[i])

__next__ = next

def _extract(self, entry):
return getattr(entry, self.kind)

def __len__(self):
return self.len

class DictKeysIterator(DictIterator):
kind = "key"

class DictValuesIterator(DictIterator):
kind = "value"

class DictItemsIterator(DictIterator):

def _extract(self, entry):
return entry.key, entry.value

3 comments:

Jose said...

Is it really that slow? I have made a subclass of dictionary, the update method has to do some special things, I've to choose between updating the normal way and then fixing or rewriting the update method (more precisely _merge in your implementation).

I was trying to choose without implementing and then testing performance. Yep... laziness...

Benjamin said...

@Jose I have no idea. The point was just to show how dictionary works not reimplement. You'd have to do your own benchmarks.

Tahir Ahmed said...

@Benjamin i am trying to run your python dictionary implementation, reason being its easy to test dictionary performance with various hash function, but right now i am facing a problem, i am declaring an object hus=Dict(), now i need to enter key value pair i trid with hus[key]=value and hus.__setitem__(key,value), but in both cases it shows duplicate keys in my dictionary, so wts the proper way to insert in hus dictionary without duplication