Python bytes: October 2008

For those curious about how CPython's dict implementation works, I've written a Python implementation using the same algorithms. Aside from the education value, it's pretty useless because it doesn't support None as a value and is extremely slow. You can get the source in a Bazaar repo: http://code.python.org/python/users/benjamin.peterson/pydict/
"""
A Python dict implementation.
"""

import collections

MINSIZE = 8
PERTURB_SHIFT = 5
dummy = "<dummy key>"


class Entry(object):
    """
    A hash table entry.

    Attributes:
       * key - The key for this entry.
       * hash - The has of the key.
       * value - The value associated with the key.
    """

    __slots__ = ("key", "value", "hash")

    def __init__(self):
        self.key = None
        self.value = None
        self.hash = 0

    def __repr__(self):
        return "<Entry: key={0} value={1}>".format(self.key, self.value)



class Dict(object):
    """
    A mapping interface implemented as a hash table.

    Attributes:
        * used - The number of entires used in the table.
        * filled - used + number of entries with a dummy key.
        * table - List of entries; contains the actual dict data.
        * mask - Length of table - 1. Used to fetch values.
    """

    __slots__ = ("filled", "used", "mask", "table")


    def __init__(self, arg=None, **kwargs):
        self.clear()
        self._update(arg, kwargs)

    @classmethod
    def fromkeys(cls, keys, value=0):
        """
        Return a new dictionary from a sequence of keys.
        """
        d = cls()
        for key in keys:
            d[key] = value
        return d

    def clear(self):
        """
        Clear the dictionary of all data.
        """
        self.filled = 0
        self.used = 0
        self.mask = MINSIZE - 1
        self.table = []
        # Initialize the table to a clean slate of entries.
        for i in range(MINSIZE):
            self.table.append(Entry())

    def pop(self, *args):
        """
        Remove and return the value for a key.
        """
        have_default = len(args) == 2
        try:
            v = self[args[0]]
        except KeyError:
            if have_default:
                return args[1]
            raise
        else:
            del self[args[0]]
            return v

    def popitem(self):
        """
        Remove and return any key-value pair from the dictionary.
        """
        if self.used == 0:
            raise KeyError("empty dictionary")
        entry0 = self.table[0]
        entry = entry0
        i = 0
        if entry0.value is None:
            # The first entry in the table's hash is abused to hold the index to
            # the next place to look for a value to pop.
            i = entry0.hash
            if i > self.mask or i < i:
                i = 1
            entry = self.table[i]
            while entry.value is None:
                i += 1
                if i > self.mask:
                    i = 1
                entry = self.table[i]
        res = entry.key, entry.value
        self._del(entry)
        # Set the next place to start.
        entry0.hash = i + 1
        return res

    def setdefault(self, key, default=0):
        """
        If key is in the dictionary, return it. Otherwise, set it to the default
        value.
        """
        val = self._lookup(key).value
        if val is None:
            self[key] = default
            return default
        return val

    def _lookup(self, key):
        """
        Find the entry for a key.
        """
        key_hash = hash(key)
        i = key_hash & self.mask
        entry = self.table[i]
        if entry.key is None or entry is key:
            return entry
        free = None
        if entry.key is dummy:
            free = entry
        elif entry.hash == key_hash and key == entry.key:
            return entry

        perturb = key_hash
        while True:
            i = (i << 2) + i + perturb + 1;
            entry = self.table[i & self.mask]
            if entry.key is None:
                return entry if free is None else free
            if entry.key is key or \
                    (entry.hash == key_hash and key == entry.key):
                return entry
            elif entry.key is dummy and free is None:
                free = dummy
            perturb >>= PERTURB_SHIFT

        assert False, "not reached"

    def _resize(self, minused):
        """
        Resize the dictionary to at least minused.
        """
        newsize = MINSIZE
        # Find the smalled value for newsize.
        while newsize <= minused and newsize > 0:
            newsize <<= 1
        oldtable = self.table
        # Create a new table newsize long.
        newtable = []
        while len(newtable) < newsize:
            newtable.append(Entry())
        # Replace the old table.
        self.table = newtable
        self.used = 0
        self.filled = 0
        # Copy the old data into the new table.
        for entry in oldtable:
            if entry.value is not None:
                self._insert_into_clean(entry)
            elif entry.key is dummy:
                entry.key = None
        self.mask = newsize - 1

    def _insert_into_clean(self, entry):
        """
        Insert an item in a clean dict. This is a helper for resizing.
        """
        i = entry.hash & self.mask
        new_entry = self.table[i]
        perturb = entry.hash
        while new_entry.key is not None:
            i = (i << 2) + i + perturb + 1
            new_entry = self.table[i & self.mask]
            perturb >>= PERTURB_SHIFT
        new_entry.key = entry.key
        new_entry.value = entry.value
        new_entry.hash = entry.hash
        self.used += 1
        self.filled += 1

    def _insert(self, key, value):
        """
        Add a new value to the dictionary or replace an old one.
        """
        entry = self._lookup(key)
        if entry.value is None:
            self.used += 1
            if entry.key is not dummy:
                self.filled += 1
        entry.key = key
        entry.hash = hash(key)
        entry.value = value

    def _del(self, entry):
        """
        Mark an entry as free with the dummy key.
        """
        entry.key = dummy
        entry.value = None
        self.used -= 1

    def __getitem__(self, key):
        value = self._lookup(key).value
        if value is None:
            # Check if we're a subclass.
            if type(self) is not Dict:
                # Try to call the __missing__ method.
                missing = getattr(self, "__missing__")
                if missing is not None:
                    return missing(key)
            raise KeyError("no such key: {0!r}".format(key))
        return value

    def __setitem__(self, key, what):
        # None is used as a marker for empty entries, so it can't be in a
        # dictionary.
        assert what is not None and key is not None, \
            "key and value must not be None"
        old_used = self.used
        self._insert(key, what)
        # Maybe resize the dict.
        if not (self.used > old_used and
                self.filled*3 >= (self.mask + 1)*2):
            return
        # Large dictionaries (< 5000) are only doubled in size.
        factor = 2 if self.used > 5000 else 4
        self._resize(factor*self.used)

    def __delitem__(self, key):
        entry = self._lookup(key)
        if entry.value is None:
            raise KeyError("no such key: {0!r}".format(key))
        self._del(entry)

    def __contains__(self, key):
        """
        Check if a key is in the dictionary.
        """
        return self._lookup(key).value is not None

    def __eq__(self, other):
        if not isinstance(other, Dict):
            try:
                # Try to coerce the other to a Dict, so we can compare it.
                other = Dict(other)
            except TypeError:
                return NotImplemented
        if self.used != other.used:
            # They're not the same size.
            return False
        # Look through the table and compare every entry, breaking out early if
        # we find a difference.
        for entry in self.table:
            if entry.value is not None:
                try:
                    bval = other[entry.key]
                except KeyError:
                    return False
                if not bval == entry.value:
                    return False
        return True

    def __ne__(self, other):
        return not self == other

    def keys(self):
        """
        Return a list of keys in the dictionary.
        """
        return [entry.key for entry in self.table if entry.value is not None]

    def values(self):
        """
        Return a list of values in the dictionary.
        """
        return [entry.value for entry in self.table if entry.value is not None]

    def items(self):
        """
        Return a list of key-value pairs.
        """
        return [(entry.key, entry.value) for entry in self.table
                if entry.value is not None]

    def __iter__(self):
        return DictKeysIterator(self)

    def itervalues(self):
        """
        Return an iterator over the values in the dictionary.
        """
        return DictValuesIterator(self)

    def iterkeys(self):
        """
        Return an iterator over the keys in the dictionary.
        """
        return DictKeysIterator(self)

    def iteritems(self):
        """
        Return an iterator over key-value pairs.
        """
        return DictItemsIterator(self)

    def _merge(self, mapping):
        """
        Update the dictionary from a mapping.
        """
        for key in mapping.keys():
            self[key] = mapping[key]

    def _from_sequence(self, seq):
        for double in seq:
            if len(double) != 2:
                raise ValueError("{0!r} doesn't have a length of 2".format(
                        double))
            self[double[0]] = double[1]

    def _update(self, arg, kwargs):
        if arg:
            if isinstance(arg, collections.Mapping):
                self._merge(arg)
            else:
                self._from_sequence(arg)
        if kwargs:
            self._merge(kwargs)

    def update(self, arg=None, **kwargs):
        """
        Update the dictionary from a mapping or sequence containing key-value
        pairs. Any existing values are overwritten.
        """
        self._update(arg, kwargs)

    def get(self, key, default=0):
        """
        Return the value for key if it exists otherwise the default.
        """
        try:
            return self[key]
        except KeyError:
            return default

    def __len__(self):
        return self.used

    def __repr__(self):
        r = ["{0!r} : {1!r}".format(k, v) for k, v in self.iteritems()]
        return "Dict({" + ", ".join(r) + "})"

collections.Mapping.register(Dict)


class DictIterator(object):

    def __init__(self, d):
        self.d = d
        self.used = self.d.used
        self.len = self.d.used
        self.pos = 0

    def __iter__(self):
        return self

    def next(self):
        # Check if the dictionary has been mutated under us.
        if self.used != self.d.used:
            # Make this state permanent.
            self.used = -1
            raise RuntimeError("dictionary size changed during interation")
        i = self.pos
        while i <= self.d.mask and self.d.table[i].value is None:
            i += 1
        self.pos = i + 1
        if i > self.d.mask:
            # We're done.
            raise StopIteration
        self.len -= 1
        return self._extract(self.d.table[i])

    __next__ = next

    def _extract(self, entry):
        return getattr(entry, self.kind)

    def __len__(self):
        return self.len

class DictKeysIterator(DictIterator):
    kind = "key"

class DictValuesIterator(DictIterator):
    kind = "value"

class DictItemsIterator(DictIterator):

    def _extract(self, entry):
        return entry.key, entry.value
Sunday, October 19, 2008

Pure Python Dictionary Implementation

Monday, October 13, 2008

First impressions of darcs

Wednesday, October 1, 2008

Python 2.6 released!

Blog Archive