Monthly Archives: March 2015

Building large matrices faster with coo_matrix in Python

The SciPy python library provides many sparse matrix implementations,  including the dok_matrix and coo_matrix classes. These two formats are typically used for incrementally building a sparse matrix.
A dok_matrix (Dictionary Of Keys) is implemented as a hashtable structure, mapping each (row, column) key to a non-zero value.  A coo_matrix (COOrdinate) consists of three arrays rows, cols and data, containing the row indices, column indices, and the values, such that for the i-th non-zero element,  M[rows[i], cols[i]] = data[i].

Though slightly harder to build, a coo_matrix is much faster than a dok_matrix. In this example, I show how the coo_matrix outperforms the dok_matrix when loading an 8-million-entry graph edge-list file in NCOL format, with the coo_matrix being about 7 times faster than the dok_matrix.

import csv
import numpy as np
from scipy.sparse import coo_matrix, dok_matrix
import timeit

def load_coo_matrix(file_path, separator=" ", transpose=True, default_weight=1.):
    """Load a graph edge list in NCOL format into a scipy.sparse.coo_matrix.

    Each non-empty row is "source target [weight]"; a missing third column
    falls back to ``default_weight``.

    Parameters
    ----------
    file_path : str
        Path to the edge-list file.
    separator : str
        Field delimiter (default: a single space).
    transpose : bool
        If True (default), edge (s, t) is stored at M[t, s] instead of M[s, t].
    default_weight : float
        Weight used when a row has no weight column.

    Returns
    -------
    scipy.sparse.coo_matrix
        A square (n, n) matrix where n is 1 + the largest node id seen.
    """
    # Parallel arrays feeding the coo_matrix constructor.
    rows = []
    cols = []
    data = []
    nodes = 0
    with open(file_path) as fh:
        reader = csv.reader(fh, delimiter=separator)
        for record in reader:
            if not record:  # skip blank lines
                continue
            source, target = int(record[0]), int(record[1])
            nodes = max(nodes, source, target)
            rows.append(source)
            cols.append(target)
            # Optional third column is the edge weight.
            data.append(float(record[2]) if len(record) > 2 else default_weight)
    nodes += 1  # node ids are 0-based, so dimension = max id + 1

    # Use np.asarray instead of np.array(..., copy=False): converting a
    # Python list always requires a copy, and on NumPy >= 2.0 copy=False
    # raises ValueError when a copy is unavoidable.
    data = np.asarray(data)
    rows = np.asarray(rows, dtype=np.int32)
    cols = np.asarray(cols, dtype=np.int32)
    coords = (cols, rows) if transpose else (rows, cols)
    return coo_matrix((data, coords), shape=(nodes, nodes))

def load_dok_matrix(file_path, max_dim, separator=" ", transpose=True, default_weight=1.):
    """Read an NCOL edge list into a dok_matrix, then trim to the used size.

    The matrix is allocated at (max_dim, max_dim) up front and sliced down
    to (n, n), where n is 1 + the largest node id encountered. Rows are
    "source target [weight]"; a missing weight falls back to
    ``default_weight``. With ``transpose`` (the default), edge (s, t) is
    stored at M[t, s].
    """
    with open(file_path) as handle:
        edges = csv.reader(handle, delimiter=separator)
        matrix = dok_matrix((max_dim, max_dim))

        largest = 0
        for record in edges:
            if not record:  # skip blank lines
                continue
            src, dst = int(record[0]), int(record[1])
            largest = max(largest, src, dst)
            weight = float(record[2]) if len(record) > 2 else default_weight
            row, col = (dst, src) if transpose else (src, dst)
            matrix[row, col] = weight

        # Keep only the occupied top-left corner.
        return matrix[:largest + 1, :largest + 1]

def benchmark_coo_matrix():
    """Timing target: load the 8M-edge test file as a coo_matrix."""
    _ = load_coo_matrix("C:/datasets/ff_8m")

def benchmark_dok_matrix():
    """Timing target: load the 8M-edge test file as a dok_matrix."""
    _ = load_dok_matrix("C:/datasets/ff_8m", 8500000)

if __name__ == "__main__":
    LOOPS = 3

    # Average each loader over LOOPS runs; the setup string imports the
    # benchmark function into timeit's execution namespace.
    coo_seconds = timeit.timeit(
        "benchmark_coo_matrix()",
        setup="from __main__ import benchmark_coo_matrix",
        number=LOOPS,
    ) / LOOPS
    print(f"coo_matrix: {coo_seconds:.2f} sec")

    dok_seconds = timeit.timeit(
        "benchmark_dok_matrix()",
        setup="from __main__ import benchmark_dok_matrix",
        number=LOOPS,
    ) / LOOPS
    print(f"dok_matrix: {dok_seconds:.2f} sec")

 

Output

coo_matrix: 35.93 sec
dok_matrix: 266.86 sec