Building large matrices faster with coo_matrix in Python

The SciPy python library provides many sparse matrix implementations,  including the dok_matrix and coo_matrix classes. These two formats are typically used for incrementally building a sparse matrix.
A dok_matrix (Dictionary Of Keys) is implemented as a hashtable structure, mapping each (row, column) key to a non-zero value.  A coo_matrix (COOrdinate) consists of three arrays rows, cols and data, containing the row indices, column indices, and the values, such that for the i-th non-zero element,  M[rows[i], cols[i]] = data[i].

Though slightly harder to build, a coo_matrix is much faster than a dok_matrix. In this example, I show how the coo_matrix outperforms the dok_matrix in loading a 8 million entries graph edge-list file in NCOL format, with the coo_matrix being about 7 times faster than the dok_matrix.

```import csv
import numpy as np
from scipy.sparse import coo_matrix, dok_matrix
import timeit

def load_coo_matrix(file_path, separator=" ", transpose=True, default_weight=1.):
with open(file_path) as file:

# data structures for the coo_matrix
data = []
rows = []
cols = []

nodes = 0
for row in iterator:
if row:
source, target = int(row), int(row)
nodes = max(nodes, source, target)
rows.append(source)
cols.append(target)
if len(row) > 2:
weight = float(row)
else:
weight = default_weight

data.append(weight)
nodes += 1

# create and return the coo_matrix
data = np.array(data, copy=False)
rows = np.array(rows, dtype=np.int32, copy=False)
cols = np.array(cols, dtype=np.int32, copy=False)
m = coo_matrix((data, (rows, cols) if not transpose else (cols, rows)), shape=(nodes, nodes))
return m

def load_dok_matrix(file_path, max_dim, separator=" ", transpose=True, default_weight=1.):
with open(file_path) as file:
m = dok_matrix((max_dim, max_dim))

dim = 0
if row:
source, target = int(row), int(row)
dim = max(dim, source, target)
if len(row) > 2:
weight = float(row)
else:
weight = default_weight

if transpose:
m[target, source] = weight
else:
m[[source, target]] = weight

dim += 1
return m[:dim, :dim]

def benchmark_coo_matrix():

def benchmark_dok_matrix():

if __name__ == "__main__":
LOOPS = 3
coo_matrix_time = timeit.timeit("benchmark_coo_matrix()", setup='from __main__ '
'import benchmark_coo_matrix',   number=LOOPS) / LOOPS

print("coo_matrix:", "%.2f" % coo_matrix_time, "sec")

dok_matrix_time = timeit.timeit("benchmark_dok_matrix()", setup='from __main__'
' import benchmark_dok_matrix', number=LOOPS) / LOOPS

print("dok_matrix:", "%.2f" % dok_matrix_time, "sec")
```

Output

```coo_matrix: 35.93 sec
dok_matrix: 266.86 sec```