Friday 13 June 2008

Sparse matrices, SciPy and PyTables

The sparse module in SciPy is useful for dealing efficiently with large sparse matrices in Python. There are currently five formats tailored for different purposes: while constructing the matrix it's recommended to use the linked list (LIL) representation; when it's ready to use, we convert the matrix to either compressed sparse column (CSC) or compressed sparse row (CSR) formats. These latter two allow more efficient calculations (matrix-vector products, linear algebra etc.) but only permit slow updates.

I'm working with some very large, very sparse matrices that I'd like to dump to disk to avoid refreshing each time from a database. CSC matrices can be saved using scipy.io.savemat which is fine, but if you need to make further adjustments after loading it's extremely slow to convert these large matrices back to LIL format. The following code might help anyone in this position; I'm using PyTables to render the guts of a LIL representation to disk, allowing restoration directly to a LIL matrix.

def save ( D, fname ):
"""
Save sparse matrix D of CSC, CSR or LIL format
"""
import tables as pt

fd = pt.openFile ( fname, mode = 'w' )

try:
info = fd.createGroup ( '/', 'info' )
fd.createArray ( info, 'dtype', D.dtype.str )
fd.createArray ( info, 'shape', D.shape )
fd.createArray ( info, 'format', D.format )

data = fd.createGroup ( '/', 'data' )
if D.format in [ 'csc', 'csr' ]:
fd.createArray ( data, 'data', D.data )
fd.createArray ( data, 'indptr', D.indptr )
fd.createArray ( data, 'indices', D.indices )
elif D.format in [ 'lil' ]:
vld = fd.createVLArray ( data, 'data',
pt.Float64Atom(),
expectedsizeinMB = 1000 )
vlr = fd.createVLArray ( data, 'rows',
pt.UInt32Atom(),
expectedsizeinMB = 1000 )
for u in xrange ( D.shape [ 0 ] ):
vld.append ( D.data [ u ] )
vlr.append ( D.rows [ u ] )
else:
print D.format
raise ValueError

except:
print 'Matrix not in CSR/CSC/LIL format ...'
fd.close()
raise

fd.close()


def load ( fname ):
"""
Load sparse matrix of CSC, CSR or LIL format
"""
import tables as pt

builds = { 'csr' : sparse.csr_matrix,
'csc' : sparse.csc_matrix,
'lil' : sparse.lil_matrix }

fd = pt.openFile ( fname, mode = "r" )
info = fd.root.info
data = fd.root.data

format = info.format.read()
if not isinstance ( format, str ):
format = format [ 0 ]

dtype = info.dtype.read()
if not isinstance ( dtype, str ):
dtype = dtype [ 0 ]

build = builds [ format ]

if format in [ 'csc', 'csr' ]:
D = build ( ( data.data.read(),
data.indices.read(),
data.indptr.read() ),
dims = info.shape.read(),
dtype = dtype )
elif format in [ 'lil' ]:
D = build ( info.shape.read() )
D.data = array ( data.data.read(),
dtype='object' )
D.rows = array ( data.rows.read(),
dtype='object' )
else:
print format
fd.close()
raise ValueError

fd.close()
return D

1 comment:

Edward said...

This is a really cool solution to a problem I've been having with optimally dealing with and storing some fairly large sparse vectors. Thanks for sharing!