I'm working with some very large, very sparse matrices that I'd like to dump to disk to avoid refreshing them from a database each time. CSC matrices can be saved using scipy.io.savemat, which is fine, but if you need to make further adjustments after loading, it's extremely slow to convert these large matrices back to LIL format. The following code might help anyone in this position: I'm using PyTables to render the guts of a LIL representation to disk, allowing restoration directly to a LIL matrix.
def save(D, fname):
    """
    Save sparse matrix D of CSC, CSR or LIL format to a PyTables file.

    The file gets two groups: ``/info`` (arrays ``dtype``, ``shape`` and
    ``format``) and ``/data``.  For CSC/CSR, ``/data`` holds the ``data``,
    ``indptr`` and ``indices`` arrays; for LIL it holds two variable-length
    arrays, ``data`` and ``rows``, with one entry per matrix row.

    D     -- matrix exposing ``format``, ``dtype``, ``shape`` and the
             format-specific storage attributes used below.
    fname -- path of the file to create (opened with mode ``'w'``).

    Raises ValueError if ``D.format`` is not 'csc', 'csr' or 'lil'.  Any
    other error (e.g. from PyTables) propagates unchanged; the file handle
    is closed in every case.
    """
    fmt = D.format
    # Validate before opening anything so that an unsupported matrix does
    # not leave a half-written file behind.
    if fmt not in ('csc', 'csr', 'lil'):
        raise ValueError('Matrix not in CSR/CSC/LIL format: %r' % (fmt,))

    import tables as pt

    # NOTE(review): openFile/createGroup/createArray/createVLArray are the
    # camelCase PyTables names this file was written against -- confirm
    # they match the installed PyTables version.
    fd = pt.openFile(fname, mode='w')
    try:
        info = fd.createGroup('/', 'info')
        fd.createArray(info, 'dtype', D.dtype.str)
        fd.createArray(info, 'shape', D.shape)
        fd.createArray(info, 'format', fmt)

        data = fd.createGroup('/', 'data')
        if fmt == 'lil':
            # Store values with the matrix's own dtype (as the CSC/CSR
            # branch does) instead of forcing everything to float64.
            vld = fd.createVLArray(data, 'data',
                                   pt.Atom.from_dtype(D.dtype),
                                   expectedsizeinMB=1000)
            vlr = fd.createVLArray(data, 'rows',
                                   pt.UInt32Atom(),
                                   expectedsizeinMB=1000)
            # One variable-length entry per matrix row, in row order.
            for u, columns in enumerate(D.rows):
                vld.append(D.data[u])
                vlr.append(columns)
        else:
            # 'csc' / 'csr': three flat arrays describe the whole matrix.
            fd.createArray(data, 'data', D.data)
            fd.createArray(data, 'indptr', D.indptr)
            fd.createArray(data, 'indices', D.indices)
    finally:
        # Single cleanup point for both the success and the failure path.
        fd.close()
def _lil_rows(chunks, dtype=None):
    """Pack per-row sequences into a 1-D object array of Python lists."""
    import numpy as np

    # Fill a pre-sized object array element by element: calling
    # array(chunks, dtype='object') directly yields a 2-D array whenever
    # every row happens to have the same length.
    packed = np.empty(len(chunks), dtype=object)
    for i, chunk in enumerate(chunks):
        packed[i] = np.asarray(chunk, dtype=dtype).tolist()
    return packed


def load(fname):
    """
    Load sparse matrix of CSC, CSR or LIL format written by ``save``.

    Reads ``/info`` (``format``, ``dtype``, ``shape``) and ``/data`` from the
    PyTables file ``fname`` and rebuilds the matrix with the matching
    ``sparse`` constructor.  LIL matrices are restored directly from their
    per-row ``data`` / ``rows`` entries, without going through CSC/CSR.

    Returns the reconstructed sparse matrix.

    Raises ValueError if the stored format is not 'csc', 'csr' or 'lil'.
    The file handle is closed whether or not loading succeeds.
    """
    import tables as pt

    builds = {'csr': sparse.csr_matrix,
              'csc': sparse.csc_matrix,
              'lil': sparse.lil_matrix}

    fd = pt.openFile(fname, mode="r")
    try:
        info = fd.root.info
        data = fd.root.data

        # read() may hand back a container instead of a plain string; in
        # that case the first element is the value.
        fmt = info.format.read()
        if not isinstance(fmt, str):
            fmt = fmt[0]
        dtype = info.dtype.read()
        if not isinstance(dtype, str):
            dtype = dtype[0]

        # Check explicitly so an unknown format raises ValueError rather
        # than a KeyError from the dictionary lookup.
        if fmt not in builds:
            raise ValueError('Unsupported sparse format in file: %r' % (fmt,))
        build = builds[fmt]

        shape = tuple(int(n) for n in info.shape.read())

        if fmt == 'lil':
            D = build(shape, dtype=dtype)
            # Values are cast to the stored dtype so the result matches
            # the matrix that was saved.
            D.data = _lil_rows(data.data.read(), dtype)
            D.rows = _lil_rows(data.rows.read())
        else:
            # 'csc' / 'csr'
            D = build((data.data.read(),
                       data.indices.read(),
                       data.indptr.read()),
                      shape=shape,
                      dtype=dtype)
    finally:
        fd.close()
    return D
1 comment:
This is a really cool solution to a problem I've been having with optimally dealing with and storing some fairly large sparse vectors. Thanks for sharing!
Post a Comment