rhumba of rattlesnakes: Sparse matrices, SciPy and PyTables

The sparse module in SciPy is useful for dealing efficiently with large sparse matrices in Python. There are currently five formats tailored for different purposes: while constructing the matrix it's recommended to use the linked list (LIL) representation; when it's ready to use, we convert the matrix to either compressed sparse column (CSC) or compressed sparse row (CSR) formats. These latter two allow more efficient calculations (matrix-vector products, linear algebra etc.) but only permit slow updates.

I'm working with some very large, very sparse matrices that I'd like to dump to disk to avoid refreshing each time from a database. CSC matrices can be saved using scipy.io.savemat which is fine, but if you need to make further adjustments after loading it's extremely slow to convert these large matrices back to LIL format. The following code might help anyone in this position; I'm using PyTables to render the guts of a LIL representation to disk, allowing restoration directly to a LIL matrix.

def save ( D, fname ):
  """
  Save sparse matrix D of CSC, CSR or LIL format
  """
  import tables as pt

  fd = pt.openFile ( fname, mode = 'w' )

  try:
      info = fd.createGroup ( '/', 'info' )
      fd.createArray ( info, 'dtype', D.dtype.str )
      fd.createArray ( info, 'shape', D.shape )
      fd.createArray ( info, 'format', D.format )
    
      data = fd.createGroup ( '/', 'data' )
      if D.format in [ 'csc', 'csr' ]:
          fd.createArray ( data, 'data', D.data )
          fd.createArray ( data, 'indptr', D.indptr )
          fd.createArray ( data, 'indices', D.indices )
      elif D.format in [ 'lil' ]:
          vld = fd.createVLArray ( data, 'data',
                               pt.Float64Atom(),
                        expectedsizeinMB = 1000 )
          vlr = fd.createVLArray ( data, 'rows',
                                pt.UInt32Atom(),
                        expectedsizeinMB = 1000 )
      for u in xrange ( D.shape [ 0 ] ):
          vld.append ( D.data [ u ] )
          vlr.append ( D.rows [ u ] )
      else:
          print D.format
          raise ValueError
    
  except:
      print 'Matrix not in CSR/CSC/LIL format ...'
      fd.close()
      raise

  fd.close()


def load ( fname ):
  """
  Load sparse matrix of CSC, CSR or LIL format
  """
  import tables as pt

  builds = { 'csr' : sparse.csr_matrix,
             'csc' : sparse.csc_matrix,
             'lil' : sparse.lil_matrix }

  fd = pt.openFile ( fname, mode = "r" )
  info = fd.root.info
  data = fd.root.data

  format = info.format.read()
  if not isinstance ( format, str ):
      format = format [ 0 ]
    
  dtype = info.dtype.read()
  if not isinstance ( dtype, str ):
      dtype = dtype [ 0 ]
    
  build = builds [ format ]

  if format in [ 'csc', 'csr' ]:
      D = build ( ( data.data.read(),
                    data.indices.read(),
                    data.indptr.read() ),
                  dims = info.shape.read(),
                  dtype = dtype )
  elif format in [ 'lil' ]:
      D = build ( info.shape.read() )
      D.data = array ( data.data.read(),
                       dtype='object' )
      D.rows = array ( data.rows.read(),
                       dtype='object' )
  else:
      print format
      fd.close()
      raise ValueError

  fd.close()
  return D

rhumba of rattlesnakes

Friday, 13 June 2008

Sparse matrices, SciPy and PyTables

1 comment:

Herein

Archive