Python example

This example script reads in a comma separated values file (example file: helmi200.csv) and outputs it to an HDF5 file that can be read by vaex. Since writing the rows individually is quite slow, the rows are written in batches.

Example file: helmi200.csv

# -*- coding: utf-8 -*-
import h5py
import sys
import numpy

# Convert the CSV file given as first command line argument to "example.hdf5".
#
# The first line of the CSV file must hold the column names; every other
# non-blank line must hold one float per column. Each column becomes a float64
# dataset in the "columns" group of the HDF5 file.
#
# All print calls take a single pre-formatted string, so the script runs
# unchanged on both Python 2 and Python 3.

# Number of rows buffered in memory before they are written out in one go;
# writing rows to the HDF5 datasets one at a time is very slow.
Nbatch = 10000

# the context managers make sure both files are flushed and closed, also on errors
with h5py.File("example.hdf5", "w") as h5file, open(sys.argv[1]) as csv_file:
    # vaex reads all datasets in the columns group
    h5columns = h5file.create_group("columns")

    # first count the rows, start at -1 since the first line is assumed to
    # contain the column names; blank lines do not count as rows
    line_count = -1
    for line in csv_file:
        if line.strip():
            line_count += 1
    if line_count < 0:
        sys.exit("%s is empty, expected a first line with the column names" % sys.argv[1])

    print("file contains %d rows" % line_count)

    csv_file.seek(0)  # start from the beginning of the file again
    lines = iter(csv_file)  # explicitly create an iterator over the lines

    # first line should contain the column names
    header = next(lines)
    columns = header.strip().split(",")
    print("columns %s" % (columns,))

    # assume all values are floats; every column gets its on-disk dataset and
    # an in-memory buffer that holds one batch of rows
    h5_datasets = []
    numpy_arrays = []
    for column_name in columns:
        dataset = h5columns.create_dataset(column_name, (line_count, ), dtype='f8')
        h5_datasets.append(dataset)
        numpy_arrays.append(numpy.zeros((Nbatch, ), dtype='f8'))

    row = 0          # number of data rows read so far
    batch_start = 0  # row number of the first row in the current batch
    # we read in Nbatch lines at a time, and then write them out
    for line in lines:
        if not line.strip():
            continue  # skip blank lines, they were not counted either
        # convert line to a series of float values
        values = [float(value) for value in line.split(",")]
        index = row - batch_start
        for i, array in enumerate(numpy_arrays):
            array[index] = values[i]
        row += 1
        # flush only after the buffer is completely filled, so that no value
        # is overwritten before it has been written to disk
        if row - batch_start == Nbatch:
            print("at %d of %d" % (row, line_count))
            # write out the arrays to disk
            for dataset, array in zip(h5_datasets, numpy_arrays):
                dataset[batch_start:row] = array
            batch_start = row

    if row > batch_start:
        print("writing out last part")
        for dataset, array in zip(h5_datasets, numpy_arrays):
            dataset[batch_start:row] = array[:row - batch_start]

IDL example

; Convert an ascii table with the three columns E, L and Lz to an HDF5 file
; that vaex can read. Each column becomes a double precision dataset in the
; group "columns".
; This is a main-level program (it uses BEGIN ... END blocks): save it to a
; file and execute it with .RUN
PRINT, 'convert ascii file to hdf5'
testfile = '/Users/users/breddels/gavi/src/SubspaceFinding/data/helmi2000.asc'
h5file_id = H5F_CREATE('/tmp/test.hdf5')

N = 3300000L ; nr of rows

h5group_columns = H5G_CREATE(h5file_id, "columns") ; for vaex, all columns should be grouped under columns
h5type_id = H5T_IDL_CREATE(1.0d) ; create double datatype
h5data_id = H5S_CREATE_SIMPLE(N)

h5_E = H5D_CREATE(h5group_columns, 'E', h5type_id, h5data_id)
h5_L = H5D_CREATE(h5group_columns, 'L', h5type_id, h5data_id)
h5_Lz = H5D_CREATE(h5group_columns, 'Lz', h5type_id, h5data_id)

; file dataspace, used to select the single element that is written per row
dataspace = H5D_GET_SPACE(h5_E)
; memory dataspace of one element; created once instead of once per row
memory_space_id = H5S_CREATE_SIMPLE([1])

; READF reads into undefined variables as single precision floats, so define
; the variables as doubles beforehand to keep full precision
E = 0.0d
L = 0.0d
Lz = 0.0d

OPENR, 1, testfile

index = 0L
; read one row at a time and write each value to its dataset; stop at the
; end of the file even when it holds fewer than N rows
WHILE (index LT N) && (~EOF(1)) DO BEGIN
  READF, 1, E,L,Lz
  if (index MOD 100000) EQ 0 then  begin
    print, index, ' of',N 
  endif
  H5S_SELECT_HYPERSLAB, dataspace, [index], [1], stride=[1], /RESET
  H5D_WRITE, h5_E, [E], MEMORY_SPACE_ID=memory_space_id,  FILE_SPACE_ID=dataspace
  H5D_WRITE, h5_L, [L], MEMORY_SPACE_ID=memory_space_id,  FILE_SPACE_ID=dataspace
  H5D_WRITE, h5_Lz, [Lz], MEMORY_SPACE_ID=memory_space_id,  FILE_SPACE_ID=dataspace
  index = index + 1
ENDWHILE

; release the input file and all HDF5 handles before closing the file
CLOSE, 1
H5S_CLOSE, memory_space_id
H5S_CLOSE, dataspace
H5D_CLOSE, h5_E
H5D_CLOSE, h5_L
H5D_CLOSE, h5_Lz
H5S_CLOSE, h5data_id
H5T_CLOSE, h5type_id
H5G_CLOSE, h5group_columns
H5F_CLOSE, h5file_id
END

C example

compile as: gcc -Wall -std=c99 -o ascii_to_hdf5 ascii_to_hdf5.c -lhdf5
run as: ./ascii_to_hdf5 example.hdf5 ../../data/helmi2000-header.asc 3300000 3
	arguments are: output filename, input filename, rows, columns

#include "hdf5.h"
#include <stdio.h>
#include <stdlib.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/stat.h>
#include <sys/mman.h> 
#include <stdlib.h>
#include <stdarg.h>
#include <errno.h>
#include <string.h>
#define MAX_COLUMNS 512

char column_names[MAX_COLUMNS][512];

/*
 * Abort the program with an error message when a test fails.
 *
 * test:    non-zero when the error occurred, zero when everything is fine
 * message: printf style format string, followed by its arguments
 *
 * When test is non-zero the formatted message and a newline are written to
 * stderr and the process exits with EXIT_FAILURE; otherwise nothing happens.
 */
static void
check (int test, const char * message, ...)
{
	if (test) {
		va_list args;
		va_start (args, message);
		vfprintf (stderr, message, args);
		va_end (args);
		fprintf (stderr, "\n");
		exit (EXIT_FAILURE);
	}
}

int main(int argc, char *argv[])
	hid_t		file;    /* Handles */
	herr_t		status;
	haddr_t		offsets[MAX_COLUMNS];
	hsize_t		dims[1];
	char* filename_output = argv[1];
	char* filename_input = argv[2];
	FILE* file_input = fopen(filename_input, "r");

	int no_rows = atoi(argv[3]);
	int no_columns = atoi(argv[4]);
	dims[0] = no_rows;

	// create the file and the group 'columns', which vaex will expect
	file = H5Fcreate(filename_output, H5F_ACC_TRUNC, H5P_DEFAULT, H5P_DEFAULT);
	hid_t group = H5Gcreate1(file, "columns", 0);
	// find the column names in the first line
	for(int i=0; i<no_columns; i++) {
		fscanf(file_input," %s", column_names[i]);
		printf("column[%d]: %s\n", i, column_names[i]);
	fscanf(file_input," \n");

	// just create the dataspace using the HDF5 library, and ask for the offset from the beginning of the file
	for(int i = 0; i < no_columns; i++)  {
		hid_t space = H5Screate_simple(1, dims, NULL);

		hid_t dcpl = H5Pcreate (H5P_DATASET_CREATE);
		H5Pset_layout (dcpl, H5D_CONTIGUOUS); // compact allows us the memory map the file
		H5Pset_alloc_time(dcpl, H5D_ALLOC_TIME_EARLY); // need this to allocate the space so offset exists
		hid_t dset = H5Dcreate(group, column_names[i], H5T_IEEE_F64LE, space, H5P_DEFAULT, dcpl, H5P_DEFAULT);
		offsets[i] = H5Dget_offset(dset);
		H5D_space_status_t space_status;
		H5Dget_space_status(dset, &space_status);
		printf("offset[%d] = %x allocated: %s\n", i, (unsigned int)offsets[i], (space_status == H5D_SPACE_STATUS_ALLOCATED ? "yes" : "no"));

		status = H5Dclose (dset);
		status = H5Pclose (dcpl);
		status = H5Sclose (space);
	//close the group and file
	status = H5Fclose (file);
	// now we can simpy memory map the file (meaning we tread the file as one big 'array'
	// the offsets will tell us where we can write the columns
	struct stat s;
	status = stat(filename_output,  &s);
	check (status < 0, "stat %s failed: %s", filename_output, strerror (errno));
	printf("file size: %lld\n", (unsigned long long)s.st_size);
	int fd = open(filename_output, O_RDWR);
	check (fd < 0, "open %s failed: %s", filename_output, strerror (errno));
	// the mapped pointer points to the beginning of the file
	char* mapped = mmap (0, s.st_size, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0);
	check (mapped == MAP_FAILED, "mmap %s failed: %s",
           filename_output, strerror (errno));

	// read in the rows, and directly write them to the file
	for(int j=0; j<no_rows; j++) {
		for(int i=0; i<no_columns; i++) {
			double* column_ptr = (double*)(mapped+offsets[i]);
			fscanf(file_input," %lf", &column_ptr[j]);
		if( ((j % 100000) == 0) & (j > 0) )
			printf("%d of %d\n", j, no_rows);