function      [data, result]= readtext(text, delimiter, comment, quotes, options)

%  Usage:     [data, result]= readtext(source, delimiter, comment, quotes, options)
% 
% Whatever text (file) you give it, readtext returns an array of the contents (or send me a 
%   bug report). Matlab can't read variable length lines or variable type values with the standard 
%   library. readtext can read any text file. Any string (or even regexp) can be delimiting, 
%   default is a comma. Everything after (and including) a comment character, until the line end, 
%   is ignored. Quote characters may also be given, everything between them is treated as one item. 
%   There are options to control what will be converted to numbers and how empty items are saved. 
% 
% If you find any errors, please let me know: peder at axensten dot se.
% 
% source:     the file to be read. May be a file path or just the file name.
%         OR: The text itself, see 'textsource', below. 
% 
% delimiter:  (default: ',') any non-empty string. May be a regexp, but this is slow on large files. 
% 
% comment:    (default: '') zero or one character. Anything after (and including) this character, 
%   until the end of the line, will be ignored. 
% 
% quotes:     (default: '') zero, one (opening quote equals closing), or two characters (opening 
%   and closing quote) to be treated as paired braces. Everything between the quotes will be 
%   treated as one item. The quotes will remain. Quotes may be nested.
% 
% options:    (default: '') may contain (concatenate combined options): 
% - 'textsource': source contains the actual text to be processed, not the file name.
% - 'textual': no numeric conversion ('data' is a cell array of strings only), 
% - 'numeric': everything is converted to a number or NaN ('data' is a numeric array, empty items 
%   are converted to NaNs unless 'empty2zero' is given), 
% - 'empty2zero': an empty field is saved as zero, and 
% - 'empty2NaN': an empty field is saved as NaN. 
% - 'usewaitbar': call waitbar to report progress. If you find the wait bar annoying, get 'waitbar 
%   alternative' at http://www.mathworks.com/matlabcentral/fileexchange/loadFile.do?objectId=11398
% 
% data:       A cell array containing the read text, divided into cells by delimiter and line 
%   endings. 'data' will be empty if the file is not found, could not be opened, or is empty. 
%   With the option 'numeric', 'data' will be a numeric array, with 'textual', 'data' will be a 
%   cell array of strings only, and otherwise it will be a mixed cell array. For Matlab < version 7, 
%   returned strings may contain leading white-space.
% 
% result:     a structure:
% .min: minimum number of columns found in a line.
% .max: number of columns in 'data', before removing empty columns.
% .rows: number of rows in 'data', before removing empty rows. 
% .numberMask: true, if numeric conversion ('NaN' converted to NaN counts).
% .number: number of numeric conversions ('NaN' converted to NaN counts).
% .emptyMask: true, if empty item in file.
% .empty: number of empty items in file.
% .stringMask: true, if non-number and non-empty.
% .string: number of non-number, non-empty items.
% .quote: number of quotes. 
% 
% EXAMPLE 1:  [a,b]= readtext('txtfile', '[,\t]', '#', '"', 'numeric-empty2zero')
% This will load the file 'txtfile' into variable a, treating any of tab or comma as
%   delimiters. Everything from and including # to the next newline will be ignored. 
%   Everything between two double quotes will be treated as a string. Everything will 
%   be converted to numbers and a numeric array returned. Non-numeric items will become 
%   NaNs and empty items are converted to zero. 
% 
% EXAMPLE 2:  a= readtext('The, actual, text, to, process', ',', '', '', 'textsource')
% This will process the actual text string, returning a cell string of the five words. 
% 
% COPYRIGHT (C) Peder Axensten (peder at axensten dot se), 2006-2007.

% INSPIRATION: loadcell.m (id 1965). The goal of readtext is to be at least as flexible (you be 
%   the judge) and quicker. On my test file (see below), readtext is about 3--4 times 
%   as quick, maybe even more on large files. In readtext you may use a regexp as 
%   delimiter and it can ignore comments in the text file. 
% 
% SPEED:      Reading a 1MB file (150000 items!) with 'numeric' takes about 100 seconds on a 
%   fairly slow system. Time scales approximately linearly with input file size. 
% - Conversion from string to numeric is slow (I can't do anything about this), but using the 
%   option 'textual' is a lot quicker (above case takes 12 seconds).
% - Using a regexp delimiter is slower (during initializing), it adds 250 seconds! 
% 
% HISTORY:
% Version 1.0, 2006-05-03.
% Version 1.1, 2006-05-07:
% - Made 'options' case independent. 
% - Removed result.errmess -- now use error(errmess) instead. 
% - Removed result.nan -- it was equivalent to result.string, so check this instead.
% - Added some rows', 'result' fields: 'numberMask', 'emptyMask', and 'stringMask' 
%   (see 'result:', above).
% - A few small bug fixes.
% Version 1.2, 2006-06-06:
% - Now works in Matlab 6.5.1 (R13SP1) (maybe 6.5 too), versions <6.5 will NOT work.
% Version 1.3, 2006-06-20:
% - Better error report when file open fails. 
% - Somewhat quicker. 
% - Recommends 'waitbar alternative'. Ok with Matlab orig. waitbar too, of course. 
% Version 1.4, 2006-07-14:
% - Close waitbar instead of deleting it, and some other minor waitbar compatibility fixes. 
% Version 1.5, 2006-08-13:
% - No more (La)TeX formatting of file names. 
% - Prefixed waitbar messages with '(readtext)'. 
% Version 1.6, 2006-10-02:
% - Better removal of comments. Could leave an empty first row before. 
% - Added a 'usewaitbar' option. 
% - Now removes empty last columns and rows. 
% Version 1.7, 2006-01-04:
% - Quicker in 'mixed' and 'numeric' modes. It's still the numeric conversion that's slow. 
% - Made newline handling more robust. 
% - Made numeric conversion more robust (now ignores leading space chars). 
% - Now emits an error if delimiter is empty.
% - Added some stuff to the help text.
% - Simplified code somewhat. 
% Version 1.8, 2007-03-08:
% - Fixed a problem when the comment character was a regexp character, such as '*'. 
% - Fixed a problem when reading files with no data. 
% Version 1.9, 2007-03-26:
% - Can now handle a string directly, not just files. See 'textsource', in help readtext.
% - Improved compatibility with Matlab 6.x regarding waitbar calls. 
% - Fixed a bug where readtext failed to recognize a quote if there was only one in the text.
% - Fixed a bug when processing source of delimiters only.
% - Help text: added an example, edited some in general.
% 
% TO DO:
% - Add result.quoteMask.
% - Add 'removeemptycolumns' and 'removeemptyrows' options. 
% - Use optional id/value pairs for options:
%   * 'Delimiter', <string>                  (default is ',')
%   * 'RegExpDelimiter', <regexp>            (default is '')
%   * 'CommentChar', <string>                (default is '')
%   * 'Quote', ''/<one char>/<two chars>     (default is '')
%   * 'Output', 'Mixed'/'Textual'/'Numeric'  (default is 'Mixed')
%   * 'EmptyNum', NaN/Inf/-Inf/<number>      (default is NaN for 'mixed' and '' for 'textual')
%   * 'WaitBar', 'off'/'on'                  (default is 'off')
%   'Delimiter' and 'RegExpDelimiter' are exclusive. 

% KEYWORDS:     import, read, load, text, delimited, cell, numeric, array, flexible
% 
% REQUIREMENTS: Matlab 7.0.1 (R14SP1).

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
	
	% Read (or set to default) the input arguments.
	if((nargin < 1) || ~ischar(text))		% Is there a file name?
		error('First argument must be the source!'); 
	end
	opts.delimiter=		',';									% Default delimiter value.
	opts.comment=		'';										% Default comment value.
	opts.quotes=		'';										% Default quotes value.
	opts.file=			true;									% Default is file name in source.
	opts.format=		'mixed';								% Default format value.
	opts.op_empty=		[];										% Ignore empties. 
	opts.fname=			'text';
	if(nargin >= 2), opts.delimiter=	delimiter;		end
	if(nargin >= 3), opts.comment=		comment;		end
	if(nargin >= 4), opts.quotes=		quotes;			end
	if(nargin >= 5), options=			options;
	else			 options=			'';
	end
	
	if    (~ischar(opts.delimiter) || isempty(opts.delimiter))
		error('Argument ''delimiter'' must be a non-empty string.');
	elseif(~ischar(opts.comment)   || (length(opts.comment) > 1))
		error('Argument ''comment'' must be a string of maximum one character.');
	elseif(~ischar(opts.quotes)    || (length(opts.quotes) > 2))
		error('Argument ''quotes'' must be a string of maximum two characters.');
	elseif(~ischar(options)   )
		error('Argument ''options'' must be a string.');
	end
	mywaitbar=			@emptywaitbar;							% Default is using no waitbar ...
	th=					[];										% ... so empty waitbar handle.
	options=			lower(options);
	if(~isempty(strfind(options, 'textsource'))),	opts.file= false;			end
	if(~isempty(strfind(options, 'numeric'))),		opts.format= 'numeric';		end
	if(~isempty(strfind(options, 'textual'))),		opts.format= 'textual';		end
	if(~isempty(strfind(options, 'empty2zero'))),	opts.op_empty= 0;	% Replace by zero
	elseif(strcmp(opts.format, 'numeric') || ~isempty(strfind(options, 'empty2nan')))
													opts.op_empty= NaN;	% Replace by NaN.
	end
	if(strcmp(opts.format, 'textual')), opts.op_empty= num2str(opts.op_empty);	end	% Textual 'empty'.
	
	% Set the default return values.
	result.min=		Inf;
	result.max=		0;
	result.quote=	0;
	
	% Read the file.
	if(opts.file)
		opts.fname=		text;
		[fid, errmess]=	fopen(text, 'r');							% Open the file.
		if(fid < 0), error(['Trying to open ''' opts.fname ''': ' errmess]); end
		text=			transpose(fread(fid, 'uchar=>char'));		% Read the file.
		fclose(fid);												% Close the file.
	end
	
	if(~isempty(strfind(options, 'usewaitbar')))
		mywaitbar=		@waitbar;
		th= 			mywaitbar(0, '(readtext) Initialising...');% Show waitbar.
		set(findall(th, '-property', 'Interpreter'), 'Interpreter', 'none');% No (La)TeX formatting. 
	end
	
	% Clean up the text.
	eol=			char(10);									% Using unix-style eol. 
	text=			strrep(text, [char(13) char(10)], eol);		% Replace Windows-style eol.
	text=			strrep(text, char(13), eol);				% Replace MacClassic-style eol.
	if(~isempty(opts.comment))									% Remove comments.
		text=	regexprep(text, ['^\' opts.comment '[^' eol ']*' eol], '');	% Remove entire commented lines. 
		text=	regexprep(text, [ '\' opts.comment '[^' eol ']*'], '');		% Remove commented line endings. 
	end
	if(isempty(text) || text(end) ~= eol),	text= [text eol];	end	% End string with eol, if none.
	
	% Find column and row dividers.
	opts.delimiter=		strrep(opts.delimiter, '\t', char( 9));	% Convert to one char, quicker?
	opts.delimiter=		strrep(opts.delimiter, '\n', char(10));
	opts.delimiter=		strrep(opts.delimiter, '\r', char(13));
	opts.delimiter=		strrep(opts.delimiter, '\f', char(12));
	opts.delimiter=		strrep(opts.delimiter, [char(13) char(10)], eol);% Replace Windows-style eol.
	opts.delimiter=		strrep(opts.delimiter, char(13), eol);	% Replace MacClassic-style eol.
	if(1 == length(opts.delimiter))								% Find column dividers quickly.
		delimS=		find((text == opts.delimiter) | (text == eol));
		delimE=		delimS;
	elseif(isempty(regexp(opts.delimiter, '[\+\*\?\|\[\^\$<>\.\\]', 'once'))) % Find them rather quickly.
		delimS=		strfind(text, opts.delimiter);
		eols=		find(text == eol);
		delimE=		union(eols, delimS + length(opts.delimiter) - 1);
		delimS=		union(eols, delimS);
	else														% Find them with regexp.
		[delimS, delimE]=	regexp(text, [opts.delimiter '|' eol]);
	end
	divRow=			[0, find(text == eol)];						% Find row dividers+last.
	
	% Keep quoted text together.
	if(~isempty(opts.quotes))									% Should we look for quotes?
		if((length(opts.quotes) == 1) || (opts.quotes(1) == opts.quotes(2)))	% Opening char == ending.
			exclE=			find(text == opts.quotes(1));
			exclS=			exclE(1:2:end);
			exclE=			exclE(2:2:end);
		else													% Opening char ~= closing.
			exclS=			find(text == opts.quotes(1));
			exclE=			find(text == opts.quotes(2));
		end
		if((length(exclS) ~= length(exclE)) || any(exclS > exclE))
			close(th);											% Close waitbar or it'll linger.
			error('Opening and closing quotes don''t match in %s.', opts.fname); 
		elseif(~isempty(exclS))									% We do have quoted text.
			mywaitbar(0, th, '(readtext) Doing quotes...');		% Inform user.
			r=		1;
			rEnd=	length(exclS);
			n=		1;
			nEnd=	length(delimS);
			result.quote=	rEnd;
			exclS(end+1)=	0;	% This and next lines are needed in cases with only one quote in text.
			exclE(end+1)=	0;
			while((n <= nEnd) && (r <= rEnd)) % "Remove" delimiters and newlines within quotes.
				while((r <= rEnd) && (delimS(n) > exclE(r))), r= r+1;	end	% Next end-quote after newline.
				while((n <= nEnd) && (delimS(n) < exclS(r))), n= n+1;	end	% Next newline after strart-quote.
				while((n <= nEnd) && (delimS(n) >= exclS(r)) && (delimS(n) <= exclE(r)))
					delimS(n)=	0;											% Newlines inside quote. 
					n=			n+1;
				end
				mywaitbar(n/nEnd);								% Update waitbar.
			end
			mywaitbar(1);
			delimE=	delimE(delimS > 0);
			delimS=	delimS(delimS > 0);
		end
	end
	delimS=		delimS-1;										% Last char before delimiter.
	delimE=		[1 delimE(1:end-1)+1];							% First char after delimiter.
	
	% Do the stuff: convert text to cell (and maybe numeric) array.
	mywaitbar(0, th, sprintf('(readtext) Processing ''%s''...', opts.fname));
	r=				1;
	c=				1;											% Presize data to optimise speed.
	data=			cell(length(divRow), ceil(length(delimS)/(length(divRow)-1)));
	nums=			zeros(size(data));							% Presize nums to optimise speed.
	nEnd=			length(delimS);								% Prepare for a waitbar.
	istextual=		strcmp(opts.format, 'textual');
	for n=1:nEnd
		temp= 			strtrim(text(delimE(n):delimS(n)));		% Textual item.
		data{r, c}= 	temp;									% Textual item.
		if(~istextual)
			lenT=			length(temp);
			if(lenT > 0)
				% Try to get 123, 123i, 123i + 45, or 123i - 45
				[a, count, errmsg, nextindex]=	sscanf(temp, '%f %1[ij] %1[+-] %f', 4);
				if(isempty(errmsg) && (nextindex > lenT))
					if    (count == 1),			nums(r, c)=		a;
					elseif(count == 2),			nums(r, c)=		a(1)*i;
					elseif(count == 4),			nums(r, c)=		a(1)*i + (44 - a(3))*a(4);
					else						nums(r, c)=		NaN;
					end
				elseif(regexpi(temp, '[^0-9eij \.\+\-]', 'once'))	% Remove non-numbers.
					nums(r, c)=		NaN;
				else
					nums(r, c)= 	s2n(temp, lenT);			% Some other kind of complex number.
				end
			else			nums(r, c)=	NaN;
			end
		end
		if(text(delimS(n)+1) == eol)							% Next row.
			result.min=		min(result.min, c);					% Find shortest row.
			result.max=		max(result.max, c);					% Find longest row.
			r=				r+1;
			c=				0;
			if(bitand(r, 15) == 0), mywaitbar(n/nEnd);	end		% Update waitbar.
		end
		c=				c+1;
	end
	
	% Clean up the conversion and do the result statistics.
	mywaitbar(0, th, '(readtext) Cleaning up...');				% Inform user.
	data=				data(1:(r-1), 1:result.max);			% In case we started off to big.
	if(~strcmp(opts.format, 'textual')), nums= nums(1:(r-1), 1:result.max);	end		% In case we started off to big.
	if(all(cellfun('isempty', data)))
		data= 			{};
		r=				1;
		result.min=		0;
		result.max=		0;
	else
		while((size(data, 2) > 1) && all(cellfun('isempty', data(end, :))))	% Remove empty last lines. 
			data=	data(1:end-1, :); 
			nums=	nums(1:end-1, :); 
			r=		r-1;
		end 
		while((size(data, 1) > 1) && all(cellfun('isempty', data(:, end))))	% Remove empty last columns. 
			data=	data(:, 1:end-1); 
			nums=	nums(:, 1:end-1); 
			c=		c-1;
		end 
	end
	result.rows=		r-1;
	empties=			cellfun('isempty', data);				% Find empty items.
	result.emptyMask=	empties;
	if(strcmp(opts.format, 'textual'))
		result.numberMask=	repmat(false, size(data));			% No numbers, all strings.
		result.stringMask=	~empties;							% No numbers, all strings.
		data(empties)=		{opts.op_empty};					% Set correct empty value.
	else
		if(isempty(data)),	result.numberMask=	[];
		else				result.numberMask=	~(isnan(nums) & ~strcmp(data, 'NaN'));	% What converted well.
		end
		if(strcmp(opts.format, 'numeric'))
			nums(empties)=		opts.op_empty;					% Set correct empty value.
			data=				nums;							% Return the numeric array.
			result.stringMask=	~(empties | result.numberMask);	% Didn't convert well: so strs.
		else
			data(result.numberMask)= num2cell(nums(result.numberMask));	% Copy back numerics.
			data(empties)=		{opts.op_empty};						% Set correct empty value.
			result.stringMask=	cellfun('isclass', data, 'char');	% Well, the strings.
		end
	end
	result.empty=		sum(result.emptyMask(:));				% Count empties.
	result.numberMask=	result.numberMask & ~result.emptyMask;	% Empties don't count.
	result.number=		sum(result.numberMask(:));				% Count numbers.
	result.stringMask=	result.stringMask & ~result.emptyMask;	% Empties don't count.
	result.string=		sum(result.stringMask(:));				% Count strings.
	
 	close(th);													% Removing the waitbar. 
end


function x= s2n(s, lenS)
	x=		NaN;

	% Try to get 123 + 23i or 123 - 23i
	[a,count,errmsg,nextindex] = sscanf(s,'%f %1[+-] %f %1[ij]',4);
	if(isempty(errmsg) && (nextindex > lenS))
		if(count == 4),				x=		a(1) + (44 - a(2))*a(3)*i;
		end
		return
	end

	% Try to get i, i + 45, or i - 45
	[a,count,errmsg,nextindex] = sscanf(s,'%1[ij] %1[+-] %f',3);
	if(isempty(errmsg) && (nextindex > lenS))
		if(count == 1),				x=		i;
		elseif(count == 3),			x=		i + (44 - a(2))*a(3);
		end
		return
	end

	% Try to get 123 + i or 123 - i
	[a,count,errmsg,nextindex] = sscanf(s,'%f %1[+-] %1[ij]',3);
	if(isempty(errmsg) && (nextindex > lenS))
		if(count == 1),				x=		a(1);
		elseif(count == 3),			x=		a(1) + (44 - a(2))*i;
		end
		return
	end

	% Try to get -i, -i + 45, or -i - 45
	[a,count,errmsg,nextindex] = sscanf(s,'%1[+-] %1[ij] %1[+-] %f',4);
	if(isempty(errmsg) && (nextindex > lenS))
		if(count == 2),				x=		(44 - a(1))*i;
		elseif(count == 4),			x=		(44 - a(1))*i + (44 - a(3))*a(4);
		end
		return
	end

	% Try to get 123 + 23*i or 123 - 23*i
	[a,count,errmsg,nextindex] = sscanf(s,'%f %1[+-] %f %1[*] %1[ij]',5);
	if(isempty(errmsg) && (nextindex > lenS))
		if(count == 5),				x=		a(1) + (44 - a(2))*a(3)*i;
		end
		return
	end

	% Try to get 123*i, 123*i + 45, or 123*i - 45
	[a,count,errmsg,nextindex] = sscanf(s,'%f %1[*] %1[ij] %1[+-] %f',5);
	if(isempty(errmsg) && (nextindex > lenS))
		if(count == 1),				x=		a;
		elseif(count == 3),			x=		a(1)*i;
		elseif(count == 5),			x=		a(1)*i + (44 - a(4))*a(5);
		end
		return
	end

	% Try to get i*123 + 45 or i*123 - 45
	[a,count,errmsg,nextindex] = sscanf(s,'%1[ij] %1[*] %f %1[+-] %f',5);
	if(isempty(errmsg) && (nextindex > lenS))
		if(count == 1),				x=		i;
		elseif(count == 3),			x=		i*a(3);
		elseif(count == 5),			x=		i*a(3) + (44 - a(4))*a(5);
		end
		return
	end

	% Try to get -i*123 + 45 or -i*123 - 45
	[a,count,errmsg,nextindex] = sscanf(s,'%1[+-] %1[ij] %1[*] %f %1[+-] %f',6);
	if(isempty(errmsg) && (nextindex > lenS))
		if(count == 2),				x=		(44 - a(1))*i;
		elseif(count == 4),			x=		(44 - a(1))*i*a(4);
		elseif(count == 6),			x=		(44 - a(1))*i*a(4) + (44 - a(5))*a(6);
		end
		return
	end

	% Try to get 123 + i*45 or 123 - i*45
	[a,count,errmsg,nextindex] = sscanf(s,'%f %1[+-] %1[ij] %1[*] %f',5);
	if(isempty(errmsg) && (nextindex > lenS))
		if(count == 5),				x=		a(1) + (44 - a(2))*i*a(5);
		end
		return
	end

	% None of the above cases.
	x=		NaN;

end


function emptywaitbar(varargin)
end