MATLAB: Parsing a text file in matlab and accessing contents of each sections

access content, parsing, regexp, text file

Hi, I want to split a fairly large text file into separate sections in MATLAB.

 - Ignore first set of lines
 - Then the data set is repeated
 - Access its content for a particular set of conditions

For example, for a drag factor of 1.0 and fuel factor of 1.2, I want to find the corresponding alt for a particular weight.

Find attached the text file.

Thanks Yashvin

Best Answer

Here is a function, which reads question2.txt and returns a struct vector. It might serve as a starting point.

>> out = cssm()
out = 
1x2 struct array with fields:
    DRAG_FACTOR
    FUEL_FACTOR
    Table
>> out(abs([out.DRAG_FACTOR]-1)<1e-6 & abs([out.FUEL_FACTOR]-1)<1e-6).Table(1:5,1:3)
ans =
   1.0e+04 *
    4.0000    0.0000    0.0211
    4.0500    0.0000    0.0212
    4.1000    0.0000    0.0213
    4.1500    0.0000    0.0214
    4.2000    0.0000    0.0215

where

function    out = cssm() 
      %CSSM  Read question2.txt and return one struct per data section.
      %
      %   out = cssm() splits the file text at every occurrence of
      %   'CLEAN CONFIGURATION', ignores the text before the first occurrence,
      %   and converts each remaining piece with handle_one_section_.
      %
      %   Returns a 1-by-N struct array with the fields DRAG_FACTOR,
      %   FUEL_FACTOR and Table, where N is the number of pieces that follow
      %   a separator. N is zero (1-by-0 struct array) when the separator
      %   does not occur in the file.
      str = fileread( 'question2.txt' );
      section_separator = 'CLEAN CONFIGURATION';
      cac = strsplit( str, section_separator );
      num_sections = length( cac ) - 1;      % cac{1} is the ignored preamble
      %   Preallocate. The cell-array value makes struct() return one element
      %   per cell; a plain numeric value would only create a 1x1 struct with
      %   that value stored in the field.
      out = struct( 'DRAG_FACTOR',cell(1,num_sections), 'FUEL_FACTOR',[], 'Table',[] );
      for jj = 1 : num_sections
          out(jj) = handle_one_section_( cac{jj+1} );
      end
  end
  function    sas = handle_one_section_( str )
      %   Convert the text of one section into a scalar struct with the
      %   fields DRAG_FACTOR, FUEL_FACTOR and Table (in that order).
      drag_factor = excerpt_num_( str, 'DRAG FACTOR' );
      fuel_factor = excerpt_num_( str, 'FUEL FACTOR' );
      tbl         = excerpt_table_( str );
      %   The table is wrapped in a cell so that struct() always builds a
      %   1x1 struct and stores the value as-is.
      sas = struct( 'DRAG_FACTOR',{drag_factor}, 'FUEL_FACTOR',{fuel_factor}, 'Table',{tbl} );
  end
  function  val = excerpt_num_( str, name )
      %   Return the number that follows the label NAME in STR.
      %   The number must be separated from the label by one or more blanks
      %   and may consist of digits and dots only. NAME is inserted into the
      %   regular expression as-is. The result is NaN when nothing matches.
      pattern = [ '(?<=', name, ')[ ]+[\d\.]+' ];
      val = str2double( regexp( str, pattern, 'match', 'once' ) );
  end
  function  val = excerpt_table_( str )
      %   Return the first numeric table found in STR as a numeric array.
      %
      %   Heuristic: the table is taken to be the first run of at least 100
      %   characters that are all digits, dots or white-space. LIMITATION: the
      %   line before the table must end with, and the line after the table
      %   must begin with, a character outside that set; otherwise
      %   neighbouring text is swallowed into the match.
      %
      %   NOTE(review): str2num evaluates its argument with eval. The pattern
      %   restricts the text to digits, dots and white-space, but keep that
      %   in mind before relaxing the pattern on untrusted files.
      table_pattern = '[\d\.\s]{100,}';
      val = str2num( regexp( str, table_pattern, 'match', 'once' ) ); %#ok<ST2NM>
  end

&nbsp;

Modified function based on comment

>> cssm
ans = 
1x2 struct array with fields:
    DRAG_FACTOR
    FUEL_FACTOR
    Table
    COST_INDEX
    ALTITUDE
    ISA

where

function    out = cssm() 
      %CSSM  Read question2.txt and return one struct per data section.
      %
      %   out = cssm() splits the file text at every occurrence of
      %   'CLEAN CONFIGURATION', ignores the text before the first occurrence,
      %   and converts each remaining piece with handle_one_section_.
      %
      %   Returns a 1-by-N struct array with the fields DRAG_FACTOR,
      %   FUEL_FACTOR, Table, COST_INDEX, ALTITUDE and ISA, where N is the
      %   number of pieces that follow a separator. N is zero (1-by-0 struct
      %   array) when the separator does not occur in the file.
      str = fileread( 'question2.txt' );
      section_separator = 'CLEAN CONFIGURATION';
      cac = strsplit( str, section_separator );
      num_sections = length( cac ) - 1;      % cac{1} is the ignored preamble
      %   Preallocate. The cell-array value makes struct() return one element
      %   per cell; a plain numeric value would only create a 1x1 struct with
      %   that value stored in the field.
      out = struct( 'DRAG_FACTOR',cell(1,num_sections), 'FUEL_FACTOR',[], 'Table',[]  ...
                  , 'COST_INDEX' ,[]                  , 'ALTITUDE'   ,[], 'ISA'  ,[]  );
      for jj = 1 : num_sections
          out(jj) = handle_one_section_( cac{jj+1} );
      end
  end
  function    sas = handle_one_section_( str )
      %   Convert the text of one section into a scalar struct with the
      %   fields DRAG_FACTOR, FUEL_FACTOR, Table, COST_INDEX, ALTITUDE and ISA
      %   (in that order).
      drag_factor = excerpt_num_( str, 'DRAG FACTOR' );
      fuel_factor = excerpt_num_( str, 'FUEL FACTOR' );
      cost_index  = excerpt_colon_separated_num_( str, 'COST INDEX' );
      altitude    = excerpt_colon_separated_num_( str, 'ALTITUDE' );
      isa_value   = excerpt_colon_separated_num_( str, 'ISA' );
      tbl         = excerpt_table_( str );
      %   Every value is wrapped in a cell so that struct() always builds a
      %   1x1 struct and stores the value as-is.
      sas = struct( 'DRAG_FACTOR',{drag_factor}, 'FUEL_FACTOR',{fuel_factor}  ...
                  , 'Table'      ,{tbl}                                       ...
                  , 'COST_INDEX' ,{cost_index} , 'ALTITUDE'   ,{altitude}     ...
                  , 'ISA'        ,{isa_value}                                 );
  end
  function  val = excerpt_num_( str, name )
      %   Look up the label NAME in STR and convert the digits-and-dots text
      %   that follows it (after at least one blank) to a double. NAME goes
      %   into the regular expression unescaped. Yields NaN when the label,
      %   or a number after it, is not found.
      number_after_label = horzcat( '(?<=', name, ')', '[ ]+[\d\.]+' );
      matched_text = regexp( str, number_after_label, 'match', 'once' );
      val = str2double( matched_text );
  end
  function  val = excerpt_table_( str )
      %   Extract the first numeric table in STR and return it as a matrix.
      %
      %   Quick and dirty: the table is assumed to be the first sequence of
      %   at least 100 characters consisting only of digits, dots and
      %   white-space. PROBLEM: this requires that the preceding line ends
      %   with a "non-numeric" character and that the following line begins
      %   with a "non-numeric" character.
      %
      %   NOTE(review): str2num runs eval on its input. That is acceptable
      %   here only because the pattern admits nothing but digits, dots and
      %   white-space.
      numeric_run = '[\d\.\s]{100,}';
      table_text  = regexp( str, numeric_run, 'match', 'once' );
      val = str2num( table_text ); %#ok<ST2NM>
  end
  function  val = excerpt_colon_separated_num_( str, name )
      %   Return the number that follows the label NAME in STR when label and
      %   number are separated by one or more blanks, colons or hyphens,
      %   e.g. 'COST INDEX : 30'. NAME is inserted into the regular
      %   expression as-is. Returns NaN when no such label/number pair is
      %   found.
      %
      %   The quantifier is inside the capture group so that the token holds
      %   the whole number; with '([\d\.])+' only the last character of the
      %   number would be captured.
      tok = regexp( str, [ '(?<=', name, ')', '(?:[ \:\-]+)([\d\.]+)' ], 'tokens', 'once' );
      if isempty( tok )
          %   No match: tok{:} would expand to nothing and str2double would
          %   be called without an argument.
          val = nan;
      else
          val = str2double( tok{1} );
      end
  end

Related Solutions

MATLAB: How can I extract special contents of a text

regexp(yourstring, '(?<=\()[^)]+', 'match')

should do it. It matches any sequence of anything but closing brackets preceded by an opening bracket. Note that the opening bracket has to be escaped as it's a special character in regexes.

MATLAB: Extracting information from file

This is one way to read your file

>> tic, sas = nohup, toc
sas = 
1x1173 struct array with fields:
    SampleName
    NumOfCells
    Porosity
Elapsed time is 27.000338 seconds.
>> ix = find( strcmp( {sas.SampleName}, 'cutTDM050_111_121_221_222_122' ) )
ix =
   583
>> sas(ix).Porosity
ans =
    0.0828    0.0828    0.0828
>> sas(ix).NumOfCells
ans =
      125000      125000      125000

where (in one m-file)

function    sas = nohup
%NOHUP  Read nohup.txt and return one struct per sample section.
%
%   A section is the text between the heading 'Running Sample' and the next
%   line of '=' characters. Each section is converted by nohup_.
%   Returns a 1-by-N struct array with the fields SampleName, NumOfCells and
%   Porosity, where N is the number of sections found.
    file_text = fileread( 'nohup.txt' );

    heading_string  = 'Running Sample';
    trailing_string = '==============================================';

    %   Lazy match of everything after a heading, up to the next trailer.
    section_pattern = [ '(?<=', heading_string, ').+?(?=', trailing_string, ')' ];
    sections        = regexp( file_text, section_pattern, 'match' );
    num_sections    = length( sections );

    %   Preallocate the result so the loop does not grow the array.
    template = struct( 'SampleName','', 'NumOfCells',{[]}, 'Porosity',{[]} );
    sas      = repmat( template, [1,num_sections] );
    for k = 1 : num_sections
        sas(k) = nohup_( sections{k} );
    end
end
function    sas = nohup_( str )
    %   Convert the text of one sample section into a scalar struct.
    %
    %   SampleName  first text of the form cutTDMnnn_nnn_nnn_nnn_nnn_nnn
    %               ('' when absent)
    %   NumOfCells  row vector with the number after every 'Num of cells ='
    %   Porosity    row vector with the number after every 'Porosity ='
    %
    %   The labels are matched as part of the expression and the number is
    %   taken from a token instead of using a variable-length look-behind.
    %   The values returned are the same; the look-behind form is presumably
    %   what made these two searches slow - measure to confirm.
    sas.SampleName ...
    =   regexp( str, 'cutTDM\d{3}_\d{3}_\d{3}_\d{3}_\d{3}_\d{3}', 'match', 'once' );
    %
    sas.NumOfCells = labelled_numbers_( str, 'Num of cells +\= *(\d+)' );
    %   NOTE(review): the character class also accepts a literal '+';
    %   kept as in the original expression.
    sas.Porosity   = labelled_numbers_( str, 'Porosity +\= *([\d+\.]+)' );
end
function    num = labelled_numbers_( str, xpr )
    %   Return the first token of every match of XPR in STR, converted to
    %   double. The result is empty when XPR does not match.
    tok = regexp( str, xpr, 'tokens' );
    %   Unwrap each 1x1 token cell while keeping the shape of the outer cell.
    num = str2double( cellfun( @(t) t{1}, tok, 'UniformOutput', false ) );
end

&nbsp;

Comments:

The function is slow. Nearly all the time is spent in regexp searching for "Num of cells" and "Porosity". The fact that "the Num of cells and porosity value are the same" may be used to improve speed. Adding 'once' to these two calls of regexp increases the speed forty times. That's much more than I anticipated; I don't understand it; I cannot see what's taking all the extra time.

>> tic, sas = nohup, toc
sas = 
1x1173 struct array with fields:
    SampleName
    NumOfCells
    Porosity
Elapsed time is 0.645206 seconds.
>> ix = find( strcmp( {sas.SampleName}, 'cutTDM050_111_121_221_222_122' ) )
ix =
   583
>> sas(ix).Porosity
ans =
    0.0828
>> sas(ix).NumOfCells
ans =
      125000
>>

Best Answer

Related Solutions

MATLAB: How can I extract special contents of a text

MATLAB: Extracting information from file

Related Question