Friday, August 31, 2007

C# Tokenizer Class

Howdy,

This class is something I distilled from a C library. While translating a piece of code that I needed for doing face detection, I ran into this handy class for reading configuration files. Normally you use the ConfigurationManager of .NET to read your app.config files etc., but this is a handy class for parsing files or making very simple flat files that contain some configuration info.

If you don't want to use XML, then this is a very handy utility.

This tokenizer skips comments (introduced by '#'), whitespace, and blank lines.
It also has two properties for knowing when a file ends or when a line ends.

I'm sure that there are other possibilities to do such things, but I find this one easy to use.

Enjoy ;)


   1: using System;
   2: using System.Text;
   3: using System.Diagnostics;
   4: using System.IO;
   5:  
   6: /******************************************************************************
   7: *                               TOKENIZER                                     *
   8: ******************************************************************************/
   9:  
  10: /// <summary>
  11: /// This class represents a tokenizer to read words
  12: /// from a datasource. (Stream or String)
  13: /// </summary>
  14: public class Tokenizer
  15: {
  16:     #region Fields & Delegates
  17:  
  18:     public delegate int TokenizerDataSourceProc(char[] buff, int nChars, object arg);
  19:  
  20:     private static int _stringPos, _streamPos;
  21:  
  22:     private int[,] stateTransitionTable, stateActionTable;
  23:  
  24:     private char[] in_buff = new char[512];
  25:     private char[] word_buff = new char[100];
  26:     private int state;
  27:     private int endOfLine;
  28:     private int endOfFile;
  29:     private int availChars;
  30:     private int iPos, wPos;
  31:     private TokenizerDataSourceProc dataSource;
  32:     private object dataSrcArg;
  33:  
  34:     #endregion
  35:  
  36:     #region Constructor(s)
  37:  
  38:     /// <summary>
  39:     /// Initializes an instance of the tokenizer.
  40:     /// </summary>
  41:     /// <param name="source">
  42:     ///     The 'source' argument is a pointer to a function which returns data.
  43:     ///     Currently two supported functions are embedded in this class.
  44:     ///     - StreamReader
  45:     ///     - StringReader
  46:     /// </param>
  47:     /// <param name="arg">Datasource for this tokenizer ( Stream or String )</param>
  48:     public Tokenizer(TokenizerDataSourceProc source, object arg)
  49:     {
  50:         _streamPos = _stringPos = 0;
  51:         this.endOfLine = 0;
  52:         this.endOfFile = 0;
  53:         this.state = 0;
  54:         this.availChars = 0;
  55:         this.dataSource = source;
  56:         this.dataSrcArg = arg;
  57:  
  58:         this.stateTransitionTable = new int[7, 5] {
  59:             {6, 0, 0, 1, 2},    /* State 0 */
  60:             {6, 1, 0, 1, 1},        /* State 1 */
  61:             {6, 3, 4, 5, 2},        /* State 2 */
  62:             {6, 3, 4, 5, 2},        /* State 3 */
  63:             {6, 4, 4, 5, 2},        /* State 4 */
  64:             {6, 5, 4, 5, 5},        /* State 5 */
  65:             {6, 6, 6, 6, 6}        /* State 6 */
  66:           };
  67:  
  68:         /* Each state transition may be associated with one or more actions,
  69:          * defined by these bits:
  70:          *
  71:          *   0x1 = CPY: Copy the current char to the word buffer
  72:          *   0x2 = RET: Exit from loop and return contents of word buffer
  73:          *   0x4 = EOF: Assert end-of-file flag
  74:          *   0x8 = EOL: Assert end-of-line flag
  75:          *
  76:          * These actions may be combined:
  77:          *
  78:          *   0x3 = CPY + RET
  79:          *   0x6 = RET + EOF
  80:          *   0xB = CPY + RET + EOL
  81:          *   0xE = RET + EOF + EOL
  82:          */
  83:         this.stateActionTable = new int[7, 5]{
  84:             {0x6, 0x0, 0x0, 0x0, 0x1},    /* State 0 */
  85:             {0x6, 0x0, 0x0, 0x0, 0x0},    /* State 1 */
  86:             {0xE, 0x0, 0x0, 0x0, 0x1},    /* State 2 */
  87:             {0xE, 0x0, 0x0, 0x0, 0x3},    /* State 3 */
  88:             {0xE, 0x0, 0x0, 0x0, 0xB},    /* State 4 */
  89:             {0xE, 0x0, 0x0, 0x0, 0x0},    /* State 5 */
  90:             {0x6, 0x6, 0x6, 0x6, 0x6}    /* State 6 */
  91:           };
  92:     }
  93:  
  94:     #endregion
  95:  
  96:     #region Static Delegate Handler Functions
  97:  
  98:     /// <summary>
  99:     /// Data source procedure for reading data from a file.
 100:     /// This function is provided as an argument to the constructor.
 101:     /// </summary>
 102:     /// <param name="buff"></param>
 103:     /// <param name="nChars"></param>
 104:     /// <param name="arg"></param>
 105:     /// <returns></returns>
 106:     public static int StreamReader(char[] buff, int nChars, object arg)
 107:     {
 108:         StreamReader sr = arg as StreamReader;
 109:         int n = sr.Read(buff, _streamPos, nChars);
 110:         _streamPos += n;
 111:         return n;
 112:     }
 113:  
 114:     /// <summary>
 115:     /// Data source function for reading data from a null terminated string.
 116:     /// This function is provided as an argument to the constructor.
 117:     /// </summary>
 118:     /// <param name="buff"></param>
 119:     /// <param name="nChars"></param>
 120:     /// <param name="arg"></param>
 121:     /// <returns></returns>
 122:     public static int StringReader(char[] buff, int nChars, object arg)
 123:     {
 124:         string c = arg as string;
 125:         int n;
 126:  
 127:         for (n = _stringPos; n < nChars && n < c.Length; n++)
 128:             buff[n] = c[n];
 129:  
 130:         if (_stringPos != n)
 131:             _stringPos += n;
 132:         else
 133:             n = 0;
 134:  
 135:         return n;
 136:     }
 137:  
 138:     #endregion
 139:  
 140:     #region Public Methods
 141:  
 142:     /// <summary>
 143:     /// Reads a single word from the stream. The string returned by this
 144:     /// function is guaranteed to be valid only until the next invocation
 145:     /// of 'GetWord'.
 146:     /// </summary>
 147:     /// <returns>The next word in the datasource</returns>
 148:     /// <remarks>
 149:     /// The tokenizer has the following properties:
 150:     ///     - Blank lines are ignored
 151:     ///     - Whitespace before or after words are ignored
 152:     ///     - Comments can be introduced by a '#' sign
 153:     ///     - 'EndOfLine' can be used to test whether the word
 154:     ///       was the last word of a line
 155:     ///     - 'EndOfFile' is guaranteed to return true if there
 156:     ///       are no other words in the file and the next call to
 157:     ///       'GetWord' would fail
 158:     /// </remarks>
 159:     public string GetWord()
 160:     {
 161:         char c;
 162:         int charClass, emitWord, saveChar;
 163:  
 164:         /* The tokenizer is implemented as a seven-state state machine. State
 165:          * zero is the inital state, state six is the exit state. The states
 166:          * are roughly described as follows:
 167:          *
 168:          *    0 - Inital state: Eat whitespace
 169:          *    1 - Eat comments #1
 170:          *    2 - Save word characters
 171:          *    3 - Eat whitespace
 172:          *    4 - Eat whitespace and newlines
 173:          *    5 - Eat comments #2
 174:          *    6 - End-of-file reached
 175:          *
 176:          * States 1 and 5 are subtely different and represent knowledge about
 177:          * whether there is a word in the word buffer which is waiting to be
 178:          * returned. In general, the state machine can only return a word after
 179:          * it has examined all the characters preceeding the starting character
 180:          * of the next word. This is because the tokenizer must properly
 181:          * indicate that a word is the last valid word in the stream even if
 182:          * that word happens to be followed by several empty lines and/or
 183:          * comments. The need to defer this decision accounts for much of the
 184:          * complexity of the state machine.
 185:          *
 186:          * The rows of the state transition table represent states while the
 187:          * columns corresponds to one of five character classes:
 188:          *
 189:          *    0 - Eof reached        (not a character)
 190:          *    1 - whitespace         (' '  or '\t')
 191:          *    2 - newlines           ('\n' or '\r')
 192:          *    3 - Comment initiator  ('#')
 193:          *    4 - Word char          (everything else)
 194:          */
 195:  
 196:  
 197:         /* Prepare to read new word into word_buff */
 198:         this.word_buff = new char[this.word_buff.Length];
 199:         this.wPos = 0;// tok->word_buff;
 200:  
 201:         /* Begin processing characters according to
 202:          * the transition rules */
 203:  
 204:         while (true)
 205:         {
 206:             /* Read in a new chunk of data, if needed */
 207:  
 208:             if (this.availChars == 0)
 209:             {
 210:                 this.availChars = this.dataSource(this.in_buff, this.in_buff.Length, this.dataSrcArg);
 211:                 this.iPos = 0;
 212:             }
 213:  
 214:             /* Look at the current character and find its character class */
 215:  
 216:             c = this.in_buff[iPos];
 217:  
 218:             if (this.availChars == 0)
 219:                 charClass = 0;
 220:             else if (c == ' ' || c == '\t')
 221:                 charClass = 1;
 222:             else if (c == '\n' || c == '\r')
 223:                 charClass = 2;
 224:             else if (c == '#')
 225:                 charClass = 3;
 226:             else
 227:                 charClass = 4;
 228:  
 229:             /* Transition to next state based on the character on input */
 230:  
 231:             saveChar = (stateActionTable[this.state, charClass] & 0x01);
 232:             emitWord = (stateActionTable[this.state, charClass] & 0x02);
 233:             this.endOfFile = (stateActionTable[this.state, charClass] & 0x04);
 234:             this.endOfLine = (stateActionTable[this.state, charClass] & 0x08);
 235:             this.state = stateTransitionTable[this.state, charClass];
 236:  
 237:             /* If we transitioned to an accepting state, break out
 238:              * of the loop so that we can return the token to the caller
 239:              */
 240:  
 241:             if (emitWord != 0)
 242:                 break;
 243:  
 244:             /* If the transition requires it, copy the character into the
 245:              * word buffer */
 246:  
 247:             if (saveChar != 0)
 248:             {
 249:                 this.word_buff[this.wPos++] = c;
 250:             }
 251:             /* Advance to the next character in the stream */
 252:             this.iPos++;
 253:             this.availChars--;
 254:         }
 255:  
 256:         /* Add a NULL terminator, then return the contents of the word
 257:          * buffer to the caller */
 258:  
 259:         this.word_buff[wPos] = '\0';
 260:         return new string(this.word_buff);
 261:     }
 262:  
 263:     #endregion
 264:  
 265:     #region Properties
 266:  
 267:     /// <summary>
 268:     /// Queries whether the last call to 'GetWord' returned the last
 269:     /// valid word of the file or stream
 270:     /// </summary>
 271:     public int EndOfFile
 272:     {
 273:         get
 274:         {
 275:             return this.endOfFile;
 276:         }
 277:     }
 278:  
 279:     /// <summary>
 280:     /// Queries whether the last call to 'GetWord' returned the last
 281:     /// word on a line (the last word of the file is always said to be on the
 282:     /// end of the line, whether or not that line is followed by a newline)
 283:     /// </summary>
 284:     public int EndOfLine
 285:     {
 286:         get { return this.endOfLine; }
 287:     }
 288:  
 289:     #endregion
 290:  
 291:     #region Test-Function
 292:  
 293:     /// <summary>
 294:     /// This function tests some of the key functionality of the tokenizer.
 295:     /// It aborts the program is something goes awry, but has no side
 296:     /// effects if everything checks out
 297:     /// </summary>
 298:     public static void TestSuite()
 299:     {
 300:         string msg = "Tokenizer test failed";
 301:         string test = "Hello, World!";
 302:  
 303:         Tokenizer tok = new Tokenizer(new TokenizerDataSourceProc(Tokenizer.StringReader), test);
 304:  
 305:         Debug.Assert(tok.GetWord().CompareTo("Hello,") == 0, msg);
 306:         Debug.Assert(tok.EndOfLine == 0, msg);
 307:         Debug.Assert(tok.EndOfFile == 0, msg);
 308:         Debug.Assert(tok.GetWord().CompareTo("World!") == 0, msg);
 309:         Debug.Assert(tok.EndOfLine != 0, msg);
 310:         Debug.Assert(tok.EndOfFile != 0, msg);
 311:  
 312:         test = "\n\nYogi # Bear ";
 313:         tok = new Tokenizer(new TokenizerDataSourceProc(Tokenizer.StringReader), test);
 314:         Debug.Assert(tok.GetWord().CompareTo("Yogi") == 0, msg);
 315:         Debug.Assert(tok.EndOfLine != 0, msg);
 316:         Debug.Assert(tok.EndOfFile != 0, msg);
 317:  
 318:         test = "";
 319:         tok = new Tokenizer(new TokenizerDataSourceProc(Tokenizer.StringReader), test);
 320:         Debug.Assert(tok.GetWord().CompareTo("") == 0, msg);
 321:         Debug.Assert(tok.EndOfLine == 0, msg);
 322:         Debug.Assert(tok.EndOfFile != 0, msg);
 323:  
 324:         test = "\n  \na\tb##b\n#d";
 325:         tok = new Tokenizer(new TokenizerDataSourceProc(Tokenizer.StringReader), test);
 326:         Debug.Assert(tok.GetWord().CompareTo("a") == 0, msg);
 327:         Debug.Assert(tok.EndOfLine == 0, msg);
 328:         Debug.Assert(tok.EndOfFile == 0, msg);
 329:  
 330:         Debug.Assert(tok.GetWord().CompareTo("b") == 0, msg);
 331:         Debug.Assert(tok.EndOfLine != 0, msg);
 332:         Debug.Assert(tok.EndOfFile != 0, msg);
 333:     }
 334:  
 335:     #endregion
 336: }