Whizzo's Development Blog: C# Tokenizer Class

Howdy,

This class is something I distilled from a C library. While translating a piece of code that I needed for face detection, I ran into this handy class for reading configuration files. Normally you would use the .NET ConfigurationManager to read your app.config files, but this class is handy for parsing files or for creating very simple flat files that contain some configuration info.

If you don't want to use XML, then this is a very handy utility.

This tokenizer skips comments (introduced by a '#' character), whitespace and blank lines.
It also has two properties for knowing when a file ends or when a line ends.

I'm sure that there are other possibilities to do such things, but I find this one easy to use.

Enjoy ;)

   1: using System;

   2: using System.Text;

   3: using System.Diagnostics;

   4: using System.IO;

5:

   6: /******************************************************************************

   7: *                               TOKENIZER                                     *

   8: ******************************************************************************/

9:

  10: /// <summary>

  11: /// This class represents a tokenizer to read words

  12: /// from a datasource. (Stream or String)

  13: /// </summary>

  14: public class Tokenizer

  15: {

  16:     #region Fields & Delegates

17:

  18:     public delegate int TokenizerDataSourceProc(char[] buff, int nChars, object arg);

19:

  20:     private static int _stringPos, _streamPos;

21:

  22:     private int[,] stateTransitionTable, stateActionTable;

23:

  24:     private char[] in_buff = new char[512];

  25:     private char[] word_buff = new char[100];

  26:     private int state;

  27:     private int endOfLine;

  28:     private int endOfFile;

  29:     private int availChars;

  30:     private int iPos, wPos;

  31:     private TokenizerDataSourceProc dataSource;

  32:     private object dataSrcArg;

33:

  34:     #endregion

35:

  36:     #region Constructor(s)

37:

  38:     /// <summary>

  39:     /// Initializes an instance of the tokenizer.

  40:     /// </summary>

  41:     /// <param name="source">

  42:     ///     The 'source' argument is a pointer to a function which returns data.

  43:     ///     Currently two supported functions are embedded in this class.

  44:     ///     - StreamReader

  45:     ///     - StringReader

  46:     /// </param>

  47:     /// <param name="arg">Datasource for this tokenizer ( Stream or String )</param>

  48:     public Tokenizer(TokenizerDataSourceProc source, object arg)

  49:     {

  50:         _streamPos = _stringPos = 0;

  51:         this.endOfLine = 0;

  52:         this.endOfFile = 0;

  53:         this.state = 0;

  54:         this.availChars = 0;

  55:         this.dataSource = source;

  56:         this.dataSrcArg = arg;

57:

  58:         this.stateTransitionTable = new int[7, 5] {

  59:             {6, 0, 0, 1, 2},    /* State 0 */

  60:             {6, 1, 0, 1, 1},        /* State 1 */

  61:             {6, 3, 4, 5, 2},        /* State 2 */

  62:             {6, 3, 4, 5, 2},        /* State 3 */

  63:             {6, 4, 4, 5, 2},        /* State 4 */

  64:             {6, 5, 4, 5, 5},        /* State 5 */

  65:             {6, 6, 6, 6, 6}        /* State 6 */

  66:           };

67:

  68:         /* Each state transition may be associated with one or more actions,

  69:          * defined by these bits:

  70:          *

  71:          *   0x1 = CPY: Copy the current char to the word buffer

  72:          *   0x2 = RET: Exit from loop and return contents of word buffer

  73:          *   0x4 = EOF: Assert end-of-file flag

  74:          *   0x8 = EOL: Assert end-of-line flag

  75:          *

  76:          * These actions may be combined:

  77:          *

  78:          *   0x3 = CPY + RET

  79:          *   0x6 = RET + EOF

  80:          *   0xB = CPY + RET + EOL

  81:          *   0xE = RET + EOF + EOL

  82:          */

  83:         this.stateActionTable = new int[7, 5]{

  84:             {0x6, 0x0, 0x0, 0x0, 0x1},    /* State 0 */

  85:             {0x6, 0x0, 0x0, 0x0, 0x0},    /* State 1 */

  86:             {0xE, 0x0, 0x0, 0x0, 0x1},    /* State 2 */

  87:             {0xE, 0x0, 0x0, 0x0, 0x3},    /* State 3 */

  88:             {0xE, 0x0, 0x0, 0x0, 0xB},    /* State 4 */

  89:             {0xE, 0x0, 0x0, 0x0, 0x0},    /* State 5 */

  90:             {0x6, 0x6, 0x6, 0x6, 0x6}    /* State 6 */

  91:           };

  92:     }

93:

  94:     #endregion

95:

  96:     #region Static Delegate Handler Functions

97:

  98:     /// <summary>

  99:     /// Data source procedure for reading data from a file.

 100:     /// This function is provided as an argument to the constructor.

 101:     /// </summary>

 102:     /// <param name="buff"></param>

 103:     /// <param name="nChars"></param>

 104:     /// <param name="arg"></param>

 105:     /// <returns></returns>

 106:     public static int StreamReader(char[] buff, int nChars, object arg)

 107:     {

 108:         StreamReader sr = arg as StreamReader;

 109:         int n = sr.Read(buff, _streamPos, nChars);

 110:         _streamPos += n;

 111:         return n;

 112:     }

 113:

 114:     /// <summary>

 115:     /// Data source function for reading data from a null terminated string.

 116:     /// This function is provided as an argument to the constructor.

 117:     /// </summary>

 118:     /// <param name="buff"></param>

 119:     /// <param name="nChars"></param>

 120:     /// <param name="arg"></param>

 121:     /// <returns></returns>

 122:     public static int StringReader(char[] buff, int nChars, object arg)

 123:     {

 124:         string c = arg as string;

 125:         int n;

 126:

 127:         for (n = _stringPos; n < nChars && n < c.Length; n++)

 128:             buff[n] = c[n];

 129:

 130:         if (_stringPos != n)

 131:             _stringPos += n;

 132:         else

 133:             n = 0;

 134:

 135:         return n;

 136:     }

 137:

 138:     #endregion

 139:

 140:     #region Public Methods

 141:

 142:     /// <summary>

 143:     /// Reads a single word from the stream. The string returned by this

 144:     /// function is guaranteed to be valid only until the next invocation

 145:     /// of 'GetWord'.

 146:     /// </summary>

 147:     /// <returns>The next word in the datasource</returns>

 148:     /// <remarks>

 149:     /// The tokenizer has the following properties:

 150:     ///     - Blank lines are ignored

 151:     ///     - Whitespace before or after words are ignored

 152:     ///     - Comments can be introduced by a '#' sign

 153:     ///     - 'EndOfLine' can be used to test whether the word

 154:     ///       was the last word of a line

 155:     ///     - 'EndOfFile' is guaranteed to return true if there

 156:     ///       are no other words in the file and the next call to

 157:     ///       'GetWord' would fail

 158:     /// </remarks>

 159:     public string GetWord()

 160:     {

 161:         char c;

 162:         int charClass, emitWord, saveChar;

 163:

 164:         /* The tokenizer is implemented as a seven-state state machine. State

 165:          * zero is the inital state, state six is the exit state. The states

 166:          * are roughly described as follows:

 167:          *

 168:          *    0 - Inital state: Eat whitespace

 169:          *    1 - Eat comments #1

 170:          *    2 - Save word characters

 171:          *    3 - Eat whitespace

 172:          *    4 - Eat whitespace and newlines

 173:          *    5 - Eat comments #2

 174:          *    6 - End-of-file reached

 175:          *

 176:          * States 1 and 5 are subtely different and represent knowledge about

 177:          * whether there is a word in the word buffer which is waiting to be

 178:          * returned. In general, the state machine can only return a word after

 179:          * it has examined all the characters preceeding the starting character

 180:          * of the next word. This is because the tokenizer must properly

 181:          * indicate that a word is the last valid word in the stream even if

 182:          * that word happens to be followed by several empty lines and/or

 183:          * comments. The need to defer this decision accounts for much of the

 184:          * complexity of the state machine.

 185:          *

 186:          * The rows of the state transition table represent states while the

 187:          * columns corresponds to one of five character classes:

 188:          *

 189:          *    0 - Eof reached        (not a character)

 190:          *    1 - whitespace         (' '  or '\t')

 191:          *    2 - newlines           ('\n' or '\r')

 192:          *    3 - Comment initiator  ('#')

 193:          *    4 - Word char          (everything else)

 194:          */

 195:

 196:

 197:         /* Prepare to read new word into word_buff */

 198:         this.word_buff = new char[this.word_buff.Length];

 199:         this.wPos = 0;// tok->word_buff;

 200:

 201:         /* Begin processing characters according to

 202:          * the transition rules */

 203:

 204:         while (true)

 205:         {

 206:             /* Read in a new chunk of data, if needed */

 207:

 208:             if (this.availChars == 0)

 209:             {

 210:                 this.availChars = this.dataSource(this.in_buff, this.in_buff.Length, this.dataSrcArg);

 211:                 this.iPos = 0;

 212:             }

 213:

 214:             /* Look at the current character and find its character class */

 215:

 216:             c = this.in_buff[iPos];

 217:

 218:             if (this.availChars == 0)

 219:                 charClass = 0;

 220:             else if (c == ' ' || c == '\t')

 221:                 charClass = 1;

 222:             else if (c == '\n' || c == '\r')

 223:                 charClass = 2;

 224:             else if (c == '#')

 225:                 charClass = 3;

 226:             else

 227:                 charClass = 4;

 228:

 229:             /* Transition to next state based on the character on input */

 230:

 231:             saveChar = (stateActionTable[this.state, charClass] & 0x01);

 232:             emitWord = (stateActionTable[this.state, charClass] & 0x02);

 233:             this.endOfFile = (stateActionTable[this.state, charClass] & 0x04);

 234:             this.endOfLine = (stateActionTable[this.state, charClass] & 0x08);

 235:             this.state = stateTransitionTable[this.state, charClass];

 236:

 237:             /* If we transitioned to an accepting state, break out

 238:              * of the loop so that we can return the token to the caller

 239:              */

 240:

 241:             if (emitWord != 0)

 242:                 break;

 243:

 244:             /* If the transition requires it, copy the character into the

 245:              * word buffer */

 246:

 247:             if (saveChar != 0)

 248:             {

 249:                 this.word_buff[this.wPos++] = c;

 250:             }

 251:             /* Advance to the next character in the stream */

 252:             this.iPos++;

 253:             this.availChars--;

 254:         }

 255:

 256:         /* Add a NULL terminator, then return the contents of the word

 257:          * buffer to the caller */

 258:

 259:         this.word_buff[wPos] = '\0';

 260:         return new string(this.word_buff);

 261:     }

 262:

 263:     #endregion

 264:

 265:     #region Properties

 266:

 267:     /// <summary>

 268:     /// Queries whether the last call to 'GetWord' returned the last

 269:     /// valid word of the file or stream

 270:     /// </summary>

 271:     public int EndOfFile

 272:     {

 273:         get

 274:         {

 275:             return this.endOfFile;

 276:         }

 277:     }

 278:

 279:     /// <summary>

 280:     /// Queries whether the last call to 'GetWord' returned the last

 281:     /// word on a line (the last word of the file is always said to be on the

 282:     /// end of the line, whether or not that line is followed by a newline)

 283:     /// </summary>

 284:     public int EndOfLine

 285:     {

 286:         get { return this.endOfLine; }

 287:     }

 288:

 289:     #endregion

 290:

 291:     #region Test-Function

 292:

 293:     /// <summary>

 294:     /// This function tests some of the key functionality of the tokenizer.

 295:     /// It aborts the program is something goes awry, but has no side

 296:     /// effects if everything checks out

 297:     /// </summary>

 298:     public static void TestSuite()

 299:     {

 300:         string msg = "Tokenizer test failed";

 301:         string test = "Hello, World!";

 302:

 303:         Tokenizer tok = new Tokenizer(new TokenizerDataSourceProc(Tokenizer.StringReader), test);

 304:

 305:         Debug.Assert(tok.GetWord().CompareTo("Hello,") == 0, msg);

 306:         Debug.Assert(tok.EndOfLine == 0, msg);

 307:         Debug.Assert(tok.EndOfFile == 0, msg);

 308:         Debug.Assert(tok.GetWord().CompareTo("World!") == 0, msg);

 309:         Debug.Assert(tok.EndOfLine != 0, msg);

 310:         Debug.Assert(tok.EndOfFile != 0, msg);

 311:

 312:         test = "\n\nYogi # Bear ";

 313:         tok = new Tokenizer(new TokenizerDataSourceProc(Tokenizer.StringReader), test);

 314:         Debug.Assert(tok.GetWord().CompareTo("Yogi") == 0, msg);

 315:         Debug.Assert(tok.EndOfLine != 0, msg);

 316:         Debug.Assert(tok.EndOfFile != 0, msg);

 317:

 318:         test = "";

 319:         tok = new Tokenizer(new TokenizerDataSourceProc(Tokenizer.StringReader), test);

 320:         Debug.Assert(tok.GetWord().CompareTo("") == 0, msg);

 321:         Debug.Assert(tok.EndOfLine == 0, msg);

 322:         Debug.Assert(tok.EndOfFile != 0, msg);

 323:

 324:         test = "\n  \na\tb##b\n#d";

 325:         tok = new Tokenizer(new TokenizerDataSourceProc(Tokenizer.StringReader), test);

 326:         Debug.Assert(tok.GetWord().CompareTo("a") == 0, msg);

 327:         Debug.Assert(tok.EndOfLine == 0, msg);

 328:         Debug.Assert(tok.EndOfFile == 0, msg);

 329:

 330:         Debug.Assert(tok.GetWord().CompareTo("b") == 0, msg);

 331:         Debug.Assert(tok.EndOfLine != 0, msg);

 332:         Debug.Assert(tok.EndOfFile != 0, msg);

 333:     }

 334:

 335:     #endregion

 336: }

Whizzo's Development Blog

Friday, August 31, 2007

C# Tokenizer Class

No comments: