I distilled this class from a C library while translating some code that I needed for face detection; along the way I ran into this handy class for reading configuration files. Normally you would use .NET's ConfigurationManager to read your app.config files and the like, but this class is useful for parsing files or for creating very simple flat files that contain some configuration information.
If you don't want to use XML, then this is a very handy utility.
This tokenizer skips comments (introduced by a # character), whitespace and blank lines.
It also has two properties that tell you when the file ends and when a line ends.
I'm sure that there are other possibilities to do such things, but I find this one easy to use.
Enjoy ;)
1: using System;
2: using System.Text;
3: using System.Diagnostics;
4: using System.IO;
   5:  6: /******************************************************************************
7: * TOKENIZER *
8: ******************************************************************************/
   9:  10: /// <summary>
11: /// This class represents a tokenizer to read words
12: /// from a datasource. (Stream or String)
13: /// </summary>
14: public class Tokenizer
  15: {16: #region Fields & Delegates
  17:  18: public delegate int TokenizerDataSourceProc(char[] buff, int nChars, object arg);
  19:  20: private static int _stringPos, _streamPos;
  21:  22: private int[,] stateTransitionTable, stateActionTable;
  23:  24: private char[] in_buff = new char[512];
25: private char[] word_buff = new char[100];
26: private int state;
27: private int endOfLine;
28: private int endOfFile;
29: private int availChars;
30: private int iPos, wPos;
31: private TokenizerDataSourceProc dataSource;
32: private object dataSrcArg;
  33:  34: #endregion
  35:  36: #region Constructor(s)
  37:  38: /// <summary>
39: /// Initializes an instance of the tokenizer.
40: /// </summary>
41: /// <param name="source">
42: /// The 'source' argument is a pointer to a function which returns data.
43: /// Currently two supported functions are embedded in this class.
44: /// - StreamReader
45: /// - StringReader
46: /// </param>
47: /// <param name="arg">Datasource for this tokenizer ( Stream or String )</param>
48: public Tokenizer(TokenizerDataSourceProc source, object arg)
  49:     {  50:         _streamPos = _stringPos = 0;51: this.endOfLine = 0;
52: this.endOfFile = 0;
53: this.state = 0;
54: this.availChars = 0;
55: this.dataSource = source;
56: this.dataSrcArg = arg;
  57:  58: this.stateTransitionTable = new int[7, 5] {
59: {6, 0, 0, 1, 2}, /* State 0 */
60: {6, 1, 0, 1, 1}, /* State 1 */
61: {6, 3, 4, 5, 2}, /* State 2 */
62: {6, 3, 4, 5, 2}, /* State 3 */
63: {6, 4, 4, 5, 2}, /* State 4 */
64: {6, 5, 4, 5, 5}, /* State 5 */
65: {6, 6, 6, 6, 6} /* State 6 */
  66:           };  67:  68: /* Each state transition may be associated with one or more actions,
69: * defined by these bits:
70: *
71: * 0x1 = CPY: Copy the current char to the word buffer
72: * 0x2 = RET: Exit from loop and return contents of word buffer
73: * 0x4 = EOF: Assert end-of-file flag
74: * 0x8 = EOL: Assert end-of-line flag
75: *
76: * These actions may be combined:
77: *
78: * 0x3 = CPY + RET
79: * 0x6 = RET + EOF
80: * 0xB = CPY + RET + EOL
81: * 0xE = RET + EOF + EOL
82: */
83: this.stateActionTable = new int[7, 5]{
84: {0x6, 0x0, 0x0, 0x0, 0x1}, /* State 0 */
85: {0x6, 0x0, 0x0, 0x0, 0x0}, /* State 1 */
86: {0xE, 0x0, 0x0, 0x0, 0x1}, /* State 2 */
87: {0xE, 0x0, 0x0, 0x0, 0x3}, /* State 3 */
88: {0xE, 0x0, 0x0, 0x0, 0xB}, /* State 4 */
89: {0xE, 0x0, 0x0, 0x0, 0x0}, /* State 5 */
90: {0x6, 0x6, 0x6, 0x6, 0x6} /* State 6 */
  91:           };  92:     }  93:  94: #endregion
  95:  96: #region Static Delegate Handler Functions
  97:  98: /// <summary>
99: /// Data source procedure for reading data from a file.
100: /// This function is provided as an argument to the constructor.
101: /// </summary>
102: /// <param name="buff"></param>
103: /// <param name="nChars"></param>
104: /// <param name="arg"></param>
105: /// <returns></returns>
106: public static int StreamReader(char[] buff, int nChars, object arg)
 107:     {108: StreamReader sr = arg as StreamReader;
109: int n = sr.Read(buff, _streamPos, nChars);
 110:         _streamPos += n;111: return n;
 112:     } 113:  114: /// <summary>
115: /// Data source function for reading data from a null terminated string.
116: /// This function is provided as an argument to the constructor.
117: /// </summary>
118: /// <param name="buff"></param>
119: /// <param name="nChars"></param>
120: /// <param name="arg"></param>
121: /// <returns></returns>
122: public static int StringReader(char[] buff, int nChars, object arg)
 123:     {124: string c = arg as string;
125: int n;
 126:  127: for (n = _stringPos; n < nChars && n < c.Length; n++)
 128:             buff[n] = c[n]; 129:  130: if (_stringPos != n)
 131:             _stringPos += n;132: else
 133:             n = 0; 134:  135: return n;
 136:     } 137:  138: #endregion
 139:  140: #region Public Methods
 141:  142: /// <summary>
143: /// Reads a single word from the stream. The string returned by this
144: /// function is guaranteed to be valid only until the next invocation
145: /// of 'GetWord'.
146: /// </summary>
147: /// <returns>The next word in the datasource</returns>
148: /// <remarks>
149: /// The tokenizer has the following properties:
150: /// - Blank lines are ignored
151: /// - Whitespace before or after words are ignored
152: /// - Comments can be introduced by a '#' sign
153: /// - 'EndOfLine' can be used to test whether the word
154: /// was the last word of a line
155: /// - 'EndOfFile' is guaranteed to return true if there
156: /// are no other words in the file and the next call to
157: /// 'GetWord' would fail
158: /// </remarks>
159: public string GetWord()
 160:     {161: char c;
162: int charClass, emitWord, saveChar;
 163:  164: /* The tokenizer is implemented as a seven-state state machine. State
165: * zero is the inital state, state six is the exit state. The states
166: * are roughly described as follows:
167: *
168: * 0 - Inital state: Eat whitespace
169: * 1 - Eat comments #1
170: * 2 - Save word characters
171: * 3 - Eat whitespace
172: * 4 - Eat whitespace and newlines
173: * 5 - Eat comments #2
174: * 6 - End-of-file reached
175: *
176: * States 1 and 5 are subtely different and represent knowledge about
177: * whether there is a word in the word buffer which is waiting to be
178: * returned. In general, the state machine can only return a word after
179: * it has examined all the characters preceeding the starting character
180: * of the next word. This is because the tokenizer must properly
181: * indicate that a word is the last valid word in the stream even if
182: * that word happens to be followed by several empty lines and/or
183: * comments. The need to defer this decision accounts for much of the
184: * complexity of the state machine.
185: *
186: * The rows of the state transition table represent states while the
187: * columns corresponds to one of five character classes:
188: *
189: * 0 - Eof reached (not a character)
190: * 1 - whitespace (' ' or '\t')
191: * 2 - newlines ('\n' or '\r')
192: * 3 - Comment initiator ('#')
193: * 4 - Word char (everything else)
194: */
 195:   196:  197: /* Prepare to read new word into word_buff */
198: this.word_buff = new char[this.word_buff.Length];
199: this.wPos = 0;// tok->word_buff;
 200:  201: /* Begin processing characters according to
202: * the transition rules */
 203:  204: while (true)
 205:         {206: /* Read in a new chunk of data, if needed */
 207:  208: if (this.availChars == 0)
 209:             {210: this.availChars = this.dataSource(this.in_buff, this.in_buff.Length, this.dataSrcArg);
211: this.iPos = 0;
 212:             } 213:  214: /* Look at the current character and find its character class */
 215:  216: c = this.in_buff[iPos];
 217:  218: if (this.availChars == 0)
 219:                 charClass = 0;220: else if (c == ' ' || c == '\t')
 221:                 charClass = 1;222: else if (c == '\n' || c == '\r')
 223:                 charClass = 2;224: else if (c == '#')
 225:                 charClass = 3;226: else
 227:                 charClass = 4; 228:  229: /* Transition to next state based on the character on input */
 230:  231: saveChar = (stateActionTable[this.state, charClass] & 0x01);
232: emitWord = (stateActionTable[this.state, charClass] & 0x02);
233: this.endOfFile = (stateActionTable[this.state, charClass] & 0x04);
234: this.endOfLine = (stateActionTable[this.state, charClass] & 0x08);
235: this.state = stateTransitionTable[this.state, charClass];
 236:  237: /* If we transitioned to an accepting state, break out
238: * of the loop so that we can return the token to the caller
239: */
 240:  241: if (emitWord != 0)
242: break;
 243:  244: /* If the transition requires it, copy the character into the
245: * word buffer */
 246:  247: if (saveChar != 0)
 248:             {249: this.word_buff[this.wPos++] = c;
 250:             }251: /* Advance to the next character in the stream */
252: this.iPos++;
253: this.availChars--;
 254:         } 255:  256: /* Add a NULL terminator, then return the contents of the word
257: * buffer to the caller */
 258:  259: this.word_buff[wPos] = '\0';
260: return new string(this.word_buff);
 261:     } 262:  263: #endregion
 264:  265: #region Properties
 266:  267: /// <summary>
268: /// Queries whether the last call to 'GetWord' returned the last
269: /// valid word of the file or stream
270: /// </summary>
271: public int EndOfFile
 272:     { 273:         get 274:         {275: return this.endOfFile;
 276:         } 277:     } 278:  279: /// <summary>
280: /// Queries whether the last call to 'GetWord' returned the last
281: /// word on a line (the last word of the file is always said to be on the
282: /// end of the line, whether or not that line is followed by a newline)
283: /// </summary>
284: public int EndOfLine
 285:     {286: get { return this.endOfLine; }
 287:     } 288:  289: #endregion
 290:  291: #region Test-Function
 292:  293: /// <summary>
294: /// This function tests some of the key functionality of the tokenizer.
295: /// It aborts the program is something goes awry, but has no side
296: /// effects if everything checks out
297: /// </summary>
298: public static void TestSuite()
 299:     {300: string msg = "Tokenizer test failed";
301: string test = "Hello, World!";
 302:  303: Tokenizer tok = new Tokenizer(new TokenizerDataSourceProc(Tokenizer.StringReader), test);
 304:  305: Debug.Assert(tok.GetWord().CompareTo("Hello,") == 0, msg);
 306:         Debug.Assert(tok.EndOfLine == 0, msg); 307:         Debug.Assert(tok.EndOfFile == 0, msg);308: Debug.Assert(tok.GetWord().CompareTo("World!") == 0, msg);
 309:         Debug.Assert(tok.EndOfLine != 0, msg); 310:         Debug.Assert(tok.EndOfFile != 0, msg); 311:  312: test = "\n\nYogi # Bear ";
313: tok = new Tokenizer(new TokenizerDataSourceProc(Tokenizer.StringReader), test);
314: Debug.Assert(tok.GetWord().CompareTo("Yogi") == 0, msg);
 315:         Debug.Assert(tok.EndOfLine != 0, msg); 316:         Debug.Assert(tok.EndOfFile != 0, msg); 317:  318: test = "";
319: tok = new Tokenizer(new TokenizerDataSourceProc(Tokenizer.StringReader), test);
320: Debug.Assert(tok.GetWord().CompareTo("") == 0, msg);
 321:         Debug.Assert(tok.EndOfLine == 0, msg); 322:         Debug.Assert(tok.EndOfFile != 0, msg); 323:  324: test = "\n \na\tb##b\n#d";
325: tok = new Tokenizer(new TokenizerDataSourceProc(Tokenizer.StringReader), test);
326: Debug.Assert(tok.GetWord().CompareTo("a") == 0, msg);
 327:         Debug.Assert(tok.EndOfLine == 0, msg); 328:         Debug.Assert(tok.EndOfFile == 0, msg); 329:  330: Debug.Assert(tok.GetWord().CompareTo("b") == 0, msg);
 331:         Debug.Assert(tok.EndOfLine != 0, msg); 332:         Debug.Assert(tok.EndOfFile != 0, msg); 333:     } 334:  335: #endregion
 336: } 
 
No comments:
Post a Comment