While translating a piece of C code that I needed for doing face detection, I ran into this handy class for reading configuration files, so I distilled it from the C library into C#. Normally you would use the .NET ConfigurationManager to read your app.config files and the like, but this class is handy for parsing, or creating, very simple flat files that contain some configuration info.
If you don't want to use XML, this is a very handy utility.
The tokenizer skips comments (introduced by '#'), whitespace, and blank lines.
It also has two properties that tell you when the file ends and when a line ends.
I'm sure there are other ways to do such things, but I find this one easy to use.
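To give you an idea of how it is used, here is a minimal sketch (the Tokenizer members are the ones defined in the class below; the loop itself is just my illustration and assumes the input holds at least one word):

// Tokenize an in-memory string using the StringReader data source
// that is built into the class below.
Tokenizer tok = new Tokenizer(
    new Tokenizer.TokenizerDataSourceProc(Tokenizer.StringReader),
    "scaleFactor 1.2 # tweak me");

while (tok.EndOfFile == 0)      // EndOfFile is asserted by the call
{                               // that returns the last word
    string word = tok.GetWord();
    Console.WriteLine(word);    // prints "scaleFactor", then "1.2"
}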
Enjoy ;)
using System;
using System.Text;
using System.Diagnostics;
using System.IO;

/******************************************************************************
 *                                 TOKENIZER                                  *
 ******************************************************************************/

/// <summary>
/// This class represents a tokenizer that reads words
/// from a data source (Stream or String).
/// </summary>
public class Tokenizer
{
    #region Fields & Delegates

    public delegate int TokenizerDataSourceProc(char[] buff, int nChars, object arg);

    // Shared position into the string data source. Note that it is static,
    // so only one string-backed tokenizer should be active at a time.
    private static int _stringPos;

    private int[,] stateTransitionTable, stateActionTable;

    private char[] in_buff = new char[512];
    private char[] word_buff = new char[100];
    private int state;
    private int endOfLine;
    private int endOfFile;
    private int availChars;
    private int iPos, wPos;
    private TokenizerDataSourceProc dataSource;
    private object dataSrcArg;

    #endregion

    #region Constructor(s)

    /// <summary>
    /// Initializes an instance of the tokenizer.
    /// </summary>
    /// <param name="source">
    /// A delegate that supplies data to the tokenizer.
    /// Two ready-made implementations are embedded in this class:
    /// - StreamReader
    /// - StringReader
    /// </param>
    /// <param name="arg">Data source for this tokenizer (Stream or String)</param>
    public Tokenizer(TokenizerDataSourceProc source, object arg)
    {
        _stringPos = 0;   // reset the shared string position for this instance
        this.endOfLine = 0;
        this.endOfFile = 0;
        this.state = 0;
        this.availChars = 0;
        this.dataSource = source;
        this.dataSrcArg = arg;

        this.stateTransitionTable = new int[7, 5] {
            {6, 0, 0, 1, 2}, /* State 0 */
            {6, 1, 0, 1, 1}, /* State 1 */
            {6, 3, 4, 5, 2}, /* State 2 */
            {6, 3, 4, 5, 2}, /* State 3 */
            {6, 4, 4, 5, 2}, /* State 4 */
            {6, 5, 4, 5, 5}, /* State 5 */
            {6, 6, 6, 6, 6}  /* State 6 */
        };

        /* Each state transition may be associated with one or more actions,
         * defined by these bits:
         *
         * 0x1 = CPY: Copy the current char to the word buffer
         * 0x2 = RET: Exit from loop and return contents of word buffer
         * 0x4 = EOF: Assert end-of-file flag
         * 0x8 = EOL: Assert end-of-line flag
         *
         * These actions may be combined:
         *
         * 0x3 = CPY + RET
         * 0x6 = RET + EOF
         * 0xB = CPY + RET + EOL
         * 0xE = RET + EOF + EOL
         */
        this.stateActionTable = new int[7, 5] {
            {0x6, 0x0, 0x0, 0x0, 0x1}, /* State 0 */
            {0x6, 0x0, 0x0, 0x0, 0x0}, /* State 1 */
            {0xE, 0x0, 0x0, 0x0, 0x1}, /* State 2 */
            {0xE, 0x0, 0x0, 0x0, 0x3}, /* State 3 */
            {0xE, 0x0, 0x0, 0x0, 0xB}, /* State 4 */
            {0xE, 0x0, 0x0, 0x0, 0x0}, /* State 5 */
            {0x6, 0x6, 0x6, 0x6, 0x6}  /* State 6 */
        };
    }

    #endregion

    #region Static Delegate Handler Functions

    /// <summary>
    /// Data source procedure for reading data from a file.
    /// This function is provided as an argument to the constructor.
    /// </summary>
    /// <param name="buff">Buffer that receives the characters</param>
    /// <param name="nChars">Maximum number of characters to read</param>
    /// <param name="arg">The StreamReader to read from</param>
    /// <returns>The number of characters actually read</returns>
    public static int StreamReader(char[] buff, int nChars, object arg)
    {
        StreamReader sr = arg as StreamReader;

        // The second argument of TextReader.Read is an index into 'buff',
        // not a position in the stream; the reader tracks its own position,
        // so the buffer is always filled from index 0.
        return sr.Read(buff, 0, nChars);
    }

    /// <summary>
    /// Data source function for reading data from a string.
    /// This function is provided as an argument to the constructor.
    /// </summary>
    /// <param name="buff">Buffer that receives the characters</param>
    /// <param name="nChars">Maximum number of characters to read</param>
    /// <param name="arg">The string to read from</param>
    /// <returns>The number of characters actually read</returns>
    public static int StringReader(char[] buff, int nChars, object arg)
    {
        string s = arg as string;
        int n = 0;

        // Copy characters from the current string position into the start
        // of the buffer; return 0 once the string is exhausted, which the
        // tokenizer interprets as end-of-file.
        while (n < nChars && _stringPos < s.Length)
            buff[n++] = s[_stringPos++];

        return n;
    }

    #endregion

    #region Public Methods

    /// <summary>
    /// Reads a single word from the data source.
    /// </summary>
    /// <returns>The next word in the data source</returns>
    /// <remarks>
    /// The tokenizer has the following properties:
    /// - Blank lines are ignored
    /// - Whitespace before or after words is ignored
    /// - Comments are introduced by a '#' sign
    /// - 'EndOfLine' can be used to test whether the word
    ///   was the last word of a line
    /// - 'EndOfFile' is guaranteed to be set if there
    ///   are no other words in the file and the next call to
    ///   'GetWord' would fail
    /// </remarks>
    public string GetWord()
    {
        char c;
        int charClass, emitWord, saveChar;

        /* The tokenizer is implemented as a seven-state state machine. State
         * zero is the initial state, state six is the exit state. The states
         * are roughly described as follows:
         *
         * 0 - Initial state: Eat whitespace
         * 1 - Eat comments #1
         * 2 - Save word characters
         * 3 - Eat whitespace
         * 4 - Eat whitespace and newlines
         * 5 - Eat comments #2
         * 6 - End-of-file reached
         *
         * States 1 and 5 are subtly different and represent knowledge about
         * whether there is a word in the word buffer which is waiting to be
         * returned. In general, the state machine can only return a word after
         * it has examined all the characters preceding the starting character
         * of the next word. This is because the tokenizer must properly
         * indicate that a word is the last valid word in the stream even if
         * that word happens to be followed by several empty lines and/or
         * comments. The need to defer this decision accounts for much of the
         * complexity of the state machine.
         *
         * The rows of the state transition table represent states while the
         * columns correspond to one of five character classes:
         *
         * 0 - EOF reached (not a character)
         * 1 - Whitespace (' ' or '\t')
         * 2 - Newlines ('\n' or '\r')
         * 3 - Comment initiator ('#')
         * 4 - Word char (everything else)
         */
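        /* As a concrete example (a trace I added for illustration), the
         * two-character input "a\n" walks through the tables like this:
         *
         *   state 0, class 4 ('a'):  action 0x1 -> copy 'a', go to state 2
         *   state 2, class 2 ('\n'): action 0x0 -> go to state 4
         *   state 4, class 0 (EOF):  action 0xE -> return "a" with the
         *                            EndOfFile and EndOfLine flags asserted
         */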

        /* Prepare to read a new word into word_buff; GetWord now returns
         * only the first wPos characters, so the buffer itself does not
         * need to be cleared */
        this.wPos = 0;

        /* Begin processing characters according to
         * the transition rules */
        while (true)
        {
            /* Read in a new chunk of data, if needed */
            if (this.availChars == 0)
            {
                this.availChars = this.dataSource(this.in_buff, this.in_buff.Length, this.dataSrcArg);
                this.iPos = 0;
            }

            /* Look at the current character and find its character class */
            c = this.in_buff[iPos];

            if (this.availChars == 0)
                charClass = 0;
            else if (c == ' ' || c == '\t')
                charClass = 1;
            else if (c == '\n' || c == '\r')
                charClass = 2;
            else if (c == '#')
                charClass = 3;
            else
                charClass = 4;

            /* Transition to the next state based on the character on input */
            saveChar = (stateActionTable[this.state, charClass] & 0x01);
            emitWord = (stateActionTable[this.state, charClass] & 0x02);
            this.endOfFile = (stateActionTable[this.state, charClass] & 0x04);
            this.endOfLine = (stateActionTable[this.state, charClass] & 0x08);
            this.state = stateTransitionTable[this.state, charClass];

            /* If we transitioned to an accepting state, break out
             * of the loop so that we can return the token to the caller */
            if (emitWord != 0)
                break;

            /* If the transition requires it, copy the character into the
             * word buffer */
            if (saveChar != 0)
            {
                this.word_buff[this.wPos++] = c;
            }

            /* Advance to the next character in the stream */
            this.iPos++;
            this.availChars--;
        }

        /* Return the contents of the word buffer to the caller. Unlike the
         * original C code we do not NUL-terminate; instead we build the
         * string from exactly the wPos characters that were copied */
        return new string(this.word_buff, 0, this.wPos);
    }

    #endregion

    #region Properties

    /// <summary>
    /// Queries whether the last call to 'GetWord' returned the last
    /// valid word of the file or stream
    /// </summary>
    public int EndOfFile
    {
        get { return this.endOfFile; }
    }

    /// <summary>
    /// Queries whether the last call to 'GetWord' returned the last
    /// word on a line (the last word of the file is always considered to
    /// be at the end of a line, whether or not it is followed by a newline)
    /// </summary>
    public int EndOfLine
    {
        get { return this.endOfLine; }
    }

    #endregion

    #region Test-Function

    /// <summary>
    /// This function tests some of the key functionality of the tokenizer.
    /// It aborts the program if something goes awry, but has no side
    /// effects if everything checks out.
    /// </summary>
    public static void TestSuite()
    {
        string msg = "Tokenizer test failed";
        string test = "Hello, World!";

        Tokenizer tok = new Tokenizer(new TokenizerDataSourceProc(Tokenizer.StringReader), test);

        Debug.Assert(tok.GetWord().CompareTo("Hello,") == 0, msg);
        Debug.Assert(tok.EndOfLine == 0, msg);
        Debug.Assert(tok.EndOfFile == 0, msg);
        Debug.Assert(tok.GetWord().CompareTo("World!") == 0, msg);
        Debug.Assert(tok.EndOfLine != 0, msg);
        Debug.Assert(tok.EndOfFile != 0, msg);

        test = "\n\nYogi # Bear ";
        tok = new Tokenizer(new TokenizerDataSourceProc(Tokenizer.StringReader), test);
        Debug.Assert(tok.GetWord().CompareTo("Yogi") == 0, msg);
        Debug.Assert(tok.EndOfLine != 0, msg);
        Debug.Assert(tok.EndOfFile != 0, msg);

        test = "";
        tok = new Tokenizer(new TokenizerDataSourceProc(Tokenizer.StringReader), test);
        Debug.Assert(tok.GetWord().CompareTo("") == 0, msg);
        Debug.Assert(tok.EndOfLine == 0, msg);
        Debug.Assert(tok.EndOfFile != 0, msg);

        test = "\n \na\tb##b\n#d";
        tok = new Tokenizer(new TokenizerDataSourceProc(Tokenizer.StringReader), test);
        Debug.Assert(tok.GetWord().CompareTo("a") == 0, msg);
        Debug.Assert(tok.EndOfLine == 0, msg);
        Debug.Assert(tok.EndOfFile == 0, msg);

        Debug.Assert(tok.GetWord().CompareTo("b") == 0, msg);
        Debug.Assert(tok.EndOfLine != 0, msg);
        Debug.Assert(tok.EndOfFile != 0, msg);
    }

    #endregion
}
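And to close the loop on the flat-file scenario from the introduction, here is the kind of wrapper I would write around the class. The file name settings.conf and the two-words-per-line layout are assumptions for the sake of the example, not part of the original library:

// Parse a flat configuration file of "name value" lines such as:
//
//   # face detection settings
//   scaleFactor   1.2
//   minNeighbors  3
//
using (StreamReader sr = new StreamReader("settings.conf")) // hypothetical file
{
    Tokenizer tok = new Tokenizer(
        new Tokenizer.TokenizerDataSourceProc(Tokenizer.StreamReader), sr);

    while (tok.EndOfFile == 0)
    {
        string name = tok.GetWord();   // first word of the line
        string value = tok.GetWord();  // second word; assumes exactly
                                       // two words per line
        Console.WriteLine(name + " = " + value);
    }
}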