1
using System;
2
using System.Collections.Generic;
3
using System.IO;
4
using System.Text;
5
using System.Text.RegularExpressions;
6
using System.Threading;
7
using System.Threading.Tasks;
8

9
namespace TurnerSoftware.RobotsExclusionTools.Tokenization
10
{
11
	/// <summary>
	/// Base class for tokenizers. Text is scanned left to right and, at each offset, the definitions
	/// returned by <see cref="GetTokenDefinitions"/> are tried in order until one matches.
	/// Tokenizer based on Jack Vanlightly's "Simple Tokenizer" article: https://jack-vanlightly.com/blog/2016/2/3/creating-a-simple-tokenizer-lexer-in-c
	/// </summary>
	public abstract class TokenizerBase : ITokenizer
	{
		/// <summary>
		/// Supplies the token definitions to try at each offset. Order matters: the first
		/// definition that matches at an offset is the one used.
		/// </summary>
		/// <returns>The token definitions, in priority order.</returns>
		protected abstract IEnumerable<TokenDefinition> GetTokenDefinitions();

		/// <summary>
		/// Tokenizes the whole of <paramref name="text"/> in a single pass.
		/// Unlike the <see cref="TextReader"/> overloads, no <see cref="Token.NewLineToken"/> is appended by this method.
		/// </summary>
		/// <param name="text">The text to tokenize.</param>
		/// <returns>The tokens found, in the order they appear in the text.</returns>
		/// <exception cref="ArgumentNullException"><paramref name="text"/> is <c>null</c>.</exception>
		public IEnumerable<Token> Tokenize(string text)
		{
			if (text == null)
			{
				throw new ArgumentNullException(nameof(text));
			}

			var tokens = new List<Token>();
			Tokenize(text, tokens);
			return tokens;
		}

		/// <summary>
		/// Tokenizes <paramref name="reader"/> line by line until it is exhausted.
		/// A <see cref="Token.NewLineToken"/> is appended after every line read.
		/// </summary>
		/// <param name="reader">The reader to consume. It is not disposed by this method.</param>
		/// <returns>The tokens found, in the order they appear in the input.</returns>
		/// <exception cref="ArgumentNullException"><paramref name="reader"/> is <c>null</c>.</exception>
		public IEnumerable<Token> Tokenize(TextReader reader)
		{
			if (reader == null)
			{
				throw new ArgumentNullException(nameof(reader));
			}

			var tokens = new List<Token>();
			string line;
			while ((line = reader.ReadLine()) != null)
			{
				Tokenize(line, tokens);
				tokens.Add(Token.NewLineToken);
			}
			return tokens;
		}

		/// <summary>
		/// Asynchronously tokenizes <paramref name="reader"/> line by line until it is exhausted.
		/// A <see cref="Token.NewLineToken"/> is appended after every line read.
		/// </summary>
		/// <param name="reader">The reader to consume. It is not disposed by this method.</param>
		/// <param name="cancellationToken">Checked after each line is read, before that line is tokenized.</param>
		/// <returns>A task producing the tokens found, in the order they appear in the input.</returns>
		/// <exception cref="ArgumentNullException"><paramref name="reader"/> is <c>null</c>.</exception>
		/// <exception cref="OperationCanceledException"><paramref name="cancellationToken"/> was cancelled.</exception>
		public async Task<IEnumerable<Token>> TokenizeAsync(TextReader reader, CancellationToken cancellationToken = default)
		{
			if (reader == null)
			{
				throw new ArgumentNullException(nameof(reader));
			}

			var tokens = new List<Token>();
			string line;
			// ConfigureAwait(false): nothing below needs to resume on the caller's synchronization context.
			while ((line = await reader.ReadLineAsync().ConfigureAwait(false)) != null)
			{
				cancellationToken.ThrowIfCancellationRequested();
				Tokenize(line, tokens);
				tokens.Add(Token.NewLineToken);
			}
			return tokens;
		}

		/// <summary>
		/// Scans <paramref name="text"/> and appends the tokens found to <paramref name="tokenCollection"/>.
		/// Any unmatched characters sitting between the previous token and a newly matched one are
		/// emitted first as a single <see cref="TokenType.NotDefined"/> token.
		/// </summary>
		/// <param name="text">The text to scan; must not be <c>null</c>.</param>
		/// <param name="tokenCollection">The collection that receives the tokens.</param>
		private void Tokenize(string text, ICollection<Token> tokenCollection)
		{
			var numberOfChars = text.Length;
			if (numberOfChars == 0)
			{
				return;
			}

			// Fetch the definitions once per call instead of once per character offset.
			var tokenDefinitions = GetTokenDefinitions();

			var offset = 0;
			var lastTokenEnd = 0;

			while (offset < numberOfChars)
			{
				var match = FindMatch(tokenDefinitions, text, offset);
				if (!match.IsMatch)
				{
					// Nothing matches here; the character becomes part of the pending unmatched run.
					offset++;
					continue;
				}

				if (lastTokenEnd != offset)
				{
					tokenCollection.Add(new Token(
						TokenType.NotDefined,
						text.Substring(lastTokenEnd, offset - lastTokenEnd)
					));
				}

				tokenCollection.Add(new Token(match.TokenType, match.Value));
				offset += match.MatchLength;
				lastTokenEnd = offset;
			}

			// NOTE(review): unmatched characters after the last match (lastTokenEnd < numberOfChars)
			// are dropped rather than emitted as NotDefined. Kept as-is because emitting them would
			// change the token stream seen by existing consumers - confirm whether this is intended.
		}

		/// <summary>
		/// Returns the first match, in definition order, that consumes at least one character
		/// at <paramref name="offset"/>, or <see cref="TokenMatch.NoMatch"/> when there is none.
		/// </summary>
		/// <param name="tokenDefinitions">The definitions to try, in priority order.</param>
		/// <param name="text">The text being scanned.</param>
		/// <param name="offset">The position in <paramref name="text"/> to match at.</param>
		private static TokenMatch FindMatch(IEnumerable<TokenDefinition> tokenDefinitions, string text, int offset)
		{
			foreach (var tokenDefinition in tokenDefinitions)
			{
				var match = tokenDefinition.Match(text, offset);

				// A zero-length match would never advance the caller's offset and would loop forever,
				// so it is treated the same as no match.
				if (match.IsMatch && match.MatchLength > 0)
				{
					return match;
				}
			}

			return TokenMatch.NoMatch;
		}
	}
95
}

Read our documentation on viewing source code.

Loading