1
using System;
2
using System.Collections.Generic;
3
using System.Text;
4

5
namespace TurnerSoftware.RobotsExclusionTools.Tokenization.Tokenizers
{
	/// <summary>
	/// Tokenizer for robots.txt files. Supplies the regex-based token definitions
	/// (comments, fields, field/value delimiters, values, new lines and blanks)
	/// to <see cref="TokenizerBase"/>.
	/// </summary>
	public class RobotsFileTokenizer : TokenizerBase
	{
		/// <summary>
		/// Token definitions shared by every instance of this tokenizer.
		/// </summary>
		/// <remarks>
		/// Every pattern starts with the <c>\G</c> anchor, which in .NET regular expressions
		/// requires the match to begin exactly at the position where matching starts.
		/// NOTE(review): whether the declaration order is significant depends on how
		/// <see cref="TokenizerBase"/> evaluates the definitions - confirm before reordering.
		/// </remarks>
		private static readonly IEnumerable<TokenDefinition> TokenDefinitions = new[]
		{
			//Regex based on documented standard: http://www.robotstxt.org/norobots-rfc.txt

			//Token (aka field) is any character except CTLs or "tspecials" (see Page 6 of RFC listed above - also documented in RFC 1945)
			//Valid is between \x21 and \x7e (inclusive) but excluding the following:
			//\x22 = "
			//\x28 = (
			//\x29 = )
			//\x2c = ,
			//\x2f = /
			//\x3a = :
			//\x3b = ;
			//\x3c = <
			//\x3d = =
			//\x3e = >
			//\x3f = ?
			//\x40 = @
			//\x5b = [
			//\x5c = \
			//\x5d = ]
			//\x7b = {
			//\x7d = }
			//
			//This can be expressed as the following (\x30-\x39 covers the digits 0-9, which are valid token characters):
			//\x21\x23-\x27\x2a\x2b\x2d\x2e\x30-\x39\x41-\x5a\x5e-\x7a\x7c\x7e

			//NOTE(review): the Field, FieldValueDelimiter and Value patterns below all require a colon followed by
			//exactly one space. Lines written as "Field:value" (no space) will not match them - confirm this is intended.

			//Comment: a "#" followed by everything up to (but not including) the next CR or LF
			new TokenDefinition(TokenType.Comment, @"\G#[^\x0A\x0D]*"),
			//Field: one or more token characters, only when immediately followed by ": " (the lookahead does not consume it)
			new TokenDefinition(TokenType.Field, @"\G[\x21\x23-\x27\x2a\x2b\x2d\x2e\x30-\x39\x41-\x5a\x5e-\x7a\x7c\x7e]+(?=:[ ])"),
			//Field/value delimiter: a colon followed by a single space
			new TokenDefinition(TokenType.FieldValueDelimiter, @"\G:[ ]"),
			//Value: one or more characters other than CR, LF or "#", only when immediately preceded by ": "
			new TokenDefinition(TokenType.Value, @"\G(?<=:[ ])[^\x0A\x0D#]+"),
			//New line: LF, optionally preceded by CR
			new TokenDefinition(TokenType.NewLine, @"\G\x0D?\x0A"),
			//Blank: one or more spaces or tabs
			new TokenDefinition(TokenType.Blank, @"\G[ \t]+")
		};

		/// <summary>
		/// Returns the token definitions used to tokenize a robots.txt file.
		/// </summary>
		/// <returns>The shared, statically-initialized set of token definitions.</returns>
		protected override IEnumerable<TokenDefinition> GetTokenDefinitions()
		{
			return TokenDefinitions;
		}
	}
}

Read our documentation on viewing source code .

Loading