1
using System;
2
using System.Collections.Generic;
3
using System.Text;
4
using System.Linq;
5

6
namespace TurnerSoftware.RobotsExclusionTools.Tokenization.TokenParsers
7
{
8
	/// <summary>
	/// Token parser that turns a stream of robots.txt <see cref="Token"/>s into
	/// site access entries (User-agent / Allow / Disallow / Crawl-delay groups)
	/// and sitemap URL entries.
	/// </summary>
	public class RobotsEntryTokenParser : IRobotsFileTokenParser
	{
		private const string UserAgentField = "User-agent";
		private const string DisallowField = "Disallow";
		private const string AllowField = "Allow";
		private const string CrawlDelayField = "Crawl-delay";
		private const string SitemapField = "Sitemap";

		/// <summary>
		/// Single comparer used both to filter and to dispatch on field names.
		/// Field names are protocol identifiers, so they are compared ordinally
		/// (ignoring case) rather than with culture-aware rules. Using the same
		/// comparer everywhere guarantees that any field accepted by
		/// <see cref="ExpectedFields"/> is also handled by one of the branches
		/// in <see cref="GetSiteAccessEntries"/>.
		/// </summary>
		private static readonly StringComparer FieldComparer = StringComparer.OrdinalIgnoreCase;

		/// <summary>
		/// Field names that contribute to a site access entry. Anything else
		/// (including "Sitemap") is skipped by <see cref="GetSiteAccessEntries"/>.
		/// </summary>
		private static readonly HashSet<string> ExpectedFields = new HashSet<string>(FieldComparer)
		{
			UserAgentField,
			DisallowField,
			AllowField,
			CrawlDelayField
		};

		/// <summary>
		/// Mutable accumulator for the entry currently being parsed.
		/// </summary>
		private class SiteAccessParseState
		{
			public List<string> UserAgents { get; } = new List<string>();
			public List<SiteAccessPathRule> PathRules { get; } = new List<SiteAccessPathRule>();
			public int? CrawlDelay { get; set; }

			/// <summary>
			/// Clears all accumulated values so the instance can be reused for the next entry.
			/// </summary>
			public void Reset()
			{
				UserAgents.Clear();
				PathRules.Clear();
				CrawlDelay = null;
			}

			/// <summary>
			/// Creates a <see cref="SiteAccessEntry"/> from the current state.
			/// The lists are copied so a later <see cref="Reset"/> does not affect the returned entry.
			/// </summary>
			public SiteAccessEntry AsEntry()
			{
				return new SiteAccessEntry
				{
					UserAgents = new List<string>(UserAgents),
					PathRules = new List<SiteAccessPathRule>(PathRules),
					CrawlDelay = CrawlDelay
				};
			}
		}

		/// <summary>
		/// Builds the site access entries described by <paramref name="tokens"/>.
		/// </summary>
		/// <param name="tokens">The tokenized robots file.</param>
		/// <returns>
		/// One entry per group of rules. A new group starts when a "User-agent" field
		/// follows a non-"User-agent" field. The state remaining after the last token
		/// is always appended, so the result contains at least one entry.
		/// </returns>
		public IEnumerable<SiteAccessEntry> GetSiteAccessEntries(IEnumerable<Token> tokens)
		{
			var result = new List<SiteAccessEntry>();
			var parseState = new SiteAccessParseState();

			using (var enumerator = tokens.GetEnumerator())
			{
				string lastFieldValue = null;

				//NOTE(review): MoveTo/StepOverTo are helpers defined elsewhere in the project;
				//		presumably MoveTo advances to the next token of the given type and
				//		StepOverTo advances to the Value token skipping only the delimiter - confirm.
				while (enumerator.MoveTo(TokenType.Field))
				{
					var fieldValue = enumerator.Current.Value;

					if (!ExpectedFields.Contains(fieldValue))
					{
						continue;
					}

					var isUserAgentField = FieldComparer.Equals(fieldValue, UserAgentField);
					var isFirstExpectedField = string.IsNullOrEmpty(lastFieldValue);

					//Reset the state when we have encountered a new "User-agent" field not immediately after another
					if (!isFirstExpectedField && isUserAgentField && !FieldComparer.Equals(lastFieldValue, UserAgentField))
					{
						result.Add(parseState.AsEntry());
						parseState.Reset();
					}

					//When we have seen a field for the first time that isn't a User-agent, default to all User-agents (written as "*")
					if (isFirstExpectedField && !isUserAgentField)
					{
						parseState.UserAgents.Add("*");
					}

					lastFieldValue = fieldValue;

					if (isUserAgentField)
					{
						if (enumerator.StepOverTo(TokenType.Value, TokenType.FieldValueDelimiter))
						{
							//NOTE: Doesn't evaluate the strict "agent" definition in Section 4 of RFC
							//		While trimming the value avoids some issues, it isn't a char-for-char accurate
							//		interpretation of the RFC and thus, is limited.
							parseState.UserAgents.Add(enumerator.Current.Value.Trim());
						}
					}
					else if (FieldComparer.Equals(fieldValue, AllowField) || FieldComparer.Equals(fieldValue, DisallowField))
					{
						var pathRule = FieldComparer.Equals(fieldValue, DisallowField) ? PathRuleType.Disallow : PathRuleType.Allow;
						var pathValue = string.Empty;

						if (enumerator.StepOverTo(TokenType.Value, TokenType.FieldValueDelimiter))
						{
							//NOTE: As with the User-agent values, this doesn't evaluate the "strict" definition in the RFC
							//		Paths have specific limitations about characters that are and aren't allowed
							pathValue = enumerator.Current.Value.Trim();
						}

						if (pathRule == PathRuleType.Allow && string.IsNullOrEmpty(pathValue))
						{
							//Only disallow can be blank (no "Value" token) - See Section 4 of RFC
							continue;
						}

						parseState.PathRules.Add(new SiteAccessPathRule
						{
							RuleType = pathRule,
							Path = pathValue
						});
					}
					else if (FieldComparer.Equals(fieldValue, CrawlDelayField))
					{
						if (enumerator.StepOverTo(TokenType.Value, TokenType.FieldValueDelimiter))
						{
							//The value is machine-readable data, so parse it culture-invariantly
							//rather than with whatever culture the current thread happens to use.
							//Values that are not a valid integer are ignored.
							if (int.TryParse(
								enumerator.Current.Value,
								System.Globalization.NumberStyles.Integer,
								System.Globalization.CultureInfo.InvariantCulture,
								out var parsedCrawlDelay))
							{
								parseState.CrawlDelay = parsedCrawlDelay;
							}
						}
					}
				}

				//Flush whatever was accumulated for the final (or only) group
				result.Add(parseState.AsEntry());
			}

			return result;
		}

		/// <summary>
		/// Collects the sitemap URLs declared in <paramref name="tokens"/>.
		/// </summary>
		/// <param name="tokens">The tokenized robots file.</param>
		/// <returns>
		/// One entry for each "Sitemap" field whose value parses as an absolute URI.
		/// Fields without a value, or with a value that is not an absolute URI, are skipped.
		/// </returns>
		public IEnumerable<SitemapUrlEntry> GetSitemapUrlEntries(IEnumerable<Token> tokens)
		{
			var result = new List<SitemapUrlEntry>();

			using (var enumerator = tokens.GetEnumerator())
			{
				while (enumerator.MoveTo(TokenType.Field, SitemapField))
				{
					if (!enumerator.StepOverTo(TokenType.Value, TokenType.FieldValueDelimiter))
					{
						continue;
					}

					if (Uri.TryCreate(enumerator.Current.Value, UriKind.Absolute, out var createdUri))
					{
						result.Add(new SitemapUrlEntry
						{
							Sitemap = createdUri
						});
					}
				}
			}

			return result;
		}
	}
157
}

Read our documentation on viewing source code.

Loading