Bump Microsoft.NET.Test.Sdk from 16.8.0 to 16.8.3
using System;
using System.Collections.Generic;
using System.Text;
using System.Linq;

namespace TurnerSoftware.RobotsExclusionTools.Tokenization.TokenParsers
{
	public class RobotsEntryTokenParser : IRobotsFileTokenParser
	{
		private const string UserAgentField = "User-agent";
		private const string DisallowField = "Disallow";
		private const string AllowField = "Allow";
		private const string CrawlDelayField = "Crawl-delay";
		private const string SitemapField = "Sitemap";

		private static readonly HashSet<string> ExpectedFields = new HashSet<string>(StringComparer.InvariantCultureIgnoreCase)
		{
			UserAgentField,
			DisallowField,
			AllowField,
			CrawlDelayField
		};

		private class SiteAccessParseState
		{
			public List<string> UserAgents { get; } = new List<string>();
			public List<SiteAccessPathRule> PathRules { get; } = new List<SiteAccessPathRule>();
			public int? CrawlDelay { get; set; }

			public void Reset()
			{
				UserAgents.Clear();
				PathRules.Clear();
				CrawlDelay = null;
			}

			public SiteAccessEntry AsEntry()
			{
				return new SiteAccessEntry
				{
					UserAgents = new List<string>(UserAgents),
					PathRules = new List<SiteAccessPathRule>(PathRules),
					CrawlDelay = CrawlDelay
				};
			}
		}

		public IEnumerable<SiteAccessEntry> GetSiteAccessEntries(IEnumerable<Token> tokens)
		{
			var result = new List<SiteAccessEntry>();
			var parseState = new SiteAccessParseState();
			var comparer = StringComparer.OrdinalIgnoreCase;

			using (var enumerator = tokens.GetEnumerator())
			{
				string lastFieldValue = null;
				while (enumerator.MoveTo(TokenType.Field))
				{
					var fieldValue = enumerator.Current.Value;

					if (!ExpectedFields.Contains(fieldValue))
					{
						continue;
					}

					//Reset the state when we have encountered a new "User-agent" field not immediately after another
					if (!string.IsNullOrEmpty(lastFieldValue) && !comparer.Equals(lastFieldValue, UserAgentField) && comparer.Equals(fieldValue, UserAgentField))
					{
						result.Add(parseState.AsEntry());
						parseState.Reset();
					}

					//When we have seen a field for the first time that isn't a User-agent, default to all User-agents (written as "*")
					if (string.IsNullOrEmpty(lastFieldValue) && !comparer.Equals(fieldValue, UserAgentField))
					{
						parseState.UserAgents.Add("*");
					}

					lastFieldValue = fieldValue;

					if (comparer.Equals(fieldValue, UserAgentField))
					{
						if (enumerator.StepOverTo(TokenType.Value, TokenType.FieldValueDelimiter))
						{
							//NOTE: Doesn't evaluate the strict "agent" definition in Section 4 of RFC
							//      While trimming the value avoids some issues, it isn't a char-for-char accurate
							//      interpretation of the RFC and thus, is limited.
							parseState.UserAgents.Add(enumerator.Current.Value.Trim());
						}
					}
					else if (comparer.Equals(fieldValue, AllowField) || comparer.Equals(fieldValue, DisallowField))
					{
						var pathRule = comparer.Equals(fieldValue, DisallowField) ? PathRuleType.Disallow : PathRuleType.Allow;
						var pathValue = string.Empty;

						if (enumerator.StepOverTo(TokenType.Value, TokenType.FieldValueDelimiter))
						{
							//NOTE: As with the User-agent values, this doesn't evaluate the "strict" definition in the RFC
							//      Paths have specific limitations about characters that are and aren't allowed
							pathValue = enumerator.Current.Value.Trim();
						}

						if (pathRule == PathRuleType.Allow && string.IsNullOrEmpty(pathValue))
						{
							//Only disallow can be blank (no "Value" token) - See Section 4 of RFC
							continue;
						}

						parseState.PathRules.Add(new SiteAccessPathRule
						{
							RuleType = pathRule,
							Path = pathValue
						});
					}
					else if (comparer.Equals(fieldValue, CrawlDelayField))
					{
						if (enumerator.StepOverTo(TokenType.Value, TokenType.FieldValueDelimiter))
						{
							if (int.TryParse(enumerator.Current.Value, out var parsedCrawlDelay))
							{
								parseState.CrawlDelay = parsedCrawlDelay;
							}
						}
					}
				}

				result.Add(parseState.AsEntry());
			}

			return result;
		}

		public IEnumerable<SitemapUrlEntry> GetSitemapUrlEntries(IEnumerable<Token> tokens)
		{
			var result = new List<SitemapUrlEntry>();

			using (var enumerator = tokens.GetEnumerator())
			{
				while (enumerator.MoveTo(TokenType.Field, SitemapField))
				{
					if (enumerator.StepOverTo(TokenType.Value, TokenType.FieldValueDelimiter))
					{
						if (Uri.TryCreate(enumerator.Current.Value, UriKind.Absolute, out var createdUri))
						{
							result.Add(new SitemapUrlEntry
							{
								Sitemap = createdUri
							});
						}
					}
				}
			}

			return result;
		}
	}
}
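The parser above leans on two enumerator extension methods, MoveTo and StepOverTo, that are defined elsewhere in the library and not shown in this file. Below is a minimal sketch of how they could behave, assuming Token exposes TokenType and Value properties; the class name, namespaces, and exact semantics here are assumptions for illustration, not the library's actual implementation.

using System;
using System.Collections.Generic;
using TurnerSoftware.RobotsExclusionTools.Tokenization; // assumed namespace for Token/TokenType

// Hypothetical sketch only; the real extensions live elsewhere in the library.
internal static class TokenEnumeratorExtensionsSketch
{
	// Advance until the current token matches the given type (and, optionally, the
	// given value). Returns false when the token stream is exhausted first.
	public static bool MoveTo(this IEnumerator<Token> enumerator, TokenType tokenType, string value = null)
	{
		while (enumerator.MoveNext())
		{
			if (enumerator.Current.TokenType == tokenType &&
				(value == null || string.Equals(enumerator.Current.Value, value, StringComparison.OrdinalIgnoreCase)))
			{
				return true;
			}
		}
		return false;
	}

	// Advance to the next token of the target type, stepping over only the listed
	// "skippable" token types; any other token type aborts the search.
	public static bool StepOverTo(this IEnumerator<Token> enumerator, TokenType targetType, params TokenType[] stepOver)
	{
		while (enumerator.MoveNext())
		{
			if (enumerator.Current.TokenType == targetType)
			{
				return true;
			}
			if (Array.IndexOf(stepOver, enumerator.Current.TokenType) == -1)
			{
				return false;
			}
		}
		return false;
	}
}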
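For context, a rough usage sketch: feed tokenized robots.txt content through the parser and read back the grouped entries. The RobotsFileTokenizer name and its Tokenize call below are assumptions for illustration; the parser itself only needs an IEnumerable<Token> from whatever tokenizer the library provides.

using System;
using System.Collections.Generic;
using TurnerSoftware.RobotsExclusionTools.Tokenization;              // assumed namespace for Token
using TurnerSoftware.RobotsExclusionTools.Tokenization.TokenParsers;

class ParserUsageSketch
{
	static void Main()
	{
		var robotsText = "User-agent: *\nDisallow: /private/\nCrawl-delay: 10\nSitemap: https://example.org/sitemap.xml\n";

		// Hypothetical tokenizer call; substitute the library's actual tokenization API.
		IEnumerable<Token> tokens = new RobotsFileTokenizer().Tokenize(robotsText);

		var parser = new RobotsEntryTokenParser();

		// Each SiteAccessEntry groups a run of User-agent lines with the rules that follow them.
		foreach (var entry in parser.GetSiteAccessEntries(tokens))
		{
			Console.WriteLine($"Agents: {string.Join(", ", entry.UserAgents)}; Rules: {entry.PathRules.Count}; Crawl-delay: {entry.CrawlDelay}");
		}

		// Sitemap entries are collected separately and only kept when the URL parses as absolute.
		foreach (var sitemap in parser.GetSitemapUrlEntries(tokens))
		{
			Console.WriteLine($"Sitemap: {sitemap.Sitemap}");
		}
	}
}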