1
<?php
2
/**
3
 * This file is part of the Shieldon package.
4
 *
5
 * (c) Terry L. <contact@terryl.in>
6
 *
7
 * For the full copyright and license information, please view the LICENSE
8
 * file that was distributed with this source code.
9
 * 
10
 * php version 7.1.0
11
 * 
12
 * @category  Web-security
13
 * @package   Shieldon
14
 * @author    Terry Lin <contact@terryl.in>
15
 * @copyright 2019 terrylinooo
16
 * @license   https://github.com/terrylinooo/shieldon/blob/2.x/LICENSE MIT
17
 * @link      https://github.com/terrylinooo/shieldon
18
 * @see       https://shieldon.io
19
 */
20

21
declare(strict_types=1);
22

23
namespace Shieldon\Firewall\Component;
24

25
use Shieldon\Firewall\Component\ComponentProvider;
use Shieldon\Firewall\Component\AllowedTrait;
use Shieldon\Firewall\Component\DeniedTrait;
use Shieldon\Firewall\IpTrait;

use function Shieldon\Firewall\get_request;

use function array_column;
use function array_unique;
use function gethostbyname;
use function implode;
use function is_string;
use function preg_match;
use function preg_quote;
use function strlen;
use function strstr;
use function strtolower;
use function substr;
38

39
/**
40
 * TrustedBot component.
41
 */
42
class TrustedBot extends ComponentProvider
{
    /**
     *   Public methods       | Description
     *  ----------------------|---------------------------------------------
     *   setIp                | Set an IP address.
     *   getIp                | Get current set IP.
     *   setRdns              | Set a RDNS record for the check.
     *   getRdns              | Get IP resolved hostname.
     *  ----------------------|---------------------------------------------
     */
    use IpTrait;

    /**
     *   Public methods       | Description
     *  ----------------------|---------------------------------------------
     *   setAllowedItems      | Add items to the whitelist pool.
     *   setAllowedItem       | Add an item to the whitelist pool.
     *   getAllowedItems      | Get items from the whitelist pool.
     *   getAllowedItem       | Get an item from the whitelist pool.
     *   removeAllowedItem    | Remove an allowed item if exists.
     *   removeAllowedItems   | Remove all allowed items.
     *   hasAllowedItem       | Check if an allowed item exists.
     *   getAllowByPrefix     | Check if an allowed item exists have the same prefix.
     *   removeAllowByPrefix  | Remove allowed items with the same prefix.
     *   isAllowed            | Check if an item is allowed?
     *  ----------------------|---------------------------------------------
     */
    use AllowedTrait;

    /**
     *   Public methods       | Description
     *  ----------------------|---------------------------------------------
     *   setDeniedItems       | Add items to the blacklist pool.
     *   setDeniedItem        | Add an item to the blacklist pool.
     *   getDeniedItems       | Get items from the blacklist pool.
     *   getDeniedItem        | Get an item from the blacklist pool.
     *   removeDeniedItem     | Remove a denied item if exists.
     *   removeDeniedItems    | Remove all denied items.
     *   hasDeniedItem        | Check if a denied item exists.
     *   getDenyWithPrefix    | Check if a denied item exists have the same prefix.
     *   removeDenyWithPrefix | Remove denied items with the same prefix.
     *   isDenied             | Check if an item is denied?
     *  ----------------------|---------------------------------------------
     */
    use DeniedTrait;

    /**
     * Status code returned by getDenyStatusCode() for this component.
     */
    public const STATUS_CODE = 85;

    /**
     * Robot's user-agent text, read from the current request in the constructor.
     *
     * @var string
     */
    private $userAgent = '';

    /**
     * Is the current access a fake robot?
     * Only set to true by isAllowed().
     *
     * @var bool
     */
    private $isFake = false;

    /**
     * Constructor.
     *
     * Reads the `user-agent` header of the current request and seeds the
     * allowed list with the built-in trusted bots. Each entry pairs a
     * user-agent fragment with the hostname suffix its RDNS record must end with.
     */
    public function __construct()
    {
        $this->userAgent = get_request()->getHeaderLine('user-agent');

        $this->allowedList = [

            // Search engine: Google.
            'google_1' => [
                'userAgent' => 'google',
                'rdns'      => '.googlebot.com',
            ],

            'google_2' => [
                'userAgent' => 'google',
                'rdns'      => '.google.com',
            ],

            // Search engine: Microsoft.
            'bing_1' => [
                'userAgent' => 'live',
                'rdns'      => '.live.com',
            ],

            'bing_2' => [
                'userAgent' => 'msn',
                'rdns'      => '.msn.com',
            ],

            'bing_3' => [
                'userAgent' => 'bing',
                'rdns'      => '.bing.com',
            ],

            // Search engine: Yahoo.
            'yahoo_1' => [
                'userAgent' => 'inktomisearch',
                'rdns'      => '.inktomisearch.com',
            ],

            'yahoo_2' => [
                'userAgent' => 'yahoo',
                'rdns'      => '.yahoo.com',
            ],

            'yahoo_3' => [
                'userAgent' => 'yahoo',
                'rdns'      => '.yahoo.net',
            ],

            // Search engine: Yandex.
            'yandex_1' => [
                'userAgent' => 'yandex',
                'rdns'      => '.yandex.com',
            ],

            'yandex_2' => [
                'userAgent' => 'yandex',
                'rdns'      => '.yandex.net',
            ],

            'yandex_3' => [
                'userAgent' => 'yandex',
                'rdns'      => '.yandex.ru',
            ],

            // Facebook crawlers.
            'facebook' => [
                'userAgent' => 'facebook',
                'rdns'      => '.fbsv.net',
            ],

            // Twitter crawlers.
            'twitter' => [
                'userAgent' => 'Twitterbot',
                'rdns'      => '.twttr.com', // (not twitter.com)
            ],

            // W3C validation services.
            'w3' => [
                'userAgent' => 'w3.org',
                'rdns'      => '.w3.org',
            ],

            // Ask.com crawlers.
            'ask' => [
                'userAgent' => 'ask',
                'rdns'      => '.ask.com',
            ],
        ];

        $this->deniedList = [];
    }

    /**
     * Check the user-agent string and rdns in the trusted list.
     *
     * Returns false without flagging anything when the user-agent does not
     * mention any trusted bot. When it does, the RDNS hostname must end with
     * one of the trusted suffixes and, in strict mode, resolve back to the
     * current IP; otherwise the access is flagged as a fake robot.
     *
     * @return bool
     */
    public function isAllowed(): bool
    {
        $userAgents = $this->getAllowedColumn('userAgent');

        // An empty alternation would match every request, so bail out early
        // when there is nothing to compare against.
        if ($userAgents === []) {
            return false;
        }

        // Entries are quoted so that characters such as "." or "/" coming from
        // the list are matched literally instead of being read as regex syntax.
        $quoted = [];
        foreach ($userAgents as $fragment) {
            $quoted[] = preg_quote($fragment, '/');
        }

        if (!preg_match('/(' . implode('|', $quoted) . ')/i', $this->userAgent)) {
            // Okay, current request's user-agent string doesn't contain our
            // trusted bots' information. Ignore it.
            return false;
        }

        // The user-agent claims to be a trusted bot. To prevent a "fake" RDNS
        // such as "abc.google.com.fakedomain.com" from passing, the hostname
        // must *end* with one of the trusted suffixes.
        if (!$this->hostnameEndsWithAny($this->rdns, $this->getAllowedColumn('rdns'))) {
            $this->isFake = true;
            return false;
        }

        // The forward DNS lookup is only needed in strict mode.
        // If the IP is different from the hostname's resolved IP, it might be a fake bot.
        if ($this->strictMode && gethostbyname($this->rdns) !== $this->ip) {
            $this->isFake = true;
            return false;
        }

        return true;
    }

    /**
     * {@inheritDoc}
     *
     * True when the RDNS hostname ends with `.google.com` or `.googlebot.com`.
     *
     * @return bool
     */
    public function isGoogle(): bool
    {
        return $this->hostnameEndsWithAny($this->rdns, ['.google.com', '.googlebot.com']);
    }

    /**
     * {@inheritDoc}
     *
     * True when the RDNS hostname ends with `.yahoo.com` or `.yahoo.net`.
     *
     * @return bool
     */
    public function isYahoo(): bool
    {
        return $this->hostnameEndsWithAny($this->rdns, ['.yahoo.com', '.yahoo.net']);
    }

    /**
     * {@inheritDoc}
     *
     * True when the RDNS hostname ends with `.msn.com`, `.bing.com` or `.live.com`.
     *
     * @return bool
     */
    public function isBing(): bool
    {
        return $this->hostnameEndsWithAny($this->rdns, ['.msn.com', '.bing.com', '.live.com']);
    }

    /**
     * Not used in TrustedBots component.
     *
     * @return bool always false.
     */
    public function isDenied(): bool
    {
        return false;
    }

    /**
     * Check if the current access is a fake robot.
     * To get real value from this method, execution must be after `isAllowed`.
     *
     * @return bool
     */
    public function isFakeRobot(): bool
    {
        return $this->isFake;
    }

    /**
     * Unique deny status code.
     *
     * @return int
     */
    public function getDenyStatusCode(): int
    {
        return self::STATUS_CODE;
    }

    /**
     * Add new items to the allowed list.
     *
     * @param string $name      The key for this information.
     * @param string $useragent A piece of user-agent string that can identify.
     * @param string $rdns      The RDNS information of the bot.
     *
     * @return void
     */
    public function addTrustedBot(string $name, string $useragent, string $rdns)
    {
        $this->setAllowedItem(
            [
                'userAgent' => $useragent,
                'rdns' => $rdns,
            ],
            $name
        );
    }

    /**
     * Collect the distinct, non-empty string values of one column of the allowed list.
     *
     * Empty strings are dropped because they would match any input.
     *
     * @param string $column Either `userAgent` or `rdns`.
     *
     * @return string[]
     */
    private function getAllowedColumn(string $column): array
    {
        $values = [];

        foreach (array_unique(array_column($this->allowedList, $column)) as $value) {
            if (is_string($value) && $value !== '') {
                $values[] = $value;
            }
        }

        return $values;
    }

    /**
     * Check whether a hostname ends with one of the given suffixes.
     *
     * The comparison is case-insensitive.
     * For example, `abc.googlebot.com.fake` does not end with `.googlebot.com`,
     * so it is rejected, while `Crawl-1.GoogleBot.com` is accepted.
     *
     * @param string   $hostname The hostname to test.
     * @param string[] $suffixes The accepted hostname suffixes.
     *
     * @return bool
     */
    private function hostnameEndsWithAny(string $hostname, array $suffixes): bool
    {
        $hostname = strtolower($hostname);
        $hostnameLength = strlen($hostname);

        foreach ($suffixes as $suffix) {
            $suffix = strtolower($suffix);
            $suffixLength = strlen($suffix);

            if ($suffixLength === 0 || $suffixLength > $hostnameLength) {
                continue;
            }

            if (substr($hostname, -$suffixLength) === $suffix) {
                return true;
            }
        }

        return false;
    }
}

Read our documentation on viewing source code.

Loading