1
<?php
2

3
namespace SilverStripe\TextExtraction\Rest;
4

5
use GuzzleHttp\Client;
6
use GuzzleHttp\Exception\RequestException;
7
use GuzzleHttp\Psr7\Response;
8
use Psr\Log\LoggerInterface;
9
use SilverStripe\Core\Convert;
10
use SilverStripe\Core\Environment;
11
use SilverStripe\Core\Injector\Injector;
12

13
/**
 * Guzzle-based HTTP client for talking to an Apache Tika server over REST.
 *
 * Optional HTTP basic-auth credentials are read from the SS_TIKA_USERNAME and
 * SS_TIKA_PASSWORD environment variables and attached to every request made
 * through this class.
 */
class TikaRestClient extends Client
{
    /**
     * Authentication options to be sent to the Tika server
     *
     * @var array
     */
    protected $options = ['username' => null, 'password' => null];

    /**
     * Cached result of {@see getSupportedMimes()}; empty until first fetched.
     *
     * @var array
     */
    protected $mimes = [];

    /**
     * Builds the client and picks up credentials from the environment.
     *
     * Credentials are only stored when SS_TIKA_PASSWORD is non-empty.
     *
     * @param string $baseUrl Base URI that all relative request paths are resolved against
     * @param array $config Additional Guzzle client configuration; 'base_uri' is overwritten with $baseUrl
     */
    public function __construct($baseUrl = '', $config = [])
    {
        $password = Environment::getEnv('SS_TIKA_PASSWORD');

        if (!empty($password)) {
            $this->options = [
                'username' => Environment::getEnv('SS_TIKA_USERNAME'),
                'password' => $password,
            ];
        }

        $config['base_uri'] = $baseUrl;

        parent::__construct($config);
    }

    /**
     * Detect if the service is available
     *
     * A request failure is logged at info level and reported as "not available".
     * Any response other than HTTP 200 is also reported as "not available".
     *
     * @return bool
     */
    public function isAvailable()
    {
        try {
            /** @var Response $result */
            $result = $this->get('/', $this->getGuzzleOptions());
        } catch (RequestException $ex) {
            $msg = sprintf("Tika unavailable - %s", $ex->getMessage());
            Injector::inst()->get(LoggerInterface::class)->info($msg);

            return false;
        }

        // Always return a real boolean; previously non-200 responses fell through and returned null
        return (int) $result->getStatusCode() === 200;
    }

    /**
     * Get version code
     *
     * @return string The version parsed from the server's "version" endpoint, or "0" if it could not be determined
     */
    public function getVersion()
    {
        /** @var Response $response */
        $response = $this->get('version', $this->getGuzzleOptions());
        $version = 0;

        // Parse output. The body is a stream object, so cast it before handing it to preg_match()
        if ((int) $response->getStatusCode() === 200
            && preg_match('/Apache Tika (?<version>[\.\d]+)/', (string) $response->getBody(), $matches)
        ) {
            $version = $matches['version'];
        }

        return (string) $version;
    }

    /**
     * Gets supported mime data. May include aliased mime types.
     *
     * The decoded response is cached on the instance, so the server is only
     * queried until a non-empty result has been obtained.
     *
     * @return array Decoded JSON from the "mime-types" endpoint, or an empty array if it could not be decoded
     */
    public function getSupportedMimes()
    {
        if ($this->mimes) {
            return $this->mimes;
        }

        /** @var Response $response */
        $response = $this->get(
            'mime-types',
            $this->getGuzzleOptions([
                'headers' => [
                    'Accept' => 'application/json',
                ],
            ])
        );

        $mimes = Convert::json2array((string) $response->getBody());

        // Undecodable JSON must not leak a non-array into the cache or to callers
        $this->mimes = is_array($mimes) ? $mimes : [];

        return $this->mimes;
    }

    /**
     * Extract text content from a given file.
     * Logs an info-level message if the file can't be read or the document can't be parsed.
     *
     * @param  string $file Full filesystem path to a file to post
     * @return string Content of the file extracted as plain text, or an empty string on failure
     */
    public function tika($file)
    {
        $logger = Injector::inst()->get(LoggerInterface::class);

        $contents = file_get_contents($file);
        if ($contents === false) {
            // Don't send an empty body to the server when the file could not be read
            $logger->info(sprintf('TikaRestClient was not able to read %s.', $file));

            return '';
        }

        $text = null;
        try {
            /** @var Response $response */
            $response = $this->put(
                'tika',
                $this->getGuzzleOptions([
                    'headers' => [
                        'Accept' => 'text/plain',
                    ],
                    'body' => $contents,
                ])
            );
            $text = $response->getBody();
        } catch (RequestException $e) {
            $errorResponse = $e->getResponse();

            if ($errorResponse !== null) {
                $msg = sprintf(
                    'TikaRestClient was not able to process %s. Response: %s %s.',
                    $file,
                    $errorResponse->getStatusCode(),
                    $errorResponse->getReasonPhrase()
                );
                // Only available if tika-server was started with --includeStack
                $body = (string) $errorResponse->getBody();
                if ($body !== '') {
                    $msg .= ' Body: ' . $body;
                }
            } else {
                // The request failed without any response being received, so there is no status to report
                $msg = sprintf(
                    'TikaRestClient was not able to process %s. No response received: %s',
                    $file,
                    $e->getMessage()
                );
            }

            $logger->info($msg);
        }

        return (string) $text;
    }

    /**
     * Assembles an array of request options to pass to Guzzle
     *
     * The 'auth' option is only added when both a username and a password are configured.
     *
     * @param array $options Authentication (etc) will be merged into this array and returned
     * @return array
     */
    protected function getGuzzleOptions($options = [])
    {
        if (!empty($this->options['username']) && !empty($this->options['password'])) {
            $options['auth'] = [
                $this->options['username'],
                $this->options['password']
            ];
        }
        return $options;
    }
}

Read our documentation on viewing source code .

Loading