Assembla home | Assembla project page
 

root/trunk/Phergie/Plugin/Url.php

Revision 279, 21.2 kB (checked in by Slynderdale, 2 months ago)

--

Line 
1 <?php
2
3 /**
4  * Monitors incoming messages for instances of URLs and responds with messages
5  * containing relevant information about detected URLs.
6  *
7  * Has a utility method accessible through $this->getPlugin('Url')->getTitle('http://foo..')
8  */
9 class Phergie_Plugin_Url extends Phergie_Plugin_Abstract_Base
10 {
11     /**
12      * Links output format
13      *
14      * Can use the variables %nick%, %title% and %link% in it to display page titles
15      * and links
16      *
17      * @var string
18      */
19     protected $baseFormat = '%nick%: %message%';
20     protected $messageFormat = '[ %link% ] %title%';
21
22     /**
23      * Merged link output
24      *
25      * If true, then multiple posted links will be merged into one line
26      *
27      * @var bool
28      */
29     protected $mergeLinks = true;
30
31     /**
32      * Max length of the fetched URL title
33      *
34      * @var int
35      */
36     protected $titleLength = 40;
37
38     /**
39      * Url cache to prevent spamming, especially with multiple bots on the same channel
40      */
41     protected $urlCache = array();
42     protected $tinyCache = array();
43
44     /**
45      * The time in seconds to store the cached entries
46      * Setting it to 0 or below disables the cache expiration
47      */
48     protected $expire = 1800;
49
50     /**
51      * The number of entries to keep in the cache at one time per channel
52      * Setting it to 0 or below disables the cache limit
53      */
54     protected $limit = 10;
55
56     /**
57      * Determines whether the plugin falls back to a plain http stream when
58      * an https URL is encountered but OpenSSL isn't available. When the
59      * fallback is disabled, such URLs are skipped instead.
60      */
61     protected $sslFallback = true;
62
63     /**
64      * Set to true by the custom error handler if an HTTP error code has been received
65      *
66      * @var boolean
67      */
68     protected $errorStatus = false;
69     protected $errorMessage = null;
70
71     /**
72      * Whether or not to display error messages as the title if a link posted
73      * encounters an error.
74      *
75      * @var boolean
76      */
77     protected $showErrors = true;
78
79     /**
80      * Whether or not to detect schemeless urls (i.e. "example.com")
81      *
82      * @var boolean
83      */
84     protected $detectSchemeless = false;
85
86     /**
87      * List of HTTP errors to return when the requested URL returns an HTTP error
88      *
89      * @var array
90      */
91     protected $httpErrors = array(
92         100 => '100 Continue',
93         200 => '200 OK',
94         201 => '201 Created',
95         204 => '204 No Content',
96         206 => '206 Partial Content',
97         300 => '300 Multiple Choices',
98         301 => '301 Moved Permanently',
99         302 => '302 Found',
100         303 => '303 See Other',
101         304 => '304 Not Modified',
102         307 => '307 Temporary Redirect',
103         400 => '400 Bad Request',
104         401 => '401 Unauthorized',
105         403 => '403 Forbidden',
106         404 => '404 Not Found',
107         405 => '405 Method Not Allowed',
108         406 => '406 Not Acceptable',
109         408 => '408 Request Timeout',
110         410 => '410 Gone',
111         413 => '413 Request Entity Too Large',
112         414 => '414 Request URI Too Long',
113         415 => '415 Unsupported Media Type',
114         416 => '416 Requested Range Not Satisfiable',
115         417 => '417 Expectation Failed',
116         500 => '500 Internal Server Error',
117         501 => '501 Method Not Implemented',
118         503 => '503 Service Unavailable',
119         506 => '506 Variant Also Negotiates'
120     );
121
122     /**
123      * An array containing a list of TLDs used for non-scheme matches
124      *
125      * @var array
126      */
127     protected $tldList = array();
128
129     /**
130      * Returns whether or not the plugin's dependencies are met.
131      *
132      * @param Phergie_Driver_Abstract $client Client instance
133      * @param array $plugins List of short names for plugins that the
134      *                       bootstrap file intends to instantiate
135      * @see Phergie_Plugin_Abstract_Base::checkDependencies()
136      * @return bool TRUE if dependencies are met, FALSE otherwise
137      */
138     public static function checkDependencies(Phergie_Driver_Abstract $client, array $plugins)
139     {
140         // onPrivmsg() shortens every detected link through Phergie_Plugin_TinyUrl
141         $hasTinyUrl = self::staticPluginLoaded('TinyUrl', $client, $plugins);
142
143         // When the dependency is unmet, the reason is returned as a string instead
144         return $hasTinyUrl ? true : 'TinyUrl plugin must be enabled';
145     }
146
147     /**
148      * Initializes settings
149      *
150      * @return void
151      */
152     public function onConnect()
153     {
154         // Nothing to do when the list already holds more than the six fallback TLDs
155         if (is_array($this->tldList) && count($this->tldList) > 6) {
156             return;
157         }
158         if ($this->pluginLoaded('Tld')) {
159             // Only the keys of the Tld plugin's array are kept as the list
160             $tlds = Phergie_Plugin_Tld::getTlds();
161             $this->tldList = is_array($tlds) ? array_keys($tlds) : $tlds;
162         }
163         if (!is_array($this->tldList) || count($this->tldList) <= 0) {
164             $this->tldList = array('com', 'org', 'net', 'gov', 'us', 'uk');
165         }
166         rsort($this->tldList);
167     }
168
169     /**
170      * Checks an incoming message for the presence of a URL and, if one is
171      * found, responds with its title if it is an HTML document and the
172      * TinyURL equivalent of its original URL if it meets length requirements.
173      *
174      * @return void
175      */
176     public function onPrivmsg()
177     {
178         $source = $this->event->getSource(); // Target the responses are sent to
179         $user = $this->event->getNick(); // Substituted for %nick% in the formats
180
181         // URL match: group 1 captures a dotted-quad IP host; group 2 captures a leading @, / or \ (only looked for in schemeless mode)
182         $this->updateSetting('detect_schemeless', 'detectSchemeless');
183         if (preg_match_all('#'.($this->detectSchemeless ? '' : 'https?://').'(?:([0-9]{1,3}(?:\.[0-9]{1,3}){3})(?![^/]) |
184                             ('.($this->detectSchemeless ? '(?<!http:/|https:/)[@/\\\]' : '').')?(?:(?:[a-z0-9_-]+\.?)+\.[a-z0-9]{1,6}))[^\s]*#xis',
185                             $this->event->getArgument(1), $matches, PREG_SET_ORDER)) {
186
187             // Update the settings on the fly to take into account any ini changes while the bot is running
188             $this->updateSetting('base_format', 'baseFormat');
189             $this->updateSetting('message_format', 'messageFormat');
190             $this->updateSetting('merge_links', 'mergeLinks');
191             $this->updateSetting('title_length', 'titleLength', true);
192             $this->updateSetting('show_errors', 'showErrors');
193
194             $responses = array(); // One formatted "[ link ] title" entry per accepted URL
195             foreach($matches as $m) {
196                 $url = trim(rtrim($m[0], ', ].?!;')); // Drop trailing punctuation picked up by the match
197
198                 // Group 2 matched a leading @, / or \: the "URL" is part of an email address, a directory path, etc
199                 if (!empty($m[2])) {
200                     $this->debug('Invalid Url: URL is either an email or a directory path. (' . $url . ')');
201                     continue;
202                 }
203
204                 // Parse the given URL into its components (scheme, host, path, ...)
205                 if (!$parsed = $this->parseUrl($url)) {
206                     $this->debug('Invalid Url: Could not parse the URL. (' . $url . ')');
207                     continue;
208                 }
209
210                 // When the host was matched as an IP (group 1), check that it is a valid one
211                 if (!empty($m[1]) and !$this->checkValidIP($m[1])) {
212                     $this->debug('Invalid Url: ' . $m[1] . ' is not a valid IP address. (' . $url . ')');
213                     continue;
214                 }
215
216                 // Process TLD if it's not an IP
217                 if (empty($m[1])) {
218                     // Get the TLD from the host: everything after the last dot, or '' without one
219                     $pos = strrpos($parsed['host'], '.');
220                     $parsed['tld'] = ($pos !== false ? substr($parsed['host'], ($pos+1)) : '');
221
222                     // Check to see if the URL has a valid TLD, i.e. one listed in $this->tldList
223                     if (is_array($this->tldList) && !in_array(strtolower($parsed['tld']), $this->tldList)) {
224                         $this->debug('Invalid Url: ' . $parsed['tld'] . ' is not a supported TLD. (' . $url . ')');
225                         continue;
226                     }
227                 }
228
229                 // Without OpenSSL an https URL is either skipped or downgraded to http, depending on $sslFallback
230                 if ($parsed['scheme'] == 'https' && !extension_loaded('openssl')) {
231                     if (!$this->sslFallback) {
232                         $this->debug('Invalid Url: HTTPS is an invalid scheme, OpenSSL isn\'t available. (' . $url . ')');
233                         continue;
234                     } else {
235                         $parsed['scheme'] = 'http';
236                     }
237                 }
238
239                 if (!in_array($parsed['scheme'], array('http', 'https'))) {
240                     $this->debug('Invalid Url: ' . $parsed['scheme'] . ' is not a supported scheme. (' . $url . ')');
241                     continue;
242                 }
243                 $url = $this->glueURL($parsed); // Rebuild the URL from its (possibly adjusted) components
244                 unset($parsed);
245
246                 // Convert url into its TinyURL form, which is what %link% is replaced with
247                 $tinyUrl = Phergie_Plugin_TinyUrl::get($url);
248
249                 // Prevent spamfest: skip URLs that checkUrlCache() reports as already cached for this source
250                 if ($this->checkUrlCache($url, $tinyUrl)) {
251                     $this->debug('Invalid Url: URL is in the cache. (' . $url . ')');
252                     continue;
253                 }
254
255                 $title = self::getTitle($url);
256                 if (!empty($title)) { // URLs without a title produce no response entry
257                     $responses[] = str_replace(array(
258                         '%title%',
259                         '%link%',
260                         '%nick%'
261                     ), array(
262                         $title,
263                         $tinyUrl,
264                         $user
265                     ), $this->messageFormat);
266                 }
267
268                 // Update cache; this happens even when no title was found
269                 $this->updateUrlCache($url, $tinyUrl);
270                 unset($title, $tinyUrl, $title);
271             }
272             /**
273              * Check to see if there were any URL responses, format them and handle if they
274              * get merged into one message (entries joined with '; ') or sent one message each
275              */
276             if (count($responses) > 0) {
277                 if ($this->mergeLinks) {
278                     $this->doPrivmsg($source, str_replace(array(
279                         '%message%',
280                         '%nick%'
281                     ), array(
282                         implode('; ', $responses),
283                         $user
284                     ), $this->baseFormat));
285                 } else {
286                     foreach($responses as $response) {
287                         $this->doPrivmsg($source, str_replace(array(
288                             '%message%',
289                             '%nick%'
290                         ), array(
291                             $response,
292                             $user
293                         ), $this->baseFormat));
294                     }
295                 }
296             }
297         }
298     }
299
300     /**
301      * Checks a given URL and TinyURL against the cache to verify if they were
302      * previously posted on the channel.
303      *
304      * @param string $url The URL to check against
305      * @param string $tiny The TinyURL to check against
306      * @return bool
307      */
308     protected function checkUrlCache($url, $tiny)
309     {
310         $channel = $this->event->getSource();
311
312         // Entries are keyed by the hex CRC32 checksum of the cleaned address
313         $urlKey = $this->getUrlChecksum($url);
314         $tinyKey = $this->getUrlChecksum($tiny);
315
316         // Cached timestamp for each form of the address; null when there is none
317         $urlSeen = null;
318         if (isset($this->urlCache[$channel][$urlKey])) {
319             $urlSeen = $this->urlCache[$channel][$urlKey];
320         }
321         $tinySeen = null;
322         if (isset($this->tinyCache[$channel][$tinyKey])) {
323             $tinySeen = $this->tinyCache[$channel][$tinyKey];
324         }
325
326         $lifetime = $this->expire;
327         if ($lifetime <= 0) {
328             // Expiration disabled: any cached entry counts as a repeat
329             return isset($urlSeen) || isset($tinySeen);
330         }
331
332         // Expiration enabled: a repeat is an entry younger than the lifetime.
333         // A missing (null) timestamp counts as 0 in the sums below.
334         return ($urlSeen + $lifetime) > time()
335             || ($tinySeen + $lifetime) > time();
336     }
337
338     /**
339      * Updates the cache and adds the given URL and TinyURL to the cache. It
340      * also handles cleaning the cache of old entries as well.
341      *
342      * @param string $url The URL to add to the cache
343      * @param string $tiny The TinyURL to add to the cache
344      * @return void
345      */
346     protected function updateUrlCache($url, $tiny)
347     {
348         $source = $this->event->getSource();
349         $time = time();
350
351         // Entries are keyed by the hex CRC32 checksum of the cleaned URL (see
352         // getUrlChecksum()) and hold the time at which the entry was stored
353         $this->urlCache[$source][$this->getUrlChecksum($url)] = $time;
354         $this->tinyCache[$source][$this->getUrlChecksum($tiny)] = $time;
355         if ($this->limit <= 0) {
356             return; // The per-source cache limit is disabled
357         }
358         foreach (array('urlCache', 'tinyCache') as $name) {
359             $entries =& $this->{$name}[$source];
360             if (count($entries) > $this->limit) {
361                 asort($entries, SORT_NUMERIC);
362                 // Drop the oldest entries by key. array_shift() must not be used here:
363                 // it renumbers integer keys, and PHP stores an all-digit checksum such
364                 // as "12345678" as an integer key, so that entry would lose its key.
365                 while (count($entries) > $this->limit) {
366                     reset($entries);
367                     unset($entries[key($entries)]);
368                 }
369             }
370             unset($entries); // Break the reference before the next cache is bound
371         }
372     }
373
374     /**
375      * Transliterates a UTF-8 string into corresponding ASCII characters and
376      * truncates and appends an ellipsis to the string if it exceeds a given
377      * length.
378      *
379      * @param string $str String to decode
380      * @param int $trim Maximum string length, optional
381      * @return string
382      */
383     protected function decode($str, $trim = null)
384     {
385         $ascii = $this->decodeTranslit($str);
386         if (!($trim > 0)) {
387             return $ascii; // No positive limit given: return the full decoded string
388         }
389         return substr($ascii, 0, $trim) . (strlen($ascii) > $trim ? '...' : '');
390     }
391
392     /**
393      * Custom error handler meant to handle 404 errors and such
394      */
395     public function onPhpError($errno, $errstr, $errfile, $errline)
396     {
397         if ($errno === E_WARNING) {
398             // An HTTP status line in the warning: record its code and, if known, its description
399             if (preg_match('{HTTP/1\.[01] ([0-9]{3})}i', $errstr, $m)) {
400                 $this->errorStatus = $m[1];
401                 $this->errorMessage = (isset($this->httpErrors[$m[1]]) ? $this->httpErrors[$m[1]] : $m[1]);
402                 $this->debug('PHP Warning:  ' . $errstr . ' in ' . $errfile . ' on line ' . $errline);
403                 return true; // Handled here
404             // Safely ignore these connection/SSL warnings so they don't appear in the log
405             } else if (stripos($errstr, 'SSL: fatal protocol error in') !== false ||
406                        stripos($errstr, 'failed to open stream') !== false ||
407                        stripos($errstr, 'HTTP request failed') !== false ||
408                        stripos($errstr, 'SSL: An existing connection was forcibly closed by the remote host') !== false ||
409                        stripos($errstr, 'Failed to enable crypto in') !== false ||
410                        stripos($errstr, 'SSL: An established connection was aborted by the software in your host machine') !== false ||
411                        stripos($errstr, 'SSL operation failed with code') !== false ||
412                        stripos($errstr, 'unable to connect to') !== false) {
413                 $this->errorStatus = true; // Flag the failure; no HTTP code is available
414                 $this->debug('PHP Warning:  ' . $errstr . ' in ' . $errfile . ' on line ' . $errline);
415                 return true;
416             }
417         }
418         return false; // Not a warning this handler deals with
419     }
420
421     /**
422      * Takes a URL, parses it and cleans it of all the junk,
423      * and then returns the hex checksum of the cleaned URL.
424      */
425     protected function getUrlChecksum($url)
426     {
427         $base = urldecode($this->glueUrl($url, true)); // Glued with the "base" flag set
428         $clean = preg_replace('#\s#', '', $this->decodeTranslit(strtolower($base)));
429         return dechex(crc32($clean));
430     }
431
432     /*
433     * Parses a given URI and processes the output to remove redundant
434     * or missing values.
435     */
436     protected function parseUrl($url)
437     {
438         // Already parsed: hand the components back untouched
439         if (is_array($url)) return $url;
440
441         // Strip leading junk, then assume http when no scheme prefix is present
442         $url = trim(ltrim($url, ' /@\\'));
443         if (!preg_match('&^(?:([a-z][-+.a-z0-9]*):)&xis', $url)) {
444             $url = 'http://' . $url;
445         }
446
447         // parse_url() returns false for seriously malformed URLs; return false too
448         // instead of turning that false into an array that lacks a host
449         $parsed = parse_url($url);
450         if (!is_array($parsed)) return false;
451
452         $parsed['scheme'] = isset($parsed['scheme']) ? strtolower($parsed['scheme']) : 'http';
453
454         // No host was recognized: treat the first path segment as the host
455         if (isset($parsed['path']) && !isset($parsed['host'])) {
456             $segments = explode('/', $parsed['path'], 2);
457             $parsed['host'] = $segments[0];
458             $parsed['path'] = isset($segments[1]) ? $segments[1] : '';
459         }
460
461         return $parsed;
462     }
463
464     /*
465     * Parses a given URI and then glues it back together in the proper format.
466     * If base is set, then it chops off the scheme, user and pass and fragment
467     * information to return a more unique base URI.
468     */
469     protected function glueUrl($uri, $base = false)
470     {
471         $parsed = $uri;
472         if (!is_array($parsed)) {
473             $parsed = $this->parseUrl($parsed);
474         }
475
476         if (is_array($parsed)) {
477             $uri = '';
478             if (!$base) {
479                 $uri .= (!empty($parsed['scheme']) ? $parsed['scheme'] . ':' .
480                         ((strtolower($parsed['scheme']) == 'mailto') ? '' : '//') : '');
481                 $uri .= (!em