mirror of
https://github.com/chylex/Nextcloud-News.git
synced 2025-01-18 11:45:59 +01:00
0dfd93f75d
The league/uri version that we inherit in Nextcloud is a bit outdated. That version can't handle certain uris. Signed-off-by: Benjamin Brahmer <info@b-brahmer.de>
111 lines
3.3 KiB
PHP
111 lines
3.3 KiB
PHP
<?php
|
|
/**
|
|
* Nextcloud - News
|
|
*
|
|
* This file is licensed under the Affero General Public License version 3 or
|
|
* later. See the COPYING file.
|
|
*
|
|
* @author Gioele Falcetti <thegio.f@gmail.com>
|
|
* @copyright 2019 Gioele Falcetti
|
|
*/
|
|
|
|
namespace OCA\News\Scraper;
|
|
|
|
use fivefilters\Readability\Readability;
|
|
use fivefilters\Readability\Configuration;
|
|
use fivefilters\Readability\ParseException;
|
|
use League\Uri\Exceptions\SyntaxError;
|
|
use Psr\Log\LoggerInterface;
|
|
|
|
/**
 * Downloads a web page with cURL and extracts its main content with
 * Readability.
 *
 * Call scrape() first; getContent() and getRTL() then report on the page
 * that was scraped last. After a failed scrape no page is held, so
 * getContent() returns null and getRTL() returns its default.
 */
class Scraper implements IScraper
{
    /** @var LoggerInterface Receives error/debug messages about failed scrapes. */
    private $logger;

    /** @var Configuration Readability configuration shared by every scrape. */
    private $config;

    /** @var Readability|null Parser for the last successfully scraped page, if any. */
    private $readability;

    /** @var array<int, mixed> cURL options applied to every request. */
    private $curl_opts;

    /**
     * @param LoggerInterface $logger Logger used to report download and parse failures.
     */
    public function __construct(LoggerInterface $logger)
    {
        $this->logger = $logger;
        $this->config = new Configuration([
            'FixRelativeURLs' => true,
            'SummonCthulhu' => true, // Remove <script>
        ]);
        $this->readability = null;

        $this->curl_opts = [
            CURLOPT_RETURNTRANSFER => true, // return web page
            CURLOPT_HEADER => false, // do not return headers
            CURLOPT_FOLLOWLOCATION => true, // follow redirects
            //CURLOPT_USERAGENT => "php-news", // who am i
            CURLOPT_AUTOREFERER => true, // set referer on redirect
            CURLOPT_CONNECTTIMEOUT => 120, // timeout on connect
            CURLOPT_TIMEOUT => 120, // timeout on response
            CURLOPT_MAXREDIRS => 10, // stop after 10 redirects
        ];
    }

    /**
     * Download a URL using the configured cURL options.
     *
     * @param string $url Address to download.
     *
     * @return array{0: string|false, 1: string} The response body (false when
     *         the transfer failed) and the URL cURL reports for the transfer,
     *         i.e. the address after the redirects have been followed.
     */
    private function getHTTPContent(string $url): array
    {
        $handler = curl_init($url);
        if ($handler === false) {
            // No handle could be created, so there is nothing to transfer;
            // report it the same way as a failed download.
            return [false, $url];
        }

        curl_setopt_array($handler, $this->curl_opts);
        $content = curl_exec($handler);
        $header = curl_getinfo($handler);
        curl_close($handler);

        // Hand back the URL after the redirects have been followed.
        return [$content, $header['url'] ?? $url];
    }

    /**
     * Download and parse the page at the given URL.
     *
     * On success the parsed page becomes available through getContent() and
     * getRTL(). On any failure the previously held page is discarded.
     *
     * @param string $url Address of the page to scrape.
     *
     * @return bool True when the page was downloaded and parsed, false when
     *              the download or the parsing failed.
     */
    public function scrape(string $url): bool
    {
        [$content, $redirected_url] = $this->getHTTPContent($url);
        if ($content === false) {
            $this->logger->error('Unable to receive content from {url}', [
                'url' => $url,
            ]);
            $this->readability = null;
            return false;
        }

        // Update URL used to convert relative URLs
        $this->config->setOriginalURL($redirected_url);
        $readability = new Readability($this->config);

        try {
            $readability->parse($content);
        } catch (ParseException | SyntaxError $e) {
            $this->logger->error('Unable to parse content from {url}', [
                'url' => $url,
            ]);
            $this->logger->debug('Error during parsing of {url} ran into {error}', [
                'url' => $url,
                'error' => $e,
            ]);
            // Do not keep a parser whose parse() threw: report the failure
            // and leave the scraper without a page.
            $this->readability = null;
            return false;
        }

        // Only publish the parser once parsing has succeeded.
        $this->readability = $readability;
        return true;
    }

    /**
     * Get the extracted content of the last scraped page.
     *
     * @return string|null Whatever Readability reports as content, or null
     *                     when no page is currently held.
     */
    public function getContent(): ?string
    {
        if ($this->readability === null) {
            return null;
        }

        return $this->readability->getContent();
    }

    /**
     * Tell whether the last scraped page is written right-to-left.
     *
     * @param bool $default Value returned when no page is held or Readability
     *                      reports no text direction.
     *
     * @return bool True only when the reported direction is exactly "rtl".
     */
    public function getRTL(bool $default = false): bool
    {
        if ($this->readability === null) {
            return $default;
        }

        $direction = $this->readability->getDirection();
        if ($direction === null) {
            return $default;
        }

        return $direction === "rtl";
    }
}
|