1
0
mirror of https://github.com/chylex/Nextcloud-News.git synced 2025-01-18 11:45:59 +01:00
Nextcloud-News/lib/Scraper/Scraper.php
Benjamin Brahmer 0dfd93f75d Workaround for #2048
The league/uri version that we inherit in Nextcloud is a bit outdated.
That version can't handle certain uris.

Signed-off-by: Benjamin Brahmer <info@b-brahmer.de>
2023-01-11 10:19:34 +01:00

111 lines
3.3 KiB
PHP

<?php
/**
* Nextcloud - News
*
* This file is licensed under the Affero General Public License version 3 or
* later. See the COPYING file.
*
* @author Gioele Falcetti <thegio.f@gmail.com>
* @copyright 2019 Gioele Falcetti
*/
namespace OCA\News\Scraper;
use fivefilters\Readability\Readability;
use fivefilters\Readability\Configuration;
use fivefilters\Readability\ParseException;
use League\Uri\Exceptions\SyntaxError;
use Psr\Log\LoggerInterface;
/**
 * Downloads a web page and extracts its readable content with Readability.
 *
 * scrape() fetches and parses a page; getContent() and getRTL() expose the
 * result of the most recent successful scrape.
 */
class Scraper implements IScraper
{
    /** @var LoggerInterface */
    private $logger;

    /** @var Configuration Readability configuration reused for every scrape */
    private $config;

    /** @var Readability|null Parser of the last successful scrape, null otherwise */
    private $readability;

    /** @var array cURL options applied to every request */
    private $curl_opts;

    /**
     * @param LoggerInterface $logger Receives fetch and parse failures
     */
    public function __construct(LoggerInterface $logger)
    {
        $this->logger = $logger;
        $this->config = new Configuration([
            'FixRelativeURLs' => true,
            'SummonCthulhu' => true, // Remove <script>
        ]);
        $this->readability = null;

        $http_only = CURLPROTO_HTTP | CURLPROTO_HTTPS;
        $this->curl_opts = [
            CURLOPT_RETURNTRANSFER => true, // return web page
            CURLOPT_HEADER => false, // do not return headers
            CURLOPT_FOLLOWLOCATION => true, // follow redirects
            //CURLOPT_USERAGENT => "php-news", // who am i
            CURLOPT_AUTOREFERER => true, // set referer on redirect
            CURLOPT_CONNECTTIMEOUT => 120, // timeout on connect
            CURLOPT_TIMEOUT => 120, // timeout on response
            CURLOPT_MAXREDIRS => 10, // stop after 10 redirects
            // URLs are supplied by the caller and may originate from untrusted
            // content, so never let cURL speak file://, gopher://, etc. -
            // neither for the initial request nor for a redirect target.
            CURLOPT_PROTOCOLS => $http_only,
            CURLOPT_REDIR_PROTOCOLS => $http_only,
        ];
    }

    /**
     * Fetch a URL with cURL, following redirects.
     *
     * @param string $url Address to download
     *
     * @return array Two elements: the response body (false when the transfer
     *               failed) and the effective URL after redirects
     */
    private function getHTTPContent(string $url): array
    {
        $handler = curl_init($url);
        if ($handler === false) {
            // No handle was created, so there is no transfer to inspect;
            // report the failure the same way a failed curl_exec() would.
            return [false, $url];
        }

        curl_setopt_array($handler, $this->curl_opts);
        $content = curl_exec($handler);
        // The effective URL reflects the redirects that have been followed
        $effective_url = curl_getinfo($handler, CURLINFO_EFFECTIVE_URL);
        curl_close($handler);

        if (!is_string($effective_url) || $effective_url === '') {
            $effective_url = $url;
        }

        return [$content, $effective_url];
    }

    /**
     * Download the page at $url and parse it with Readability.
     *
     * Any result of a previous scrape is discarded first, so after a failure
     * getContent() returns null and getRTL() returns its default.
     *
     * @param string $url Address of the page to scrape
     *
     * @return bool True when the page was fetched and parsed, false otherwise
     */
    public function scrape(string $url): bool
    {
        // Never expose the outcome of an earlier scrape after a failed one
        $this->readability = null;

        [$content, $redirected_url] = $this->getHTTPContent($url);
        if ($content === false) {
            $this->logger->error('Unable to receive content from {url}', [
                'url' => $url,
            ]);
            return false;
        }

        // Update URL used to convert relative URLs
        $this->config->setOriginalURL($redirected_url);
        $readability = new Readability($this->config);

        try {
            $readability->parse($content);
        } catch (ParseException | SyntaxError $e) {
            $this->logger->error('Unable to parse content from {url}', [
                'url' => $url,
            ]);
            $this->logger->debug('Error during parsing of {url} ran into {error}', [
                'url' => $url,
                'error' => $e,
            ]);
            // A parser that threw holds no usable article: report the failure
            // instead of pretending the scrape succeeded.
            return false;
        }

        // Publish the parser only once it holds a successfully parsed page
        $this->readability = $readability;
        return true;
    }

    /**
     * Content extracted by the last successful scrape.
     *
     * @return string|null Whatever Readability returns as content, or null
     *                     when no successful scrape is available
     */
    public function getContent(): ?string
    {
        if ($this->readability === null) {
            return null;
        }

        return $this->readability->getContent();
    }

    /**
     * Whether the last scraped page is written right-to-left.
     *
     * @param bool $default Value returned when no scrape is available or
     *                      Readability reports no direction
     *
     * @return bool True when Readability reports the direction "rtl"
     */
    public function getRTL(bool $default = false): bool
    {
        if ($this->readability === null) {
            return $default;
        }

        $direction = $this->readability->getDirection();
        if ($direction === null) {
            return $default;
        }

        return $direction === "rtl";
    }
}