One Hat Cyber Team
Your IP :
3.144.105.255
Server IP :
104.21.80.1
Server :
Linux agrigation-prod 5.15.0-67-generic #74-Ubuntu SMP Wed Feb 22 14:14:39 UTC 2023 x86_64
Server Software :
nginx/1.24.0
PHP Version :
7.4.33
Buat File
|
Buat Folder
Eksekusi
Dir :
~
/
home
/
forge
/
gftag.com
/
vendor
/
spatie
/
crawler
/
src
/
View File Name :
Crawler.php
<?php

namespace Spatie\Crawler;

use Generator;
use GuzzleHttp\Client;
use GuzzleHttp\Pool;
use GuzzleHttp\Psr7\Request;
use GuzzleHttp\Psr7\Uri;
use GuzzleHttp\RequestOptions;
use Psr\Http\Message\UriInterface;
use Spatie\Browsershot\Browsershot;
use Spatie\Crawler\CrawlQueue\ArrayCrawlQueue;
use Spatie\Crawler\CrawlQueue\CrawlQueue;
use Spatie\Crawler\Exception\InvalidCrawlRequestHandler;
use Spatie\Crawler\Handlers\CrawlRequestFailed;
use Spatie\Crawler\Handlers\CrawlRequestFulfilled;
use Spatie\Robots\RobotsTxt;
use Tree\Node\Node;

/**
 * Crawls a site starting from a base URL, feeding responses to a set of
 * CrawlObservers. URLs are filtered through a CrawlProfile, queued in a
 * CrawlQueue, and fetched concurrently through a Guzzle request pool.
 */
class Crawler
{
    public const DEFAULT_USER_AGENT = '*';

    /** @var \GuzzleHttp\Client */
    protected $client;

    /** @var \Psr\Http\Message\UriInterface */
    protected $baseUrl;

    /** @var \Spatie\Crawler\CrawlObserverCollection */
    protected $crawlObservers;

    /** @var \Spatie\Crawler\CrawlProfile */
    protected $crawlProfile;

    /** @var int */
    protected $concurrency;

    /** @var \Spatie\Crawler\CrawlQueue\CrawlQueue */
    protected $crawlQueue;

    /** @var int */
    protected $crawledUrlCount = 0;

    /** @var int|null */
    protected $maximumCrawlCount = null;

    /** @var int */
    protected $maximumResponseSize = 1024 * 1024 * 2;

    /** @var int|null */
    protected $maximumDepth = null;

    /** @var bool */
    protected $respectRobots = true;

    /** @var bool */
    protected $rejectNofollowLinks = true;

    /** @var \Tree\Node\Node */
    protected $depthTree;

    /** @var bool */
    protected $executeJavaScript = false;

    /** @var Browsershot */
    protected $browsershot = null;

    /** @var \Spatie\Robots\RobotsTxt */
    protected $robotsTxt = null;

    /** @var string */
    protected $crawlRequestFulfilledClass;

    /** @var string */
    protected $crawlRequestFailedClass;

    // Stored in MICROseconds (setter converts from ms) so it can be fed
    // straight to usleep() by the request handlers.
    /** @var int */
    protected $delayBetweenRequests = 0;

    /** @var array */
    protected $allowedMimeTypes = [];

    /** @var array */
    protected static $defaultClientOptions = [
        RequestOptions::COOKIES => true,
        RequestOptions::CONNECT_TIMEOUT => 10,
        RequestOptions::TIMEOUT => 10,
        RequestOptions::ALLOW_REDIRECTS => false,
        RequestOptions::HEADERS => [
            'User-Agent' => self::DEFAULT_USER_AGENT,
        ],
    ];

    /**
     * Static factory. When $clientOptions is non-empty it REPLACES the
     * defaults entirely (they are not merged).
     */
    public static function create(array $clientOptions = []): Crawler
    {
        $clientOptions = (count($clientOptions))
            ? $clientOptions
            : static::$defaultClientOptions;

        $client = new Client($clientOptions);

        return new static($client);
    }

    public function __construct(Client $client, int $concurrency = 10)
    {
        $this->client = $client;
        $this->concurrency = $concurrency;

        $this->crawlProfile = new CrawlAllUrls();
        $this->crawlQueue = new ArrayCrawlQueue();
        $this->crawlObservers = new CrawlObserverCollection();
        $this->crawlRequestFulfilledClass = CrawlRequestFulfilled::class;
        $this->crawlRequestFailedClass = CrawlRequestFailed::class;
    }

    public function setConcurrency(int $concurrency): Crawler
    {
        $this->concurrency = $concurrency;

        return $this;
    }

    public function setMaximumResponseSize(int $maximumResponseSizeInBytes): Crawler
    {
        $this->maximumResponseSize = $maximumResponseSizeInBytes;

        return $this;
    }

    public function getMaximumResponseSize(): ?int
    {
        return $this->maximumResponseSize;
    }

    public function setMaximumCrawlCount(int $maximumCrawlCount): Crawler
    {
        $this->maximumCrawlCount = $maximumCrawlCount;

        return $this;
    }

    public function getMaximumCrawlCount(): ?int
    {
        return $this->maximumCrawlCount;
    }

    public function getCrawlerUrlCount(): int
    {
        return $this->crawledUrlCount;
    }

    public function setMaximumDepth(int $maximumDepth): Crawler
    {
        $this->maximumDepth = $maximumDepth;

        return $this;
    }

    public function getMaximumDepth(): ?int
    {
        return $this->maximumDepth;
    }

    /**
     * @param int $delay The delay in milliseconds.
     *
     * @return Crawler
     */
    public function setDelayBetweenRequests(int $delay): Crawler
    {
        // Convert ms -> µs for usleep() in the request handlers.
        $this->delayBetweenRequests = ($delay * 1000);

        return $this;
    }

    /**
     * @return int The delay in microseconds (the value passed to
     *             setDelayBetweenRequests() multiplied by 1000).
     */
    public function getDelayBetweenRequests(): int
    {
        return $this->delayBetweenRequests;
    }

    /**
     * @param array $types The allowed mimetypes to parse
     *
     * @return Crawler
     */
    public function setParseableMimeTypes(array $types): Crawler
    {
        $this->allowedMimeTypes = $types;

        return $this;
    }

    /**
     * @return array The allowed mimetypes to parse
     */
    public function getParseableMimeTypes(): array
    {
        return $this->allowedMimeTypes;
    }

    public function ignoreRobots(): Crawler
    {
        $this->respectRobots = false;

        return $this;
    }

    public function respectRobots(): Crawler
    {
        $this->respectRobots = true;

        return $this;
    }

    public function mustRespectRobots(): bool
    {
        return $this->respectRobots;
    }

    public function acceptNofollowLinks(): Crawler
    {
        $this->rejectNofollowLinks = false;

        return $this;
    }

    public function rejectNofollowLinks(): Crawler
    {
        $this->rejectNofollowLinks = true;

        return $this;
    }

    public function mustRejectNofollowLinks(): bool
    {
        return $this->rejectNofollowLinks;
    }

    /**
     * NOTE(review): $robotsTxt is only populated by startCrawling(); calling
     * this earlier returns null and will trigger a TypeError on the declared
     * return type.
     */
    public function getRobotsTxt(): RobotsTxt
    {
        return $this->robotsTxt;
    }

    public function setCrawlQueue(CrawlQueue $crawlQueue): Crawler
    {
        $this->crawlQueue = $crawlQueue;

        return $this;
    }

    public function getCrawlQueue(): CrawlQueue
    {
        return $this->crawlQueue;
    }

    public function executeJavaScript(): Crawler
    {
        $this->executeJavaScript = true;

        return $this;
    }

    public function doNotExecuteJavaScript(): Crawler
    {
        $this->executeJavaScript = false;

        return $this;
    }

    public function mayExecuteJavascript(): bool
    {
        return $this->executeJavaScript;
    }

    /**
     * @param \Spatie\Crawler\CrawlObserver|array[\Spatie\Crawler\CrawlObserver] $crawlObservers
     *
     * @return $this
     */
    public function setCrawlObserver($crawlObservers): Crawler
    {
        if (! is_array($crawlObservers)) {
            $crawlObservers = [$crawlObservers];
        }

        return $this->setCrawlObservers($crawlObservers);
    }

    public function setCrawlObservers(array $crawlObservers): Crawler
    {
        $this->crawlObservers = new CrawlObserverCollection($crawlObservers);

        return $this;
    }

    public function addCrawlObserver(CrawlObserver $crawlObserver): Crawler
    {
        $this->crawlObservers->addObserver($crawlObserver);

        return $this;
    }

    public function getCrawlObservers(): CrawlObserverCollection
    {
        return $this->crawlObservers;
    }

    public function setCrawlProfile(CrawlProfile $crawlProfile): Crawler
    {
        $this->crawlProfile = $crawlProfile;

        return $this;
    }

    public function getCrawlProfile(): CrawlProfile
    {
        return $this->crawlProfile;
    }

    /**
     * Swap in a custom "fulfilled" handler; it must extend
     * CrawlRequestFulfilled or an InvalidCrawlRequestHandler is thrown.
     */
    public function setCrawlFulfilledHandlerClass(string $crawlRequestFulfilledClass): Crawler
    {
        $baseClass = CrawlRequestFulfilled::class;

        if (! is_subclass_of($crawlRequestFulfilledClass, $baseClass)) {
            throw InvalidCrawlRequestHandler::doesNotExtendBaseClass($crawlRequestFulfilledClass, $baseClass);
        }

        $this->crawlRequestFulfilledClass = $crawlRequestFulfilledClass;

        return $this;
    }

    /**
     * Swap in a custom "failed" handler; it must extend CrawlRequestFailed
     * or an InvalidCrawlRequestHandler is thrown.
     */
    public function setCrawlFailedHandlerClass(string $crawlRequestFailedClass): Crawler
    {
        $baseClass = CrawlRequestFailed::class;

        if (! is_subclass_of($crawlRequestFailedClass, $baseClass)) {
            throw InvalidCrawlRequestHandler::doesNotExtendBaseClass($crawlRequestFailedClass, $baseClass);
        }

        $this->crawlRequestFailedClass = $crawlRequestFailedClass;

        return $this;
    }

    public function setBrowsershot(Browsershot $browsershot)
    {
        $this->browsershot = $browsershot;

        return $this;
    }

    /**
     * Replaces the Guzzle client with one whose User-Agent header is set to
     * $userAgent, preserving all other client options. Any existing
     * user-agent header (any letter case) is overwritten.
     */
    public function setUserAgent(string $userAgent): Crawler
    {
        $clientOptions = $this->client->getConfig();

        $headers = array_change_key_case($clientOptions['headers']);
        $headers['user-agent'] = $userAgent;
        $clientOptions['headers'] = $headers;

        $this->client = new Client($clientOptions);

        return $this;
    }

    public function getUserAgent(): string
    {
        $headers = $this->client->getConfig('headers');

        foreach (array_keys($headers) as $name) {
            if (strtolower($name) === 'user-agent') {
                return (string) $headers[$name];
            }
        }

        return static::DEFAULT_USER_AGENT;
    }

    public function getBrowsershot(): Browsershot
    {
        // Lazily create a default Browsershot when none was injected.
        if (! $this->browsershot) {
            $this->browsershot = new Browsershot();
        }

        return $this->browsershot;
    }

    public function getBaseUrl(): UriInterface
    {
        return $this->baseUrl;
    }

    /**
     * Entry point: normalizes the base URL (default scheme "http", default
     * path "/"), fetches robots.txt, seeds the queue, drains it, then
     * notifies every observer via finishedCrawling().
     *
     * @param \Psr\Http\Message\UriInterface|string $baseUrl
     */
    public function startCrawling($baseUrl)
    {
        if (! $baseUrl instanceof UriInterface) {
            $baseUrl = new Uri($baseUrl);
        }

        if ($baseUrl->getScheme() === '') {
            $baseUrl = $baseUrl->withScheme('http');
        }

        if ($baseUrl->getPath() === '') {
            $baseUrl = $baseUrl->withPath('/');
        }

        $this->baseUrl = $baseUrl;

        $crawlUrl = CrawlUrl::create($this->baseUrl);

        $this->robotsTxt = $this->createRobotsTxt($crawlUrl->url);

        if ($this->robotsTxt->allows((string) $crawlUrl->url, $this->getUserAgent()) ||
            ! $this->respectRobots
        ) {
            $this->addToCrawlQueue($crawlUrl);
        }

        $this->depthTree = new Node((string) $this->baseUrl);

        $this->startCrawlingQueue();

        foreach ($this->crawlObservers as $crawlObserver) {
            $crawlObserver->finishedCrawling();
        }
    }

    /**
     * Records $url as a child of $parentUrl in the depth tree (used to
     * enforce maximumDepth). Returns the new node, or null when $parentUrl
     * is not found in the subtree rooted at $node. With no depth limit a
     * detached throwaway node is returned instead of growing the tree.
     */
    public function addToDepthTree(UriInterface $url, UriInterface $parentUrl, Node $node = null): ?Node
    {
        if (is_null($this->maximumDepth)) {
            return new Node((string) $url);
        }

        $node = $node ?? $this->depthTree;

        $returnNode = null;

        if ($node->getValue() === (string) $parentUrl) {
            $newNode = new Node((string) $url);

            $node->addChild($newNode);

            return $newNode;
        }

        foreach ($node->getChildren() as $currentNode) {
            $returnNode = $this->addToDepthTree($url, $parentUrl, $currentNode);

            if (! is_null($returnNode)) {
                break;
            }
        }

        return $returnNode;
    }

    /**
     * Drains the crawl queue with a concurrent Guzzle pool. The outer loop
     * re-checks the queue because fulfilled handlers add newly discovered
     * URLs while a pool is in flight.
     */
    protected function startCrawlingQueue()
    {
        while ($this->crawlQueue->hasPendingUrls()) {
            $pool = new Pool($this->client, $this->getCrawlRequests(), [
                'concurrency' => $this->concurrency,
                'options' => $this->client->getConfig(),
                'fulfilled' => new $this->crawlRequestFulfilledClass($this),
                'rejected' => new $this->crawlRequestFailedClass($this),
            ]);

            $promise = $pool->promise();

            $promise->wait();
        }
    }

    /**
     * @deprecated This function will be removed in the next major version
     */
    public function endsWith($haystack, $needle)
    {
        // Guarded substr comparison. The previous implementation relied on
        // strrpos(), whose `false` (not found) coerced to 0 and produced
        // false positives such as endsWith('abc', 'zbc') === true.
        $length = strlen($needle);

        if ($length === 0) {
            return true;
        }

        return substr($haystack, -$length) === $needle;
    }

    protected function createRobotsTxt(UriInterface $uri): RobotsTxt
    {
        return RobotsTxt::create($uri->withPath('/robots.txt'));
    }

    /**
     * Generator consumed by the Guzzle pool: yields a GET request per
     * pending URL that passes the crawl profile, marking each as processed
     * and notifying observers via willCrawl() before it is fetched.
     */
    protected function getCrawlRequests(): Generator
    {
        while ($crawlUrl = $this->crawlQueue->getFirstPendingUrl()) {
            if (! $this->crawlProfile->shouldCrawl($crawlUrl->url)) {
                $this->crawlQueue->markAsProcessed($crawlUrl);

                continue;
            }

            if ($this->crawlQueue->hasAlreadyBeenProcessed($crawlUrl)) {
                continue;
            }

            foreach ($this->crawlObservers as $crawlObserver) {
                $crawlObserver->willCrawl($crawlUrl->url);
            }

            $this->crawlQueue->markAsProcessed($crawlUrl);

            yield $crawlUrl->getId() => new Request('GET', $crawlUrl->url);
        }
    }

    /**
     * Queues a URL unless the profile rejects it or it is already queued.
     * Note: crawledUrlCount counts QUEUED urls, so the maximum-crawl-count
     * limit is enforced against enqueued rather than fetched pages.
     */
    public function addToCrawlQueue(CrawlUrl $crawlUrl): Crawler
    {
        if (! $this->getCrawlProfile()->shouldCrawl($crawlUrl->url)) {
            return $this;
        }

        if ($this->getCrawlQueue()->has($crawlUrl->url)) {
            return $this;
        }

        $this->crawledUrlCount++;

        $this->crawlQueue->add($crawlUrl);

        return $this;
    }

    public function maximumCrawlCountReached(): bool
    {
        $maximumCrawlCount = $this->getMaximumCrawlCount();

        if (is_null($maximumCrawlCount)) {
            return false;
        }

        return $this->getCrawlerUrlCount() >= $maximumCrawlCount;
    }
}