Pisałem kiedyś coś takiego.
Kodu jest dużo; jest przygotowany do użycia cookie (jeśli strona działa na WordPressie) lub Basic Auth. Spróbuj przeczytać kod i poustawiać sobie zmienne. Ewentualnie spróbuj na jego bazie napisać coś własnego. Rozumiem, że przy okazji pobierania tych PDF-ów chcesz się jeszcze czegoś nauczyć?
Kopiuj
<?php
declare(strict_types=1);
/**
* Site crawler for downloading files (PHP 8.4).
*
* Key features:
* - BFS crawl within the same host as START_URL
* - Optional HTTP Basic Auth for selected URL path prefixes (BASIC_AUTH_PREFIXES)
* - Optional WordPress "post password" support (WP_POST_PASSWORD) via cookie wp-postpass_*
* - Downloads files by extension list (DOWNLOAD_EXTENSIONS) and by content-type fallback (image/*, application/pdf)
* - Saves files into directories mirroring the URL path under OUTPUT_DIR/<host>/...
* - Extracts links from: a[href], img[src], link[href], source[src], iframe[src], embed[src], object[data], and srcset
*/
/**
 * Crawler settings, read once from environment variables at construction time.
 *
 * Variables read: START_URL, OUTPUT_DIR, BASIC_AUTH_USER, BASIC_AUTH_PASS,
 * BASIC_AUTH_PREFIXES, WP_POST_PASSWORD, DOWNLOAD_EXTENSIONS, MAX_PAGES,
 * MAX_DEPTH, REQUEST_TIMEOUT_SEC, USER_AGENT and COOKIE_FILE.
 * An unset or empty variable falls back to its default.
 */
final class Config
{
    /** START_URL, trimmed; empty string when the variable is missing. */
    public string $startUrl;

    /** OUTPUT_DIR without trailing slashes (default "/data"). */
    public string $outputDir;

    public ?string $basicUser;
    public ?string $basicPass;

    /** @var list<string> Comma-separated BASIC_AUTH_PREFIXES, trimmed, empties removed. */
    public array $basicPrefixes;

    public ?string $wpPostPassword;

    /** @var list<string> Lower-case extensions without a leading dot. */
    public array $downloadExtensions;

    /** MAX_PAGES, at least 1 (default 20000). */
    public int $maxPages;

    /** MAX_DEPTH, at least 0 (default 50). */
    public int $maxDepth;

    /** REQUEST_TIMEOUT_SEC, at least 1 (default 20). */
    public int $timeoutSec;

    public string $userAgent;

    /** COOKIE_FILE (default "/tmp/crawler_cookies.txt"). */
    public string $cookieFile;

    public function __construct()
    {
        $this->startUrl = trim(self::envString('START_URL') ?? '');
        $this->outputDir = rtrim(self::envString('OUTPUT_DIR') ?? '/data', '/');

        $this->basicUser = self::envString('BASIC_AUTH_USER');
        $this->basicPass = self::envString('BASIC_AUTH_PASS');
        $this->basicPrefixes = self::splitCsv(self::envString('BASIC_AUTH_PREFIXES') ?? '');

        $this->wpPostPassword = self::envString('WP_POST_PASSWORD');

        // Accept both "pdf" and ".pdf"; matching is done on the bare lower-case extension.
        $this->downloadExtensions = array_values(array_filter(
            array_map(
                static fn(string $s): string => ltrim(strtolower($s), '.'),
                self::splitCsv(self::envString('DOWNLOAD_EXTENSIONS') ?? 'pdf,jpg,jpeg,png,gif,webp'),
            ),
            static fn(string $s): bool => $s !== '',
        ));

        // A literal "0" is a legitimate MAX_DEPTH (start page only), so these
        // values are parsed explicitly instead of relying on truthiness.
        $this->maxPages = self::envInt('MAX_PAGES', 20000, 1);
        $this->maxDepth = self::envInt('MAX_DEPTH', 50, 0);
        $this->timeoutSec = self::envInt('REQUEST_TIMEOUT_SEC', 20, 1);

        $this->userAgent = self::envString('USER_AGENT') ?? 'SiteCrawler/2.0';
        $this->cookieFile = self::envString('COOKIE_FILE') ?? '/tmp/crawler_cookies.txt';
    }

    /**
     * Returns the value of an environment variable, or null when it is unset or empty.
     */
    private static function envString(string $name): ?string
    {
        $value = getenv($name);

        return ($value === false || $value === '') ? null : $value;
    }

    /**
     * Returns an integer environment variable, or $default when the variable is
     * unset, not a plain integer, or smaller than $min.
     */
    private static function envInt(string $name, int $default, int $min): int
    {
        $raw = self::envString($name);
        if ($raw === null || preg_match('/^\s*[+-]?\d+\s*$/', $raw) !== 1) {
            return $default;
        }
        $value = (int)trim($raw);

        return $value < $min ? $default : $value;
    }

    /**
     * Splits a comma-separated string into trimmed, non-empty items.
     *
     * @return list<string>
     */
    private static function splitCsv(string $csv): array
    {
        return array_values(array_filter(
            array_map(static fn(string $s): string => trim($s), explode(',', $csv)),
            static fn(string $s): bool => $s !== '',
        ));
    }
}
/**
 * Plain holder for the result of one HTTP request: status code, response
 * headers, body and the URL the request finally ended at.
 */
final class HttpResponse
{
    public int $status;

    /** @var array<string,list<string>> */
    public array $headers;

    public string $body;

    public string $effectiveUrl;

    /**
     * @param array<string,list<string>> $headers
     */
    public function __construct(int $status, array $headers, string $body, string $effectiveUrl)
    {
        $this->status = $status;
        $this->headers = $headers;
        $this->body = $body;
        $this->effectiveUrl = $effectiveUrl;
    }
}
/**
 * Breadth-first crawler limited to the host of Config::$startUrl.
 *
 * For every fetched page it either stores the response as a file (when the URL
 * extension or the content type marks it as downloadable) or parses the HTML,
 * downloads linked files and queues linked pages on the same host.
 * Progress is written to STDERR.
 */
final class Crawler
{
    private Config $cfg;

    /** @var array<string,bool> Normalized URLs already fetched as pages, including redirect targets. */
    private array $visited = [];

    /** @var array<string,bool> Normalized URLs of files written to disk. */
    private array $downloaded = [];

    /** @var array<string,bool> Normalized file links already requested, whether or not they succeeded. */
    private array $attemptedFiles = [];

    private string $startHost;
    private string $startScheme;

    /**
     * @throws RuntimeException when START_URL has no scheme/host or OUTPUT_DIR cannot be created
     */
    public function __construct(Config $cfg)
    {
        $this->cfg = $cfg;

        $startParts = parse_url($cfg->startUrl);
        if (!is_array($startParts) || empty($startParts['host']) || empty($startParts['scheme'])) {
            throw new RuntimeException("Invalid START_URL: {$cfg->startUrl}");
        }
        $this->startHost = strtolower((string)$startParts['host']);
        $this->startScheme = strtolower((string)$startParts['scheme']);

        // The trailing is_dir() covers another process creating the directory in between.
        if (!is_dir($cfg->outputDir) && !mkdir($cfg->outputDir, 0775, true) && !is_dir($cfg->outputDir)) {
            throw new RuntimeException("Cannot create OUTPUT_DIR: {$cfg->outputDir}");
        }

        // Best effort: cURL creates the jar itself when it can, so failure is only reported.
        if (!file_exists($cfg->cookieFile) && @file_put_contents($cfg->cookieFile, '') === false) {
            $this->log("Warning: cannot create cookie file: {$cfg->cookieFile}");
        }
    }

    /**
     * Runs the crawl until the queue is empty or MAX_PAGES pages were fetched.
     */
    public function run(): void
    {
        $start = $this->normalizeUrl($this->cfg->startUrl);
        $queue = new SplQueue();
        $queue->enqueue([$start, 0]);
        // URLs ever put on the queue; BFS discovers each URL at its minimal
        // depth first, so later duplicates can be dropped safely.
        $enqueued = [$start => true];
        $pagesProcessed = 0;

        while (!$queue->isEmpty()) {
            /** @var array{0:string,1:int} $item */
            $item = $queue->dequeue();
            [$url, $depth] = $item;

            if ($depth > $this->cfg->maxDepth || isset($this->visited[$url])) {
                continue;
            }
            if ($pagesProcessed >= $this->cfg->maxPages) {
                $this->log("Reached MAX_PAGES={$this->cfg->maxPages}. Stopping.");
                break;
            }
            $this->visited[$url] = true;
            $pagesProcessed++;

            $this->log("[$pagesProcessed] depth=$depth GET $url");
            $resp = $this->httpGet($url);
            if ($resp === null) {
                continue;
            }
            $this->log(" status={$resp->status} effective={$resp->effectiveUrl}");

            $effective = $this->normalizeUrl($resp->effectiveUrl);
            if (!$this->isSameHost($effective)) {
                continue;
            }
            if ($effective !== $url) {
                // Redirect target: skip if it was already handled, otherwise
                // remember it so it is not fetched again under its own URL.
                if (isset($this->visited[$effective])) {
                    continue;
                }
                $this->visited[$effective] = true;
            }

            // Checked before saving so error pages are never stored as files.
            if (!$this->isSuccess($resp->status)) {
                continue;
            }

            $contentType = $this->mediaType($resp);
            if ($this->isDownloadableUrl($effective, $contentType)) {
                $this->saveFile($effective, $resp->body, $contentType);
                continue;
            }
            if (!$this->isHtml($contentType)) {
                continue;
            }

            if ($this->cfg->wpPostPassword !== null && $this->looksLikeWpProtected($resp->body)) {
                $this->log(" WP protected page detected -> submitting post password cookie");
                if ($this->submitWpPostPassword()) {
                    $retry = $this->httpGet($effective);
                    if ($retry !== null) {
                        $this->log(" status={$retry->status} effective={$retry->effectiveUrl}");
                        $resp = $retry;
                        if (!$this->isHtml($this->mediaType($resp))) {
                            continue;
                        }
                    }
                }
            }

            foreach ($this->extractLinks($resp->body, $effective) as $link) {
                $link = $this->normalizeUrl($link);
                if (!$this->isSameHost($link) || $this->shouldSkipUrl($link)) {
                    continue;
                }
                if ($this->isDownloadableUrl($link, null)) {
                    $this->downloadLinkedFile($link);
                    continue;
                }
                if (isset($enqueued[$link]) || isset($this->visited[$link])) {
                    continue;
                }
                if ($depth + 1 > $this->cfg->maxDepth) {
                    continue;
                }
                $enqueued[$link] = true;
                $queue->enqueue([$link, $depth + 1]);
            }
        }

        $this->log("Done. Visited pages: " . $pagesProcessed . ", downloaded files: " . count($this->downloaded));
    }

    /**
     * Fetches a link that looks like a file and stores it when the response is
     * a successful, same-host, downloadable resource. Each link is requested at
     * most once per run.
     */
    private function downloadLinkedFile(string $link): void
    {
        if (isset($this->downloaded[$link]) || isset($this->attemptedFiles[$link])) {
            return;
        }
        $this->attemptedFiles[$link] = true;

        $resp = $this->httpGet($link);
        if ($resp === null) {
            return;
        }
        $this->log(" status={$resp->status} effective={$resp->effectiveUrl}");
        if (!$this->isSuccess($resp->status)) {
            return;
        }

        $effective = $this->normalizeUrl($resp->effectiveUrl);
        if (!$this->isSameHost($effective)) {
            return;
        }
        $contentType = $this->mediaType($resp);
        if ($this->isDownloadableUrl($effective, $contentType)) {
            $this->saveFile($effective, $resp->body, $contentType);
        }
    }

    /** True for 2xx status codes. */
    private function isSuccess(int $status): bool
    {
        return $status >= 200 && $status < 300;
    }

    /** True when the media type denotes HTML or XHTML. */
    private function isHtml(string $contentType): bool
    {
        return str_contains($contentType, 'text/html') || str_contains($contentType, 'application/xhtml');
    }

    /** Lower-case media type of the response without parameters ("" when the header is absent). */
    private function mediaType(HttpResponse $resp): string
    {
        $raw = $this->getHeaderValue($resp->headers, 'content-type') ?? '';

        return strtolower(trim(explode(';', $raw)[0]));
    }

    /**
     * Performs a GET request, following up to 10 redirects and sharing the cookie jar.
     *
     * @return HttpResponse|null null when the cURL handle cannot be created or the transfer fails
     */
    private function httpGet(string $url): ?HttpResponse
    {
        $ch = curl_init($url);
        if ($ch === false) {
            return null;
        }

        /** @var array<string,list<string>> $headersOut */
        $headersOut = [];
        $options = [
            CURLOPT_RETURNTRANSFER => true,
            CURLOPT_FOLLOWLOCATION => true,
            CURLOPT_MAXREDIRS => 10,
            // URLs come from crawled markup, so only web protocols are allowed.
            CURLOPT_PROTOCOLS => CURLPROTO_HTTP | CURLPROTO_HTTPS,
            CURLOPT_REDIR_PROTOCOLS => CURLPROTO_HTTP | CURLPROTO_HTTPS,
            CURLOPT_CONNECTTIMEOUT => $this->cfg->timeoutSec,
            CURLOPT_TIMEOUT => $this->cfg->timeoutSec,
            CURLOPT_USERAGENT => $this->cfg->userAgent,
            CURLOPT_SSL_VERIFYPEER => true,
            CURLOPT_SSL_VERIFYHOST => 2,
            CURLOPT_COOKIEJAR => $this->cfg->cookieFile,
            CURLOPT_COOKIEFILE => $this->cfg->cookieFile,
            CURLOPT_HEADERFUNCTION => static function ($ch, string $headerLine) use (&$headersOut): int {
                $len = strlen($headerLine);
                $line = trim($headerLine);
                // Every response in a redirect chain starts with a status line;
                // resetting here keeps only the headers of the final response.
                if (strncasecmp($line, 'HTTP/', 5) === 0) {
                    $headersOut = [];
                    return $len;
                }
                if ($line === '' || !str_contains($line, ':')) {
                    return $len;
                }
                [$name, $value] = explode(':', $line, 2);
                $headersOut[strtolower(trim($name))][] = trim($value);
                return $len;
            },
        ];
        if ($this->needsBasicAuth($url)) {
            $options[CURLOPT_HTTPAUTH] = CURLAUTH_BASIC;
            $options[CURLOPT_USERPWD] = $this->cfg->basicUser . ':' . $this->cfg->basicPass;
        }
        curl_setopt_array($ch, $options);

        $body = curl_exec($ch);
        if ($body === false) {
            $this->log(" curl error: " . curl_error($ch));
            return null;
        }
        $status = (int)curl_getinfo($ch, CURLINFO_RESPONSE_CODE);
        $effectiveUrl = (string)curl_getinfo($ch, CURLINFO_EFFECTIVE_URL);
        // No curl_close(): it has no effect since PHP 8.0; the handle is freed when $ch goes out of scope.

        return new HttpResponse($status, $headersOut, (string)$body, $effectiveUrl);
    }

    /**
     * True when credentials are configured and the URL path starts with one of
     * the configured prefixes (a leading "/" is added to prefixes lacking it).
     */
    private function needsBasicAuth(string $url): bool
    {
        if ($this->cfg->basicUser === null || $this->cfg->basicPass === null) {
            return false;
        }
        if ($this->cfg->basicPrefixes === []) {
            return false;
        }
        $parts = parse_url($url);
        $path = is_array($parts) && isset($parts['path']) ? (string)$parts['path'] : '/';
        foreach ($this->cfg->basicPrefixes as $prefix) {
            if ($prefix === '') {
                continue;
            }
            $p = $prefix[0] === '/' ? $prefix : '/' . $prefix;
            if (str_starts_with($path, $p)) {
                return true;
            }
        }
        return false;
    }

    /** Heuristic: the markup contains both the password field name and the postpass form action. */
    private function looksLikeWpProtected(string $html): bool
    {
        return str_contains($html, 'post_password') && str_contains($html, 'wp-login.php?action=postpass');
    }

    /**
     * POSTs the configured post password to /wp-login.php?action=postpass on the
     * start host so that any cookie set in response lands in the shared jar.
     *
     * @return bool true when the final status is in the 2xx-3xx range
     */
    private function submitWpPostPassword(): bool
    {
        if ($this->cfg->wpPostPassword === null || $this->cfg->wpPostPassword === '') {
            return false;
        }
        $action = $this->startScheme . '://' . $this->startHost . '/wp-login.php?action=postpass';
        $ch = curl_init($action);
        if ($ch === false) {
            return false;
        }
        curl_setopt_array($ch, [
            CURLOPT_RETURNTRANSFER => true,
            CURLOPT_FOLLOWLOCATION => true,
            CURLOPT_MAXREDIRS => 10,
            CURLOPT_PROTOCOLS => CURLPROTO_HTTP | CURLPROTO_HTTPS,
            CURLOPT_REDIR_PROTOCOLS => CURLPROTO_HTTP | CURLPROTO_HTTPS,
            CURLOPT_POST => true,
            CURLOPT_POSTFIELDS => http_build_query(['post_password' => $this->cfg->wpPostPassword]),
            CURLOPT_CONNECTTIMEOUT => $this->cfg->timeoutSec,
            CURLOPT_TIMEOUT => $this->cfg->timeoutSec,
            CURLOPT_USERAGENT => $this->cfg->userAgent,
            CURLOPT_SSL_VERIFYPEER => true,
            CURLOPT_SSL_VERIFYHOST => 2,
            CURLOPT_COOKIEJAR => $this->cfg->cookieFile,
            CURLOPT_COOKIEFILE => $this->cfg->cookieFile,
        ]);
        $body = curl_exec($ch);
        if ($body === false) {
            $this->log(" WP postpass submit curl error: " . curl_error($ch));
            return false;
        }
        $status = (int)curl_getinfo($ch, CURLINFO_RESPONSE_CODE);

        return $status >= 200 && $status < 400;
    }

    /**
     * True when the URL path ends in a configured extension, or when the given
     * content type is application/pdf or image/*.
     */
    private function isDownloadableUrl(string $url, ?string $contentType): bool
    {
        $parts = parse_url($url);
        $path = is_array($parts) && isset($parts['path']) ? (string)$parts['path'] : '';
        $ext = strtolower(pathinfo($path, PATHINFO_EXTENSION));
        if ($ext !== '' && in_array($ext, $this->cfg->downloadExtensions, true)) {
            return true;
        }
        if ($contentType === null || $contentType === '') {
            return false;
        }
        $ct = strtolower(trim(explode(';', $contentType)[0]));

        return $ct === 'application/pdf' || str_starts_with($ct, 'image/');
    }

    /**
     * Writes $data under OUTPUT_DIR/<start host>/<URL path>. An existing file is
     * not overwritten; a numeric suffix is added instead. The URL is recorded as
     * downloaded only after the write succeeded.
     */
    private function saveFile(string $url, string $data, ?string $contentType): void
    {
        $url = $this->normalizeUrl($url);
        if (isset($this->downloaded[$url])) {
            return;
        }

        $parts = parse_url($url);
        $path = is_array($parts) && isset($parts['path']) ? (string)$parts['path'] : '/file';
        // Dot segments are resolved here, so the target cannot leave the host directory.
        $path = $this->normalizePath($path);

        $filename = basename($path);
        if ($filename === '' || $filename === '/' || $filename === '.') {
            $filename = 'file';
        }
        if (!str_contains($filename, '.')) {
            $guess = $this->guessExtensionFromContentType($contentType);
            if ($guess !== null) {
                $filename .= '.' . $guess;
            }
        }

        $dirPath = dirname($path);
        if ($dirPath === DIRECTORY_SEPARATOR) {
            $dirPath = '';
        }
        $targetDir = rtrim(
            $this->cfg->outputDir . DIRECTORY_SEPARATOR . $this->startHost . $dirPath,
            DIRECTORY_SEPARATOR
        );
        if (!is_dir($targetDir) && !mkdir($targetDir, 0775, true) && !is_dir($targetDir)) {
            $this->log(" Cannot create directory: $targetDir");
            return;
        }

        $targetFile = $targetDir . DIRECTORY_SEPARATOR . $filename;
        if (file_exists($targetFile)) {
            $targetFile = $this->addSuffix($targetFile);
        }
        $bytes = file_put_contents($targetFile, $data);
        if ($bytes === false) {
            $this->log(" Failed saving file: $targetFile");
            return;
        }
        $this->downloaded[$url] = true;
        $this->log(" Saved (" . $bytes . " bytes): " . $targetFile);
    }

    /** Maps a handful of known media types to a file extension; null for anything else. */
    private function guessExtensionFromContentType(?string $contentType): ?string
    {
        if ($contentType === null || $contentType === '') {
            return null;
        }
        $ct = strtolower(trim(explode(';', $contentType)[0]));

        return match ($ct) {
            'application/pdf' => 'pdf',
            'image/jpeg' => 'jpg',
            'image/png' => 'png',
            'image/gif' => 'gif',
            'image/webp' => 'webp',
            default => null,
        };
    }

    /**
     * Returns the first non-existing "<name>_<i><ext>" for i = 2..9998 next to
     * $filePath; when all are taken the original path is returned unchanged.
     */
    private function addSuffix(string $filePath): string
    {
        $dir = dirname($filePath);
        $base = basename($filePath);
        $dot = strrpos($base, '.');
        $name = $dot === false ? $base : substr($base, 0, $dot);
        $ext = $dot === false ? '' : substr($base, $dot);
        for ($i = 2; $i < 9999; $i++) {
            $candidate = $dir . DIRECTORY_SEPARATOR . $name . '_' . $i . $ext;
            if (!file_exists($candidate)) {
                return $candidate;
            }
        }
        return $filePath;
    }

    /**
     * Collects absolute http(s) URLs referenced by a[href], link[href], img[src],
     * source[src], iframe[src], embed[src], object[data] and any srcset attribute.
     * Fragments are removed and duplicates dropped; the order of first appearance is kept.
     *
     * @return list<string>
     */
    private function extractLinks(string $html, string $baseUrl): array
    {
        $previous = libxml_use_internal_errors(true);
        try {
            $dom = new DOMDocument();
            // The XML declaration prefix makes libxml treat the markup as UTF-8.
            $loaded = $dom->loadHTML('<?xml encoding="utf-8" ?>' . $html);
        } finally {
            // Drop collected parser errors so they do not pile up across pages.
            libxml_clear_errors();
            libxml_use_internal_errors($previous);
        }
        if (!$loaded) {
            return [];
        }

        $xpath = new DOMXPath($dom);
        $raw = [];

        $sources = [
            '//a[@href]' => 'href',
            '//link[@href]' => 'href',
            '//img[@src]' => 'src',
            '//source[@src]' => 'src',
            '//iframe[@src]' => 'src',
            '//embed[@src]' => 'src',
            '//object[@data]' => 'data',
        ];
        foreach ($sources as $query => $attr) {
            $nodes = $xpath->query($query);
            if ($nodes === false) {
                continue;
            }
            foreach ($nodes as $node) {
                if ($node instanceof DOMElement) {
                    $raw[] = $node->getAttribute($attr);
                }
            }
        }

        $srcsetNodes = $xpath->query('//*[@srcset]');
        if ($srcsetNodes !== false) {
            foreach ($srcsetNodes as $node) {
                if (!($node instanceof DOMElement)) {
                    continue;
                }
                foreach (explode(',', $node->getAttribute('srcset')) as $candidate) {
                    // "url descriptor": the separator may be any whitespace, not just a space.
                    $tokens = preg_split('/\s+/', trim($candidate), 2) ?: [''];
                    $raw[] = $tokens[0];
                }
            }
        }

        $clean = [];
        foreach ($raw as $value) {
            $value = trim($value);
            // Empty and fragment-only references point at the current document.
            if ($value === '' || $value[0] === '#') {
                continue;
            }
            // Anything with a scheme other than http(s) (mailto:, javascript:,
            // tel:, data:, ...) must be rejected before resolution.
            if (preg_match('~^[a-z][a-z0-9+.-]*:~i', $value) === 1 && preg_match('~^https?://~i', $value) !== 1) {
                continue;
            }
            $resolved = explode('#', $this->resolveUrl($value, $baseUrl), 2)[0];
            if ($resolved !== '') {
                $clean[] = $resolved;
            }
        }

        return array_values(array_unique($clean));
    }

    /**
     * Resolves a reference against a base URL.
     *
     * References that already carry a scheme are returned unchanged, and so is
     * any reference when the base URL has no scheme or host. Query and fragment
     * are kept verbatim; only the path has its dot segments removed.
     */
    private function resolveUrl(string $href, string $baseUrl): string
    {
        $href = trim($href);
        if (preg_match('~^[a-z][a-z0-9+.-]*:~i', $href) === 1) {
            return $href;
        }

        $base = parse_url($baseUrl);
        $hasBase = is_array($base) && !empty($base['scheme']) && !empty($base['host']);

        if (str_starts_with($href, '//')) {
            $scheme = $hasBase ? (string)$base['scheme'] : $this->startScheme;
            return $scheme . ':' . $href;
        }
        if (!$hasBase) {
            return $href;
        }

        $origin = $base['scheme'] . '://' . $base['host'] . (isset($base['port']) ? ':' . $base['port'] : '');
        $basePath = (string)($base['path'] ?? '');
        if ($basePath === '') {
            $basePath = '/';
        }

        // Split the reference into path, "?query" and "#fragment".
        $fragment = '';
        $hashPos = strpos($href, '#');
        if ($hashPos !== false) {
            $fragment = substr($href, $hashPos);
            $href = substr($href, 0, $hashPos);
        }
        $query = '';
        $queryPos = strpos($href, '?');
        if ($queryPos !== false) {
            $query = substr($href, $queryPos);
            $href = substr($href, 0, $queryPos);
        }

        if ($href === '') {
            // Query-only or fragment-only reference: same document path.
            if ($query === '' && isset($base['query'])) {
                $query = '?' . $base['query'];
            }
            return $origin . $basePath . $query . $fragment;
        }

        $combined = str_starts_with($href, '/')
            ? $href
            : (string)preg_replace('~/[^/]*$~', '/', $basePath) . $href;

        $path = $this->normalizePath($combined);
        // normalizePath() drops the trailing slash; restore it for directory references.
        if ($path !== '/' && preg_match('~/\.{0,2}$~', $combined) === 1) {
            $path .= '/';
        }

        return $origin . $path . $query . $fragment;
    }

    /**
     * Collapses empty, "." and ".." segments; the result always starts with "/"
     * and has no trailing slash (except for the root itself).
     */
    private function normalizePath(string $path): string
    {
        $stack = [];
        foreach (explode('/', $path) as $segment) {
            if ($segment === '' || $segment === '.') {
                continue;
            }
            if ($segment === '..') {
                array_pop($stack);
                continue;
            }
            $stack[] = $segment;
        }
        return '/' . implode('/', $stack);
    }

    /**
     * Canonical form used as dedup key: lower-case scheme and host, no fragment,
     * repeated slashes in the path collapsed. URLs without scheme or host are
     * returned with only the fragment removed.
     */
    private function normalizeUrl(string $url): string
    {
        $url = explode('#', trim($url), 2)[0];
        $parts = parse_url($url);
        if (!is_array($parts) || empty($parts['scheme']) || empty($parts['host'])) {
            return $url;
        }
        $scheme = strtolower((string)$parts['scheme']);
        $host = strtolower((string)$parts['host']);
        $port = isset($parts['port']) ? ':' . $parts['port'] : '';
        $path = (string)($parts['path'] ?? '/');
        $path = preg_replace('~//+~', '/', $path) ?? $path;
        $query = isset($parts['query']) ? '?' . $parts['query'] : '';

        return "{$scheme}://{$host}{$port}{$path}{$query}";
    }

    /** True when the URL has a host equal (case-insensitively) to the start host. */
    private function isSameHost(string $url): bool
    {
        $p = parse_url($url);
        if (!is_array($p) || empty($p['host'])) {
            return false;
        }
        return strtolower((string)$p['host']) === $this->startHost;
    }

    /** True for URLs whose query string contains "replytocom=" or "utm_". */
    private function shouldSkipUrl(string $url): bool
    {
        // parse_url() yields null without a query and false for a malformed URL.
        $query = parse_url($url, PHP_URL_QUERY);
        if (!is_string($query) || $query === '') {
            return false;
        }
        return str_contains($query, 'replytocom=') || str_contains($query, 'utm_');
    }

    /**
     * Returns the first value recorded for a header, or null when it is absent.
     *
     * @param array<string,list<string>> $headers keyed by lower-case header name
     */
    private function getHeaderValue(array $headers, string $name): ?string
    {
        $name = strtolower($name);
        if (!isset($headers[$name]) || $headers[$name] === []) {
            return null;
        }
        return (string)$headers[$name][0];
    }

    /** Writes one line to STDERR. */
    private function log(string $msg): void
    {
        fwrite(STDERR, $msg . PHP_EOL);
    }
}
// --- bootstrap ---
// Script entry point: read the configuration from the environment, refuse to
// start without START_URL, then run the crawl. Any failure is reported on
// STDERR and ends the process with exit status 1.
$cfg = new Config();
if ('' === $cfg->startUrl) {
    fwrite(STDERR, "Missing START_URL env.\n");
    exit(1);
}

try {
    $crawler = new Crawler($cfg);
    $crawler->run();
} catch (Throwable $e) {
    fwrite(STDERR, sprintf('%s%s%s', "Fatal: ", $e->getMessage(), PHP_EOL));
    exit(1);
}