php-spider — скрипт PHP-паука с обширным функционалом

use VDB\Spider\Discoverer\XPathExpressionDiscoverer;
use Symfony\Component\EventDispatcher\Event;
use VDB\Spider\Event\SpiderEvents;
use VDB\Spider\StatsHandler;
use VDB\Spider\Spider;

// Load Composer's autoloader so the classes imported above can be resolved.
require_once __DIR__ . '/../vendor/autoload.php';

// Create Spider
// NOTE(review): the seed below is a placeholder start URL. Replace it with the
// site you actually want to crawl; an empty seed gives the spider no page to start from.
$spider = new Spider('http://www.example.com');

// Add a URI discoverer. Without it, the spider does nothing. In this case, we want <a> tags from a certain <div>
// The XPath expression must match the markup of the seed page, so adjust it together with the seed.
$spider->getDiscovererSet()->set(new XPathExpressionDiscoverer("//div[@id='catalogs']//a"));

// Set some sane options for this example. In this case, we only get the first 10 items from the start page.
$spider->getDiscovererSet()->maxDepth = 1;
$spider->getQueueManager()->maxQueueSize = 10;

// Let's add something to enable us to stop the script.
// The closure is registered for the "crawl stopped by user" event; it prints a
// notice and terminates the script.
$spider->getDispatcher()->addListener(
    SpiderEvents::SPIDER_CRAWL_USER_STOPPED,
    function (Event $event) {
        echo "\nCrawl aborted by user.\n";
        exit();
    }
);

// Add a listener to collect stats to the Spider and the QueueManager.
// There are more components that dispatch events you can use.
$statsHandler = new StatsHandler();
// The handler only receives events from dispatchers it is subscribed to, so
// attach it to both the queue manager's dispatcher and the spider's dispatcher.
$spider->getQueueManager()->getDispatcher()->addSubscriber($statsHandler);
$spider->getDispatcher()->addSubscriber($statsHandler);

// Execute crawl
$spider->crawl();

// Build a report: one line per counter exposed by the stats handler.
echo "\n  ENQUEUED:  " . count($statsHandler->getQueued());
echo "\n  SKIPPED:   " . count($statsHandler->getFiltered());
echo "\n  FAILED:    " . count($statsHandler->getFailed());
echo "\n  PERSISTED:    " . count($statsHandler->getPersisted());

// Finally we could do some processing on the downloaded resources.
// In this example, we will echo the title of all resources
foreach ($spider->getDownloader()->getPersistenceHandler() as $resource) {
    // Each resource exposes a DOM crawler; pull the text of its <title> element.
    echo "\n - " . $resource->getCrawler()->filterXPath('//title')->text();
}

Оставить комментарий

Ваш электронный адрес не будет опубликован. Обязательные поля помечены *