抓取某个网站上的参展商公司名称和展位号

一、代码文件(/Applications/projects/projects/php/zl.51baocuo.com/tests/crawlEx.php)

<?php
/**
 * Created by PhpStorm.
 * User: tangping
 * Date: 2019/8/28
 * Author: 442353346@qq.com
 * Time: 22:43
 */

// Crawl the paginated exhibitor list, pull the company name and booth number
// out of every item, and append the rows to result.csv (one "company,booth"
// line per exhibitor). Written to run on the PHP 5 CLI as well as on PHP 8.

$totalPage  =   60;

// NOTE(review): none of these patterns carries the `s` modifier, so `.` does
// not match a newline and an element whose markup spans several lines is not
// matched — confirm against the real page source before relying on the output.
$itemPattern    =   "/\<a class=\"item\".+?\<\/a\>/i";
$patternCompany =   "/\<p class=\"name\">.+?\<\/p\>/i";
$patternNumber  =   "/\<p class=\"booth\">.+?\<\/p\>/i";
$result         =   [];

// Returns the tag-stripped, trimmed text of the first match of $pattern in
// $html, or '' when the pattern does not match.
$extractText = function ($pattern, $html) {
    if (!preg_match($pattern, $html, $match)) {
        return '';
    }

    // preg_replace() yields null on a PCRE error; the cast keeps trim() safe.
    return trim((string) preg_replace('/<.+?>/', '', $match[0]));
};

// Quotes a value for CSV output only when it needs it (contains a comma, a
// double quote or a line break), doubling embedded quotes. The original
// str_replace(',', ',', ...) changed nothing, so a comma inside a name used
// to split the row into extra columns.
$csvField = function ($value) {
    if (strpbrk($value, ",\"\r\n") === false) {
        return $value;
    }

    return '"' . str_replace('"', '""', $value) . '"';
};

for ($page = 1; $page <= $totalPage; $page++) {

    $url    =   'http://www.xxxxx.com/exhibitorlist/index_' . $page . '-22--.htm';

    $downContent    =   file_get_contents($url);

    // file_get_contents() returns false on failure; an empty body is useless too.
    if ($downContent === false || $downContent === '') {
        echo $url, "\n";
        continue;
    }

    if (!preg_match_all($itemPattern, $downContent, $matches)) {
        echo $url, ",匹配失败。\n";
        continue;
    }

    foreach ($matches[0] as $matchItem) {

        $company    =   $extractText($patternCompany, $matchItem);
        $number     =   $extractText($patternNumber, $matchItem);

        // Strict comparison: empty() would also reject a legitimate "0".
        if ($company === '' || $number === '') {
            echo $url, ",公司名:{$company},展位号:{$number}。\n";
            continue;
        }

        $result[] = $csvField($company) . ',' . $csvField($number);
    }

    // Report the page as done only after it was actually downloaded and parsed.
    echo "完成第{$page}页\n";
}

// implode() takes the separator first; the reversed legacy order is rejected
// by PHP 8. The trailing newline keeps a later appending run from gluing its
// first row onto the last row written here.
$csv = $result ? implode("\n", $result) . "\n" : '';

if (file_put_contents('result.csv', $csv, FILE_APPEND) === false) {
    echo "result.csv 写入失败。\n";
    exit(1);
}

echo "success\n";

二、终端执行:

/usr/local/php5/bin/php /Applications/projects/projects/php/zl.51baocuo.com/tests/crawlEx.php

Leave a Comment