首页 > Web开发 > 详细

对网站的代码采集实例

时间:2016-02-24 15:40:41      阅读:179      评论:0      收藏:0      [点我收藏+]

1.采集的网站:http://www.abnova.com/support/publication.asp

2. 相关的代码:列表(规则采集)页面使用:phpQuery.php,可以参考:PHP curl_setopt函数用法介绍中篇

3.产品详情页面:信息(不规则采集),参考代码如下:

<?php
/**
 * Scrapes one product-detail page per request and appends the extracted
 * fields as one row to list-new.csv.
 *
 * The page to fetch is entry number `id` (query parameter) of list.txt. After
 * the row is written, the emitted <script> redirects the browser to
 * `?id=<id + 1>`, so opening the script once walks through the list. When
 * `id` is not a valid index of list.txt the script prints "finished" and stops.
 *
 * NOTE(review): `id` defaults to 1 while file() indexes lines from 0, so the
 * first line of list.txt is only processed when `?id=0` is requested
 * explicitly — confirm whether that line is meant to be skipped.
 */
header('Content-Type:text/html;charset=UTF-8');
include 'phpQuery/phpQuery.php';
set_time_limit(0);

/**
 * Returns the html() of a phpQuery selection with tags stripped and
 * surrounding whitespace trimmed.
 *
 * The html() result is cast to string first so a non-string result (e.g. for
 * a selection that matched nothing) is handled as an empty string.
 *
 * @param mixed  $selection   Object exposing html(), as returned by pq()/find()/eq().
 * @param string $allowedTags Tags strip_tags() should keep, e.g. '<br>'.
 * @return string
 */
function cleanHtml($selection, $allowedTags = '')
{
    return trim(strip_tags((string) $selection->html(), $allowedTags));
}

/**
 * Reads one labelled row addressed by an element id.
 *
 * The value (second <li> under the row) is returned only when the row's <b>
 * text equals the expected label; otherwise '' is returned.
 *
 * @param mixed  $li            Node being scraped.
 * @param string $rowSelector   Selector of the row, e.g. '#10000'.
 * @param string $expectedLabel Label text the row's <b> must contain.
 * @param string $allowedTags   Tags to keep in the returned value.
 * @return string
 */
function specValue($li, $rowSelector, $expectedLabel, $allowedTags = '')
{
    if (cleanHtml(pq($li)->find($rowSelector . ' b')) !== $expectedLabel) {
        return '';
    }

    return cleanHtml(pq($li)->find($rowSelector . ' li')->eq(1), $allowedTags);
}

/**
 * Returns the cleaned text of one <li> addressed positionally:
 * .part[$partIndex] -> ul[$listIndex] -> li[$itemIndex], optionally narrowed
 * to a descendant selector.
 *
 * @param mixed  $li            Node being scraped.
 * @param int    $partIndex     Zero-based index among ".part" elements.
 * @param int    $listIndex     Zero-based index among that part's <ul> elements.
 * @param int    $itemIndex     Zero-based index among that list's <li> elements.
 * @param string $childSelector Optional descendant selector inside the <li>.
 * @return string
 */
function partListItem($li, $partIndex, $listIndex, $itemIndex, $childSelector = '')
{
    $item = pq($li)->find('.part')->eq($partIndex)
        ->find('ul')->eq($listIndex)
        ->find('li')->eq($itemIndex);

    if ($childSelector !== '') {
        $item = $item->find($childSelector);
    }

    return cleanHtml($item);
}

/**
 * Reads one labelled row of the fourth ".part" section (index 3).
 *
 * The value is returned only when the <b> inside the row's first <li> equals
 * the expected label; otherwise '' is returned.
 *
 * @param mixed  $li            Node being scraped.
 * @param int    $listIndex     Zero-based <ul> index inside the section.
 * @param string $expectedLabel Label text expected in the first <li>.
 * @param string $valueChild    Descendant selector for the value ('' = whole <li>).
 * @return string
 */
function geneValue($li, $listIndex, $expectedLabel, $valueChild)
{
    if (partListItem($li, 3, $listIndex, 0, 'b') !== $expectedLabel) {
        return '';
    }

    return partListItem($li, 3, $listIndex, 1, $valueChild);
}

/**
 * Extracts every field of one product node, echoing each value as it goes.
 *
 * The key order of the returned array is the CSV column order.
 *
 * @param DOMNode $li   Node matched by "#sub_product_info".
 * @param string  $code Product code taken from the page URL.
 * @return array Field name => extracted text ('' when a field is absent).
 */
function extractProduct($li, $code)
{
    $data = array('code' => $code);

    // CSV key => array(row selector, label expected in the row's <b>).
    $specRows = array(
        'Description' => array('#10000', 'Product Description:'),
        'Immunogen'   => array('#90000', 'Immunogen:'),
        'Host'        => array('#110000', 'Host:'),
        'Reactivity'  => array('#130000', 'Reactivity:'),
        'Isotype'     => array('#240000', 'Isotype:'),
    );
    foreach ($specRows as $key => $row) {
        list($selector, $label) = $row;
        $data[$key] = specValue($li, $selector, $label);
        echo $label . '   ' . $data[$key];
        echo '<br/>';
    }

    // Only the text before the first run of three <br> tags is kept, so <br>
    // must survive tag stripping for this one row.
    $testingParts = explode('<br><br><br>', specValue($li, '#290000', 'Quality Control Testing:', '<br>'));
    $data['Testing'] = trim($testingParts[0]);
    echo 'Quality Control Testing:    ' . $data['Testing'];
    echo '<br/>';
    echo '<hr/>';

    // Section title of the third ".part"; displayed but not stored.
    echo 'APP:   ' . cleanHtml(pq($li)->find('.part')->eq(2)->find('.first_title b'));
    echo '<br/>';

    // Key in the joined list => array(<ul> index in the third ".part", echo prefix).
    $applicationRows = array(
        'w1' => array(1, 'Western:   '),
        'w2' => array(2, 'Western Blot:   '),
        'w3' => array(3, 'Immunohistochemistry:    '),
        'w4' => array(4, 'Immunofluorescence:    '),
        'w5' => array(5, 'Sandwich ELISA:   '),
        'w6' => array(6, 'ELISA:   '),
    );
    $applications = array();
    foreach ($applicationRows as $key => $row) {
        list($listIndex, $prefix) = $row;
        $value = partListItem($li, 2, $listIndex, 0);
        echo $prefix . $value;
        echo '<br/>';
        if ($value !== '') {
            $applications[$key] = $value;
        }
    }
    $data['app'] = implode(',', $applications);
    echo $data['app'];
    echo '<hr/>';

    echo '<hr/>';
    echo '<br/>';

    // CSV key => array(<ul> index in the fourth ".part", expected label,
    // descendant selector holding the value).
    $geneRows = array(
        'GeneID'             => array(1, 'Entrez GeneID:', 'a'),
        'GeneBank Accession' => array(2, 'GeneBank Accession#:', 'a'),
        'Protein Accession'  => array(3, 'Protein Accession#:', 'a'),
        'Gene Name'          => array(4, 'Gene Name:', ''),
        'Gene Alias'         => array(5, 'Gene Alias:', ''),
        'Omim ID'            => array(7, 'Omim ID:', ''),
        'Gene Ontology'      => array(8, 'Gene Ontology:', 'a'),
    );
    foreach ($geneRows as $key => $row) {
        list($listIndex, $label, $valueChild) = $row;
        $data[$key] = geneValue($li, $listIndex, $label, $valueChild);
        echo $label . '   ' . $data[$key];
        echo '<br/>';
    }

    // Publication reference: the text between "Publication Reference" and the
    // next "Applications" once line breaks are removed; '' when there is none.
    $flatText = preg_replace('/[\r\n]+/', '', $li->textContent);
    $match = array();
    $data['Reference'] = preg_match('/Publication Reference(.*?)Applications/', $flatText, $match)
        ? $match[1]
        : '';

    return $data;
}

/**
 * Appends one row to a CSV file.
 *
 * @param string $path CSV file to append to.
 * @param array  $row  Values to write, in column order.
 * @return bool False when the file could not be opened or written.
 */
function appendCsvRow($path, array $row)
{
    $handle = fopen($path, 'a');
    if ($handle === false) {
        return false;
    }

    $written = fputcsv($handle, $row);
    fclose($handle);

    return $written !== false;
}

$id = isset($_GET['id']) ? intval($_GET['id']) : 1;

$listArr = file('list.txt');
if ($listArr === false) {
    echo 'cannot read list.txt';
    exit;
}

if (!array_key_exists($id, $listArr)) {
    echo 'finished';
    exit;
}

$url = preg_replace('/[\r\n]+/', '', $listArr[$id]);

// The product code is whatever follows the first "=" in the URL; URLs
// without a query value yield an empty code instead of an undefined offset.
$urlParts = explode('=', $url);
$code = isset($urlParts[1]) ? $urlParts[1] : '';

echo '<pre>';

// A blank line in list.txt is skipped; the redirect below still advances.
if ($url !== '') {
    phpQuery::newDocumentFile($url);

    foreach (pq('#sub_product_info') as $li) {
        $data = extractProduct($li, $code);
        print_r($data);

        // Stop the redirect chain instead of silently dropping rows.
        if (!appendCsvRow('list-new.csv', $data)) {
            echo 'cannot write list-new.csv';
            exit;
        }
    }
}

?>
<script>
// Move on to the next entry of list.txt as soon as this page has loaded.
function JumpUrl(){
    location.href='?id=<?php echo ($id+1);?>';
}
setTimeout(JumpUrl,0);
</script>

对网站的代码采集实例

原文:http://www.cnblogs.com/wuheng1991/p/5213089.html

(0)
(0)
   
举报
评论 一句话评论(0)
关于我们 - 联系我们 - 留言反馈 - 联系我们:wmxa8@hotmail.com
© 2014 bubuko.com 版权所有
打开技术之扣,分享程序人生!