1. 采集的网站:http://www.abnova.com/support/publication.asp
2. 相关的代码:列表(规则采集)页面使用:phpQuery.php,可以参考:PHP curl_setopt函数用法介绍中篇
3. 产品详情页面的信息(不规则采集),参考代码如下:
<?php
/**
 * Scrapes one product-detail page per request and appends the result to a CSV.
 *
 * Flow:
 *   1. Read the list of detail-page URLs from list.txt (one URL per line).
 *   2. Pick the line selected by the `id` query parameter (defaults to 1).
 *   3. Load the page with phpQuery and, for every `#sub_product_info` element,
 *      extract the labelled fields, echo them for visual inspection and append
 *      one row to list-new.csv.
 *   4. Emit a small script that immediately redirects the browser to `?id=<id+1>`,
 *      so the crawl advances one URL per page load until the list is exhausted.
 *
 * Requires phpQuery/phpQuery.php next to this script.
 */
header('Content-Type:text/html;charset=UTF-8');
include 'phpQuery/phpQuery.php';
set_time_limit(0);

/**
 * Strips all tags from an HTML fragment and trims surrounding whitespace.
 *
 * @param mixed $html fragment as returned by phpQuery's html(); cast to string
 *                    so an empty selection yields '' instead of a warning
 * @return string
 */
function abnova_clean_text($html)
{
    return trim(strip_tags((string) $html));
}

/**
 * Reads a "label / value" pair located by element id.
 *
 * The <b> inside `#<elementId>` must contain exactly $label; the value is then
 * taken from the second <li> under the same element.
 *
 * @param mixed  $li        current `#sub_product_info` element
 * @param string $elementId numeric id used by the page (e.g. '10000')
 * @param string $label     expected label text including the trailing colon
 * @return string the value as raw HTML, or '' when the label does not match
 */
function abnova_raw_field_by_id($li, $elementId, $label)
{
    $found = abnova_clean_text(pq($li)->find("#{$elementId} b")->html());
    if ($found !== $label) {
        return '';
    }

    return (string) pq($li)->find("#{$elementId} li")->eq(1)->html();
}

/**
 * Reads a "label / value" pair from the n-th <ul> of the fourth `.part` block.
 *
 * @param mixed  $li       current `#sub_product_info` element
 * @param int    $ulIndex  zero-based index of the <ul> inside `.part` #3
 * @param string $label    expected label text including the trailing colon
 * @param bool   $inAnchor true when the value sits inside an <a> of the second <li>
 * @return string cleaned value, or '' when the label does not match
 */
function abnova_gene_field($li, $ulIndex, $label, $inAnchor)
{
    $found = abnova_clean_text(
        pq($li)->find('.part')->eq(3)->find('ul')->eq($ulIndex)->find('li')->eq(0)->find('b')->html()
    );
    if ($found !== $label) {
        return '';
    }

    $valueNode = pq($li)->find('.part')->eq(3)->find('ul')->eq($ulIndex)->find('li')->eq(1);
    if ($inAnchor) {
        $valueNode = $valueNode->find('a');
    }

    return abnova_clean_text($valueNode->html());
}

/**
 * Returns the text between "Publication Reference" and "Applications".
 *
 * @param string $text plain text content of the product element
 * @return string the captured text with line breaks removed, or '' when absent
 */
function abnova_extract_reference($text)
{
    $flat = preg_replace('/[\r\n]+/', '', (string) $text);
    if (preg_match('/Publication Reference(.*?)Applications/', (string) $flat, $match)) {
        return $match[1];
    }

    return '';
}

$id = isset($_GET['id']) ? intval($_GET['id']) : 1;

$listArr = file('list.txt');
if ($listArr === false) {
    // Without the URL list there is nothing to crawl; stop instead of looping.
    echo 'list.txt could not be read';
    exit;
}
if (!array_key_exists($id, $listArr)) {
    echo 'finished';
    exit;
}

$url = preg_replace('/[\r\n]+/', '', $listArr[$id]);
phpQuery::newDocumentFile($url);
$artList = pq("#sub_product_info");

// Fields located by element id: array(element id, label on the page, CSV key).
$idFields = array(
    array('10000', 'Product Description:', 'Description'),
    array('90000', 'Immunogen:', 'Immunogen'),
    array('110000', 'Host:', 'Host'),
    array('130000', 'Reactivity:', 'Reactivity'),
    array('240000', 'Isotype:', 'Isotype'),
);

// Application rows: <ul> index inside `.part` #2 => label echoed for inspection.
$applicationRows = array(
    1 => 'Western',
    2 => 'Western Blot',
    3 => 'Immunohistochemistry',
    4 => 'Immunofluorescence',
    5 => 'Sandwich ELISA',
    6 => 'ELISA',
);

// Gene rows: array(<ul> index inside `.part` #3, label, CSV key, value inside <a>?).
// Index 6 is intentionally absent, as in the original script.
$geneRows = array(
    array(1, 'Entrez GeneID:', 'GeneID', true),
    array(2, 'GeneBank Accession#:', 'GeneBank Accession', true),
    array(3, 'Protein Accession#:', 'Protein Accession', true),
    array(4, 'Gene Name:', 'Gene Name', false),
    array(5, 'Gene Alias:', 'Gene Alias', false),
    array(7, 'Omim ID:', 'Omim ID', false),
    array(8, 'Gene Ontology:', 'Gene Ontology', true),
);

echo '<pre>';
foreach ($artList as $li) {
    $data = array();

    // The product code is whatever follows the first '=' in the URL.
    $urlParts = explode('=', $url);
    $data['code'] = isset($urlParts[1]) ? $urlParts[1] : '';

    foreach ($idFields as $field) {
        list($elementId, $label, $key) = $field;
        $value = abnova_clean_text(abnova_raw_field_by_id($li, $elementId, $label));
        echo $label . ' ' . $value;
        $data[$key] = $value;
        echo '<br/>';
    }

    // Quality control: keep only the text before the first run of three <br>.
    $qcHtml = abnova_raw_field_by_id($li, '290000', 'Quality Control Testing:');
    $qcParts = explode('<br><br><br>', trim(strip_tags($qcHtml, '<br>')));
    $testing = trim($qcParts[0]);
    echo 'Quality Control Testing: ' . $testing;
    $data['Testing'] = $testing;
    echo '<br/>';
    echo '<hr/>';

    $appTitle = abnova_clean_text(pq($li)->find('.part')->eq(2)->find('.first_title b')->html());
    echo 'APP: ' . $appTitle;
    echo '<br/>';

    // Collect every non-empty application entry; they are joined into one CSV cell.
    $app = array();
    foreach ($applicationRows as $ulIndex => $label) {
        $value = abnova_clean_text(
            pq($li)->find('.part')->eq(2)->find('ul')->eq($ulIndex)->find('li')->eq(0)->html()
        );
        echo $label . ': ' . $value;
        echo '<br/>';
        if ($value !== '') {
            $app['w' . $ulIndex] = $value;
        }
    }
    $appstr = implode(',', $app);
    echo $appstr;
    $data['app'] = $appstr;
    echo '<hr/>';

    echo '<hr/>';
    echo '<br/>';

    $lastGeneRow = count($geneRows) - 1;
    foreach ($geneRows as $position => $row) {
        list($ulIndex, $label, $key, $inAnchor) = $row;
        $value = abnova_gene_field($li, $ulIndex, $label, $inAnchor);
        echo $label . ' ' . $value;
        $data[$key] = $value;
        // The original output has no line break after the final gene row.
        if ($position !== $lastGeneRow) {
            echo '<br/>';
        }
    }

    // Extract the publication reference (empty when the page has none).
    $data['Reference'] = abnova_extract_reference($li->textContent);

    print_r($data);

    // Append the row to the CSV; stop the crawl if the file cannot be opened,
    // otherwise the auto-redirect would silently skip records.
    $handle = fopen('list-new.csv', 'a');
    if ($handle === false) {
        echo 'list-new.csv could not be opened for writing';
        exit;
    }
    fputcsv($handle, $data);
    fclose($handle);
}
?>
<script>
function JumpUrl(){
    location.href='?id=<?php echo ($id+1);?>';
}
setTimeout('JumpUrl()',0);
</script>
原文:http://www.cnblogs.com/wuheng1991/p/5213089.html