/**
 * Import the articles listed in DATA_DIR . 'infomation.json' into the article
 * table and write one static HTML page per article.
 *
 * For every entry of the JSON list the method:
 *  - maps the entry's category name to a category id using the rows returned
 *    by getXmlAction() (columns 'name' and 'id');
 *  - downloads the page behind the entry's 'link' and, depending on the shape
 *    of the URL, extracts the source label, the secondary category name and
 *    the article body;
 *  - strips inline styling/scripts from the body and makes its src/href URLs
 *    absolute;
 *  - calls ArticleModel::add(); when add() throws and the stored 'data_check'
 *    differs from the freshly computed one, ArticleModel::update() is called
 *    instead (presumably add() throws on a duplicate id - confirm against
 *    ArticleModel);
 *  - renders /article/infoContent.phtml into
 *    PROJECT_ROOT/html/article/a<ID>.html and sleeps 0.7 s before the next item.
 *
 * Finally prints the number of rows that went through update().
 *
 * @return void
 */
public function insertAction() {
    ini_set('max_execution_time', '0');

    // Refresh the category list before inserting anything.
    $getHomeList = $this->getXmlAction();
    $arr_code = array(
        1 => '插入成功',
        -1 => '插入失败!请检查再试!',
        -2 => '获取xml文件失败!请检查再试!',
    );
    showApiCode($arr_code);

    // Turn the category rows into a name => id lookup table.
    $getHomeList = is_array($getHomeList) ? array_column($getHomeList, 'id', 'name') : array();

    // Colour assigned to an article, picked by category id modulo 3.
    $color = array(
        0 => '#a56d57',
        1 => '#4c889c',
        2 => '#658965',
    );

    $ArticleModel = new ArticleModel();
    $Utils_CaptureWebContent = new Utils_CaptureWebContent('');

    // Load the article list; an unreadable or malformed file is treated as an empty list.
    $raw = file_get_contents(DATA_DIR . 'infomation.json');
    $xml = ($raw === false) ? null : json_decode($raw, true);
    if (!is_array($xml)) {
        $xml = array();
    }

    // Number of updated rows. Must live outside the loop, otherwise the final
    // report only reflects the last processed item.
    $i = 0;

    foreach ($xml as $value) {
        // Without a link there is nothing to fetch and no id to derive.
        if (!is_array($value) || empty($value['link'])) {
            continue;
        }

        // Reset per-item state so that nothing leaks in from the previous article
        // when a branch below does not assign these variables.
        $from_site = '';
        $content = '';

        $title = $value['title'];
        $summary = $value['description'];
        // Primary category name; may be replaced by the scraped secondary category below.
        $category_name = $value['category'];
        $send_time = strtotime($value['pubDate']);
        $utime = $ctime = time();

        // Primary category id and the colour derived from it.
        $article_category = isset($getHomeList[$category_name]) ? $getHomeList[$category_name] : null;
        $category_color = $color[(int) $article_category % 3];

        $content_url = $value['link'];
        // The article id is the unsigned CRC32 of its URL.
        $id = sprintf("%u", crc32($content_url));

        // Fetch the page and force a UTF-8 <head> so DOMDocument decodes it correctly.
        $out = $this->getDataAction($content_url);
        $page = isset($out['output']) ? (string) $out['output'] : '';
        $page = (string) preg_replace(
            '/<head>([\s\S]+?)<\/head>/i',
            '<head><meta http-equiv="Content-Type" content="text/html;charset=utf-8"></head>',
            $page
        );

        // A fresh document per item: a failed or empty load must not leave the
        // previous page's DOM behind. loadHTML() rejects an empty string on PHP 8.
        $dom = new DOMDocument();
        if ($page !== '') {
            $previousLibxml = libxml_use_internal_errors(true);
            $dom->loadHTML($page);
            libxml_clear_errors();
            libxml_use_internal_errors($previousLibxml);
        }
        $xpath = new DOMXPath($dom);

        // Length of the URL's last path segment (including the leading '/');
        // it decides which of the three page layouts is parsed.
        $str = strlen((string) strrchr($content_url, '/'));

        // NOTE(review): the page is fetched a second time here through the capture
        // helper - confirm whether the getDataAction() output could be reused.
        $html = $Utils_CaptureWebContent->captureGet($content_url);
        $html = $Utils_CaptureWebContent->formatHtml($html);

        if ($str < 2) {
            // Type 1 (non-standard link), e.g.
            // http://kjs.mep.gov.cn/hjbhbz/bzwb/dqhjbh/jcgfffbz/
            // Nothing is scraped; $content stays empty and previously stored content is reused below.
        } elseif ($str < 10) {
            // Type 2 (standard link), e.g. http://www.gdczepb.gov.cn/detail/24441
            $site = $xpath->query("//div[@class='cdaylist']/ul/li");
            $firstSite = ($site && $site->length > 0) ? $site->item(0) : null;
            $from_site = $firstSite ? $firstSite->nodeValue : '';
            if (strlen(trim($from_site)) < 10) {
                // Too short to be a real source line: use the generic label.
                $from_site = '来源:资讯';
            }

            // The secondary category is the text after the last tag inside div.cnav.
            $cate_html = (string) $Utils_CaptureWebContent->matchHtmlElement("div", "class", "cnav", $html);
            // NOTE(review): this strips literal spaces only; if the markup uses
            // &nbsp; entities they are left in place - confirm the intended pattern.
            $cate_html = (string) preg_replace('/ /', '', $cate_html);
            $lastTagEnd = strrpos($cate_html, '>');
            $category_name = ($lastTagEnd === false) ? $cate_html : substr($cate_html, $lastTagEnd + 1);
            if (!$category_name) {
                // Otherwise the secondary category is the text of the last <a> inside div.cnav.
                $cate_name = $xpath->query("//div[@class='cnav']/a");
                $lastLink = ($cate_name && $cate_name->length > 0) ? $cate_name->item($cate_name->length - 1) : null;
                $category_name = $lastLink ? $lastLink->nodeValue : null;
            }

            $content = $Utils_CaptureWebContent->matchHtmlElement("div", "class", "contents", $html);
        } else {
            // Type 3 (semi-standard link), e.g.
            // http://kjs.mep.gov.cn/hjbhbz/bzwb/stzl/201109/t20110919_217415.htm
            // These pages carry no source line, so the source is fixed.
            $from_site = '来源:科学技术司';
            $tables = $Utils_CaptureWebContent->matchAllHtmlElement("table", "class", "txtnormal", $html);
            // implode() with the separator first: the (array, string) order was removed in PHP 8.0.
            $content = (isset($tables[0]) && is_array($tables[0])) ? implode('', $tables[0]) : '';
            $links = $Utils_CaptureWebContent->matchAllHtmlElement("a", "class", "dtdir12 CurrChnlCls", $html);
            $category_name = isset($links[1][3]) ? $links[1][3] : null;
        }

        $content = $this->normalizeScrapedContent($content, $content_url);

        // The first image of the body (if any) becomes the cover.
        $cover_url = array();
        preg_match('/<img[^>]*src="([^>"]*)"/is', $content, $cover_url);

        // When nothing could be scraped, fall back to the content stored earlier.
        $old_data = $ArticleModel->getItem($id);
        $trimmed = trim($content);
        if (empty($trimmed)) {
            $content = isset($old_data['content']) ? $old_data['content'] : null;
        } else {
            $content = htmlspecialchars($content);
        }
        if (empty($content)) {
            continue;
        }

        // Checksum over title + content, used to detect changed articles.
        $data_check = sprintf("%u", crc32($title . $content));
        $params = array(
            "id" => $id,
            "link" => $content_url,
            "article_category" => $article_category,
            "title" => $title,
            "summary" => $summary,
            "content" => $content,
            "send_time" => $send_time,
            "from_site" => $from_site,
            "ctime" => $ctime,
            "utime" => $utime,
            "category_name" => $category_name,
            "category_color" => $category_color,
            "cover_url" => isset($cover_url[1]) ? $cover_url[1] : null,
            "data_check" => $data_check,
        );

        try {
            $ArticleModel->add($params);
            // NOTE(review): prints the running counter after each insert; looks
            // like leftover debug output - confirm before removing.
            echo $i;
        } catch (Exception $e) {
            // add() failed; update the row when its stored checksum differs.
            // Loose comparison on purpose: the stored value may come back as int or string.
            $old_check = isset($old_data['data_check']) ? $old_data['data_check'] : null;
            if ($old_check != $data_check) {
                $ArticleModel->update($params, " id = {$id} ");
                $i++;
            }
        }

        // Render the article template and store the result as a static page.
        $data_test = array(
            'info' => array(
                'title' => $title,
                'content' => htmlspecialchars_decode($content),
                'from_site' => $from_site,
                'send_time' => $send_time,
            ),
        );
        ob_start();
        $this->display("/article/infoContent.phtml", $data_test);
        $id_html = ob_get_clean();
        file_put_contents(PROJECT_ROOT . '/html/article/a' . $id . '.html', $id_html);

        // Pause 0.7 s between articles.
        usleep(700000);
    }

    printf("本次更新了 %s 条数据", $i);
}

/**
 * Strip inline styling and scripts from scraped HTML and make its src/href URLs absolute.
 *
 * URL handling for double-quoted src/href attributes:
 *  - "upload...", "/upload..." and "./upload..." are resolved against the site root;
 *  - URLs that already carry a scheme, protocol-relative URLs ("//..."),
 *    fragment links ("#...") and empty values are left untouched;
 *  - other root-relative URLs ("/...") are resolved against the site root;
 *  - everything else is resolved against the directory of $contentUrl.
 *
 * @param string $content    HTML fragment taken from the page.
 * @param string $contentUrl URL of the page the fragment was taken from.
 * @return string The cleaned fragment.
 */
private function normalizeScrapedContent($content, $contentUrl) {
    $content = (string) $content;

    // Remove style/width attributes and whole <style>/<script> blocks.
    $stripped = preg_replace(
        array(
            '/style=".+?"/i',
            '/width=".+?"/i',
            '/<style([\s\S]+?)<\/style>/i',
            '/<script([\s\S]+?)<\/script>/i',
        ),
        '',
        $content
    );
    if ($stripped !== null) {
        // preg_replace() returns null on a PCRE error; keep the input in that case.
        $content = $stripped;
    }

    // Site root (scheme://host[:port]/) and directory of the page itself.
    $parts = parse_url($contentUrl);
    $scheme = isset($parts['scheme']) ? $parts['scheme'] : 'http';
    $authority = isset($parts['host']) ? $parts['host'] : '';
    if (isset($parts['port'])) {
        $authority .= ':' . $parts['port'];
    }
    $siteRoot = $scheme . '://' . $authority . '/';
    $pageDir = dirname($contentUrl) . '/';

    $rewritten = preg_replace_callback(
        '/\b(src|href)="([^"]*)"/i',
        function ($m) use ($siteRoot, $pageDir) {
            $url = $m[2];
            $upload = array();
            if (preg_match('/^\.?\/?(upload.+)$/s', $url, $upload)) {
                // Upload paths always live under the site root.
                $url = $siteRoot . $upload[1];
            } elseif ($url === '' || preg_match('/^(?:[a-z][a-z0-9+.\-]*:|\/\/|#)/i', $url)) {
                // Already absolute (http:, https:, mailto:, javascript:, data:, ...),
                // protocol-relative, fragment-only or empty: nothing to do.
            } elseif ($url[0] === '/') {
                $url = $siteRoot . ltrim($url, '/');
            } else {
                $url = $pageDir . preg_replace('/^\.\//', '', $url);
            }
            return $m[1] . '="' . $url . '"';
        },
        $content
    );

    return ($rewritten === null) ? $content : $rewritten;
}
PHP 原生生成静态缓存,配合 crontab 定时刷新缓存,不需要第三方模板
原文:http://www.cnblogs.com/zhebie/p/6703058.html