This is a beginner tutorial on writing a script to scrape posts from bbs.nga.cn, with support for downloading images. It mainly draws on github.com/ludoux (the ngapost2md project) — thanks to that author.
Technique notes:
1. Text transcoding (without the options hash, Ruby can sometimes fail to convert certain characters and raise an error):
b = a.encode('utf-8', 'gbk', {:invalid => :replace, :undef => :replace})
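As a rough illustration of why the options hash matters — the byte string below is a made-up example of invalid GBK input, not something from NGA:

bad = "abc\xff".force_encoding('gbk')
bad.encode('utf-8', 'gbk')
# typically raises Encoding::InvalidByteSequenceError, because 0xFF is not valid GBK
bad.encode('utf-8', 'gbk', :invalid => :replace, :undef => :replace)
# => "abc" plus a replacement character ("\uFFFD") -- the bad byte is replaced instead of raising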
2. Nokogiri usage
nk = Nokogiri::XML(xml, nil, 'utf-8')
# Collect usernames
user_names = {}
username_nodes = nk.xpath('/root/__U/item')
username_nodes.each do |node|
  uid = node.xpath('./uid').text # text
  user_name = node.xpath('./username').text
  user_names[uid] = user_name
end
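For orientation, the XPath expressions above assume the lite=xml response is shaped roughly like this (a hand-written sketch inferred from the code, not a verbatim NGA response; the values are placeholders):

<root>
  <__U>
    <item>
      <uid>62014742</uid>
      <username>someuser</username>
    </item>
    <!-- more <item> entries -->
  </__U>
  <__R>
    <!-- one <item> per reply; see read_xml in the full code -->
  </__R>
</root>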
3. How string.scan(re) differs from re.match(string) when matching with regular expressions
list = text.scan(/\[quote\](.*?)\[\/quote\]/m).map{|x| x[0]}
list.each do |x|
  name = /\[uid=(.*?)\](.*?)\[\/uid\]/.match(x)[2] # username of the person being replied to
  # ...
end
Note: re.match(string) returns a MatchData object, and the captured values are indexed from 1 (index 0 is the whole match) — see the Ruby docs. string.scan(re) returns an array of every successful match x: if the regex has no capture groups, each x is simply the matched text; if it has capture groups, each x is an array of the captured values, indexed from 0 as usual. (The two conventions are genuinely easy to mix up.)
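A small worked example of the two indexing conventions (the sample string is made up for illustration):

text = '[uid=101]alice[/uid] and [uid=202]bob[/uid]'

m = /\[uid=(.*?)\](.*?)\[\/uid\]/.match(text)
m[0]  # => "[uid=101]alice[/uid]"  -- the whole match
m[1]  # => "101"                   -- capture groups start at 1
m[2]  # => "alice"

text.scan(/\[uid=(.*?)\](.*?)\[\/uid\]/)
# => [["101", "alice"], ["202", "bob"]]  -- with groups, each element is an array indexed from 0

text.scan(/\[uid=\d+\]\w+\[\/uid\]/)
# => ["[uid=101]alice[/uid]", "[uid=202]bob[/uid]"]  -- no groups, so each element is the matched text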
4. Results of logical expressions
end_page = nil
# ... could anything have changed end_page by this point?
end_page = (end_page || post_info['page_c'])
# (nil || 1) returns 1, while (nil && 1) returns nil.
# Professionals advise against putting several statements inside parentheses,
# but for a personal script I find it quite convenient.
# For example, a = (puts 'hi'; if true then 3 else 4 end) ends with a == 3.
Design notes:
1. Handle image downloads in a separate thread (a minimal sketch follows after this list). If the program should later be able to crawl several threads at once, that download-worker code needs to be written with more care, and images should be saved using absolute paths.
Many beginner tutorials never mention these points.
2. Unexpected network problems should disrupt the overall flow as little as possible. For the same reason, the functions that parse a page should take the page text as an argument rather than downloading from a URL themselves, since that makes offline debugging, and resuming after a pause, much easier.
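A minimal sketch of the worker-thread idea from point 1. This is not the full script's implementation (that uses its own download function and a global queue); the directory name and the example URL here are placeholders:

require 'open-uri'

queue = Queue.new
save_dir = File.expand_path('./pics')   # absolute path, so the worker is independent of the current directory
Dir.mkdir(save_dir) unless Dir.exist?(save_dir)

worker = Thread.new do
  loop do
    job = queue.pop
    break if job.nil?                   # nil is the shutdown signal
    begin
      data = URI.open(job[:link]).read
      File.open(File.join(save_dir, job[:fn]), 'wb') { |f| f.print data }
    rescue StandardError => e
      puts "failed to fetch #{job[:link]}: #{e.message}"   # log and keep going
    end
  end
end

queue << { :link => 'https://example.com/a.jpg', :fn => 'a.jpg' }  # placeholder URL
queue << nil          # tell the worker to finish
worker.join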
The full code is below. Set the NGA user cookie yourself.
# Thanks to https://github.com/ludoux/ngapost2md
# An NGA thread scraper rewritten with reference to that project
require 'open-uri'
require 'nokogiri'
require 'json'

Nga_header = {
  #~ 'User-agent' => 'Nga_Official/80023',
  'User-agent' => 'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36',
  'Cookie' => 'your NGA user cookie'
}
Saved_dir = '.'

def download(link)
  begin
    io = URI.open(link, Nga_header)
    content = io.read
    io.close
    return content
  rescue OpenURI::HTTPError => err1
    puts "#{link}:#{err1.message}"
    return :content_err
  rescue StandardError => err2
    puts "#{link}:#{err2.message}"
    return :net_err
  end
end

def read_xml(src)
  # Extract the reply info of every floor from the thread's XML text
  # Transcode
  src.force_encoding('gbk')
  #~ puts '---' #, src.encoding
  xml = src.encode('utf-8', 'gbk', {:invalid => :replace, :undef => :replace})
  nk = Nokogiri::XML(xml, nil, 'utf-8')
  # Collect usernames
  user_names = {}
  username_nodes = nk.xpath('/root/__U/item')
  username_nodes.each do |node|
    uid = node.xpath('./uid').text # text
    user_name = node.xpath('./username').text
    user_names[uid] = user_name
  end
  #~ puts "collected #{user_names.size} usernames"
  reply_nodes = nk.xpath('/root/__R/item')
  #~ puts reply_nodes.size
  # Extract reply info
  replies = []
  reply_nodes.each do |node|
    uid = node.xpath('./authorid').text # user id, kept as text in case non-numeric ids appear later
    score = node.xpath('./score').text.to_i # number of likes
    post_time = node.xpath('./postdate').text # reply time
    user_name = (user_names[uid] || (puts "no username found for uid=#{uid}"; '(unknown username)')) # username
    post_con = node.xpath('./content').text # reply content
    floor = node.xpath('./lou').text # floor number, as text
    reply_info = {
      'uid' => uid,
      'score' => score,
      'time' => post_time,
      'user' => user_name,
      'post' => post_con,
      'floor' => floor
    }
    replies << reply_info
  end
  return replies
end

def read_post(tid)
  # Fetch the thread's basic info
  link = "https://bbs.nga.cn/read.php?tid=#{tid}&lite=xml"
  src = download(link)
  if src.is_a?(Symbol)
    puts "network error"
    return []
  end
  src.force_encoding('gbk')
  xml = src.encode('utf-8', 'gbk', {:invalid => :replace, :undef => :replace})
  nk = Nokogiri::XML(xml, nil, 'utf-8')
  # Build thread info
  post_info = {'tid' => tid}
  reply_c = nk.xpath('/root/__T/replies').text.to_i # reply count
  if (reply_c == 0)
    page_c = 1
  else
    page_c = 1 + reply_c/20 # integer division
  end
  post_info['reply_c'] = reply_c
  post_info['page_c'] = page_c # page count
  post_info['subject'] = nk.xpath('/root/__T/subject').text # title
  post_info['zone'] = nk.xpath('/root/__F/name').text # forum section name
  return post_info
end

def pic_download
  $queue = Queue.new
  $pic_worker = Thread.new do
    loop do
      info = $queue.pop
      break if (info == nil)
      fn = info['fn']
      next if File.exist?(fn)
      r = download(info['link'])
      if r.is_a?(Symbol)
        puts "network error while downloading image #{fn}"
        next
      end
      File.open(fn, 'wb') do |fio|
        fio.print r
      end
    end
  end
end

def post_process(text, dir)
  # Process reply text: markup replacement and image downloading
  # Make sure the target folder has already been created
  # Markdown formatting
  text.gsub!('<br>', "\n")
  text.gsub!('<br/>', "\n")
  text.gsub!('[url]', ' ')
  text.gsub!('[/url]', ' ')
  text.scan(/\[del\](.*?)\[\/del\]/).each do |a|
    text.gsub!("[del]#{a[0]}[/del]", "~~#{a[0]}~~")
  end
  # AC-niang emoticons: simply replace with italics
  text.scan(/\[s\:(ac|a2)\:(.*?)\]/).each do |a|
    text.gsub!("[s:#{a[0]}:#{a[1]}]", "*(#{a[1]})*")
  end
  # Ordinary images
  img_links = text.scan(/\[img\](.*?)\[\/img\]/).map{|x| x[0]}
  img_links.each do |link|
    fn = link.split('/')[-1]
    text.gsub!("[img]#{link}[/img]", "")
    if (link[0] == '.')
      link.delete_prefix!('.')
      link = 'https://img.nga.178.com/attachments' + link
    end
    # To download the full-size images, uncomment the lines below
    # link.gsub!('.medium.jpg', '')
    # link.gsub!('.medium.png', '')
    #~ puts link;exit
    $queue << {'link' => link, 'fn' => "#{dir}/#{fn}"}
  end
  # Quoted replies
=begin XML example:
<content>[quote][pid=444847761,22951125,8]Reply[/pid] [b]Post by [uid=62014742]奇怪456[/uid]
(2020-08-13 22:23):[/b]<br/><br/>这政策很好啊,我们只要看落实的过程和结果,就能知道我们到底有没有跳出历史的轮回。<br/><br/>如果还是“楚王好细腰,宫中多饿死”这一套,那我们就还在历史的周期中,兴衰成败,可以预见了。[/quote]<br/><br/>历史轮回来的太快了,别到时某连大元都不如就滚蛋了</content>
=end
  #~ if text.include?('[/quote]')
  list = text.scan(/\[quote\](.*?)\[\/quote\]/m).map{|x| x[0]}
  list.each do |x|
    name = /\[uid=(.*?)\](.*?)\[\/uid\]/.match(x)[2] # username of the person being replied to
    con = x.split('):[/b]')[1]
    con.gsub!("\n", "\n>")
    tmp = ">#{name} said:\n>#{con}"
    text.gsub!("[quote]#{x}[/quote]", tmp)
  end
  # Per-floor comments are not supported yet
  text.scan(/\[b\](.*?)\[\/b\]/).each do |a|
    text.gsub!("[b]#{a[0]}[/b]", "**#{a[0]}**")
  end
  return text
end

def write_markdown(post_info, dir)
  tid = post_info['tid']
  fio = File.open("#{dir}/post.md", 'w')
  fio.puts "### #{post_info['subject']}", "Forum: #{post_info['zone']}, tid: #{tid}, fetched at: #{Time.now.strftime("%Y-%m-%d %H:%M:%S")}", ''
  replies = post_info['replies']
  replies.each do |rp|
    fio.print "----- \n ##### #{rp['floor']}."
    fio.print " <#{rp['score']}>" if (rp['score'] > 0)
    fio.puts " #{rp['user']} (#{rp['uid']}) @ #{rp['time']}"
    fio.puts post_process(rp['post'], dir)
    fio.puts ''
  end
  fio.close
end

def flow(tid, start_page, end_page = nil)
  # Full pipeline
  post_info = read_post(tid)
  replies = []
  end_page = (end_page || post_info['page_c'])
  puts "Downloading tid=#{tid}, start page: #{start_page}, end page: #{end_page}"
  dir = "#{Saved_dir}/#{tid}"
  unless Dir.exist?(dir)
    Dir.mkdir(dir)
  end
  pic_download
  # Iterate over the pages
  (start_page..end_page).each do |page_id|
    link = "https://bbs.nga.cn/read.php?tid=#{tid}&page=#{page_id}&lite=xml"
    r = download(link)
    if r.is_a?(Symbol) then next end
    replies += read_xml(r)
    print '.'
    sleep(1)
  end
  File.open("#{dir}/dump-#{tid}.json", 'w') do |fio|
    fio.print post_info.to_json
  end
  puts '', "fetched #{replies.size} replies"
  post_info['replies'] = replies
  puts 'Writing markdown file; downloading images may take a while'
  write_markdown(post_info, dir)
  $queue << nil
  sleep(1) while $pic_worker.alive?
  puts 'Done.'
end

# Command-line arguments
# ruby run.rb <tid> [start page]
if ($0 == __FILE__)
  tid = ARGV[0]
  a = ARGV[1].to_i
  if (a > 0) then start_page = a else start_page = 1 end
  flow(tid, start_page)
end
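Usage, assuming the script is saved as run.rb (the tid below is just a placeholder):

ruby run.rb 12345678        # crawl the whole thread from page 1
ruby run.rb 12345678 5      # resume from page 5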
Original post (in Chinese): https://www.cnblogs.com/uu6crella/p/13509110.html