function (file, ignoreBlanks = TRUE, handlers = NULL, replaceEntities = FALSE,
asText = FALSE, trim = TRUE, validate = FALSE, getDTD = TRUE,
isURL = FALSE, asTree = FALSE, addAttributeNamespaces = FALSE,
useInternalNodes = FALSE, isSchema = FALSE, fullNamespaceInfo = FALSE,
encoding = character(), useDotNames = length(grep("^\\.",
names(handlers)))>0, xinclude = TRUE, addFinalizer = TRUE,
error = htmlErrorHandler, isHTML = TRUE, options = integer(),
parentFirst = FALSE)
{
#asText=T则file作为XML文本处理
isMissingAsText = missing(asText)
#地址大于一个时要中止程序并抛出异常
if(length(file)>1){
file = paste(file, collapse ="\n")
if(!missing(asText)&&!asText)
stop(structure(list(message ="multiple URLs passed to xmlTreeParse. If this is the content of the file, specify asText = TRUE"),
class= c("MultipleURLError","XMLParserError",
"simpleError","error","condition")))
asText = TRUE
}
#当isURL非空时且是XML的时候,才修改URL为数值型
#比如这里isURL=1
if(missing(isURL)&&!asText)
isURL <- length(grep("^(http|ftp|file)://", file, useBytes = TRUE,
perl = TRUE))
#isHTML 默认为 TRUE
if(isHTML){
validate = FALSE
getDTD = FALSE
isSchema = FALSE
docClass ="HTMLInternalDocument"
}
else docClass = character()
#checkHandlerNames返回的是一个逻辑值,其作用是
checkHandlerNames(handlers,"DOM")
if(missing(fullNamespaceInfo)&& inherits(handlers,"RequiresNamespaceInfo"))
fullNamespaceInfo = TRUE
oldValidate = xmlValidity()
xmlValidity(validate)
on.exit(xmlValidity(oldValidate))
if(!asText && isURL == FALSE){
if(file.exists(file)== FALSE)
if(!missing(asText)&& asText == FALSE){
e = simpleError(paste("File", file,"does not exist"))
class(e)= c("FileNotFound",class(e))
stop(e)
}
else asText <- TRUE
}
if(asText && length(file)>1)
file = paste(file, collapse ="\n")
old = setEntitySubstitution(replaceEntities)
on.exit(setEntitySubstitution(old), add = TRUE)
if(asText && length(grep(sprintf("^%s?\\s*<",BOMRegExp),
file, perl = TRUE, useBytes = TRUE))==0){
if(!isHTML ||(isMissingAsText &&!inherits(file,"AsIs"))){
e = simpleError(paste("XML content does not seem to be XML:",
sQuote(file)))
class(e)= c("XMLInputError",class(e))
(if(isHTML)
warning
else stop)(e)
}
}
if(!is.logical(xinclude)){
xinclude =as.logical(xinclude)
}
if(!asText &&!isURL)
file = path.expand(as.character(file))
if(useInternalNodes && trim){
prevBlanks =.Call("RS_XML_setKeepBlanksDefault",0L,
PACKAGE ="XML")
on.exit(.Call("RS_XML_setKeepBlanksDefault", prevBlanks,
PACKAGE ="XML"), add = TRUE)
}
.oldErrorHandler = setXMLErrorHandler(error)
on.exit(.Call("RS_XML_setStructuredErrorHandler",.oldErrorHandler,
PACKAGE ="XML"), add = TRUE)
if(length(options))
options = sum(options)
ans <-.Call("RS_XML_ParseTree",as.character(file), handlers,
as.logical(ignoreBlanks),as.logical(replaceEntities),
as.logical(asText),as.logical(trim),as.logical(validate),
as.logical(getDTD),as.logical(isURL),as.logical(addAttributeNamespaces),
as.logical(useInternalNodes),as.logical(isHTML),as.logical(isSchema),
as.logical(fullNamespaceInfo),as.character(encoding),
as.logical(useDotNames), xinclude, error, addFinalizer,
as.integer(options),as.logical(parentFirst), PACKAGE ="XML")
if(!missing(handlers)&& length(handlers)&&!as.logical(asTree))
return(handlers)
if(!isSchema && length(class(ans)))
class(ans)= c(docClass, oldClass(class(ans)))
if(inherits(ans,"XMLInternalDocument"))
addDocFinalizer(ans, addFinalizer)
elseif(!getDTD &&!isSchema){
class(ans)= oldClass("XMLDocumentContent")
}
ans
}
<environment: namespace:XML>
#test
#导包
library(XML)
#链接
url <-"http://www.r-datacollection.com/materials/html/fortunes.html"
#handers函数
h2 <- list(
startElement = function(node,...){
name <- xmlName(node)
if(name %in% c("div","title")){NULL}else{node}
},
comment = function(node){NULL}
)
#正式开始,函数参数
file=url
ignoreBlanks = TRUE
handlers = h2
replaceEntities = FALSE
asText = FALSE
trim = TRUE
validate = FALSE
getDTD = TRUE
isURL = FALSE
asTree = TRUE
addAttributeNamespaces = FALSE
useInternalNodes = FALSE
isSchema = FALSE
fullNamespaceInfo = FALSE
encoding = character()
useDotNames = length(grep("^\\.", names(handlers)))>0
xinclude = TRUE
addFinalizer = TRUE
error = XML:::htmlErrorHandler
isHTML = TRUE
options = integer()
parentFirst = FALSE
#函数体部分
#asText参数没有传入
#我们将其从missing(asText)改为TRUE
isMissingAsText = TRUE
#file的长度是否大于1,大于1如果asText未传入则要报错:传入了多个URL
#不大于1,则跳过
if(length(file)>1){
file = paste(file, collapse ="\n")
if(!missing(asText)&&!asText)
stop(structure(list(message ="multiple URLs passed to xmlTreeParse. If this is the content of the file, specify asText = TRUE"),
class= c("MultipleURLError","XMLParserError",
"simpleError","error","condition")))
asText = TRUE
}
# 本来是if (missing(isURL) && !asText)
# 我们修改为if (TURE && !asText)
#isURL参数没有传递且asText参数为假才执行
if(TRUE &&!asText)
isURL <- length(grep("^(http|ftp|file)://", file, useBytes = TRUE,
perl = TRUE))
#只要有http|ftp|file中的一个协议开头,比如http://,就是URL了。
# 此时isURL=1
#是否为HTML,是的
if(isHTML){
validate = FALSE
getDTD = FALSE #从默认值T改为了F
isSchema = FALSE
docClass ="HTMLInternalDocument"
}else{docClass = character()}#否则创建空的
# class(docClass)
#返回TRUE通过检验了,否则函数会中止并抛出异常
XML:::checkHandlerNames(handlers,"DOM")
#fullNamespaceInfo参数whether to provide the namespace URI and prefix on each node
#其实就是是否在节点面前带上URI信息
#fullNamespaceInfo为空且handlers含有该属性,才执行这一步
#missing(fullNamespaceInfo)被替换为TRUE
#handlers并没有RequiresNamespaceInfo属性,所以不执行
if( TRUE && inherits(handlers,"RequiresNamespaceInfo"))
fullNamespaceInfo = TRUE
#以下两行的结果都是integer(0)
#奇妙~,先保存原有的配置
oldValidate = XML:::xmlValidity()
#当前的配置
XML:::xmlValidity(validate)
#还原原来的配置
on.exit(XML:::xmlValidity(oldValidate))
#asText为假,且isURL是假的
#结果为TFALSE,我们不必管他
if(!asText && isURL == FALSE){
if(file.exists(file)== FALSE)
#如果本地文件不存在
if(!missing(asText)&& asText == FALSE){
#抛出异常,文件不存在
e = simpleError(paste("File", file,"does not exist"))
class(e)= c("FileNotFound",class(e))
stop(e)
}
else asText <- TRUE
}
#此时asText是FALSE,这个跟我们无关
if(asText && length(file)>1)
file = paste(file, collapse ="\n")
#replaceEntities的默认值是FALSE
old = XML:::setEntitySubstitution(replaceEntities)
#old的值是FALSE
on.exit(XML:::setEntitySubstitution(old), add = TRUE)
#BOMRegExp是一个内置的常量吧
#看名字应该是基于BOM的正则表达式
#因为是FALSE,所以我们也先不去管它
if(asText && length(grep(
sprintf("^%s?\\s*<", XML:::BOMRegExp),
file, perl = TRUE, useBytes = TRUE
)
)==0)
{
if(!isHTML ||(isMissingAsText &&!inherits(file,"AsIs"))){
e = simpleError(paste("XML content does not seem to be XML:",
sQuote(file)))
class(e)= c("XMLInputError",class(e))
(if(isHTML)
warning
else stop)(e)
}
}
#xinclude默认值是TRUE
#以下三个if都是F,所以不管了
if(!is.logical(xinclude)){
xinclude =as.logical(xinclude)
}
if(!asText &&!isURL)
file = path.expand(as.character(file))
if(useInternalNodes && trim){
prevBlanks =.Call("RS_XML_setKeepBlanksDefault",0L,
PACKAGE ="XML")
on.exit(.Call("RS_XML_setKeepBlanksDefault", prevBlanks,
PACKAGE ="XML"), add = TRUE)
}
.oldErrorHandler = XML:::setXMLErrorHandler(error)
#所以,这种点开头的命名是什么鬼?
# class(.oldErrorHandler)
# [1] "list"
on.exit(.Call("RS_XML_setStructuredErrorHandler",.oldErrorHandler,
PACKAGE ="XML"), add = TRUE)
# length(options)是0,所以不执行
if(length(options))
options = sum(options)
#调用一个叫做RS_XML_ParseTree的函数
getAnywhere("RS_XML_ParseTree")
ans <-.Call("RS_XML_ParseTree",as.character(file), handlers,
as.logical(ignoreBlanks),as.logical(replaceEntities),
as.logical(asText),as.logical(trim),as.logical(validate),
as.logical(getDTD),as.logical(isURL),as.logical(addAttributeNamespaces),
as.logical(useInternalNodes),as.logical(isHTML),as.logical(isSchema),
as.logical(fullNamespaceInfo),as.character(encoding),
as.logical(useDotNames), xinclude, error, addFinalizer,
as.integer(options),as.logical(parentFirst), PACKAGE ="XML")
print(ans)
print("-------我是可爱的分割线------------------")
#这里的missing(handlers)我们就不改了哈
#毕竟只要有默认值,他都觉得是TRUE
#和我们的确传递了处理函数的效果TRUE是一样的
if(!missing(handlers)&& length(handlers)&&!as.logical(asTree))
return("呵呵")
if(!isSchema && length(class(ans)))
class(ans)= c(docClass, oldClass(class(ans)))
if(inherits(ans,"XMLInternalDocument")){
addDocFinalizer(ans, addFinalizer)
}elseif(!getDTD &&!isSchema){
print("看我的类型")
print(class(ans))
class(ans)= oldClass("XMLDocumentContent")
print(class(ans))
}
print("看条件判断的逻辑值")
print("1")
print(!missing(handlers)&& length(handlers)&&!as.logical(asTree))
print("2")
print(!isSchema && length(class(ans)))
print("3")
print(inherits(ans,"XMLInternalDocument"))
print("4")
print(!getDTD &&!isSchema)
print(class(ans))
ans
#例子1
testMissing<-function(a=TRUE,b=FALSE){
if(missing(b))
return("b is missing")
else"b is here "+b
}
testMissing(F)
# [1] "b is missing"
#例子2
if(missing(b))
return("b is missing")
# Error in missing(b) : ‘missing‘ can only be used for arguments
#例子3
b=NULL
if(missing(b))
return("b is missing")
#没有输出
if(isHTML){
validate = FALSE
getDTD = FALSE #从默认值T改为了F
isSchema = FALSE
docClass ="HTMLInternalDocument"
}else{docClass = character()}#否则创建空的
}else docClass = character()
function(){
if(isHTML){
validate = FALSE
getDTD = FALSE #从默认值T改为了F
isSchema = FALSE
docClass ="HTMLInternalDocument"
}
else docClass = character()#否则创建空的
}
function (handlers, id ="SAX")
{
if(is.null(handlers)) #为空,则返回TRUE
return(TRUE)
ids = names(handlers) #取出handlers中的函数名
i = match(ids,GeneralHandlerNames) #匹配,返回逻辑值向量
prob = any(!is.na(i))#任一个回空才是TRUE
if(prob){
warning("future versions of the XML package will require names of general handler functions
to be prefixed by a . to distinguish them from handlers for nodes with those names. This _may_ affect the ",
paste(names(handlers)[!is.na(i)], collapse =", "))
}
#任意一个handler中的函数不是函数类型,则抛出异常
if(any(w <-!sapply(handlers,is.function)))
warning("some handlers are not functions: ", paste(names(handlers[w]),
collapse =", "))
#返回TRUE,后续代码继续运行
!prob
}
<environment: namespace:XML>
> XML:::GeneralHandlerNames
$SAX
[1]"text" "startElement"
[3]"endElement" "comment"
[5]"startDocument" "endDocument"
[7]"processingInstruction""entityDeclaration"
[9]"externalEntity"
$DOM
[1]"text" "startElement"
[3]"comment" "entity"
[5]"cdata" "processingInstruction"
> h2 <- list(
+ startElement = function(node,...){
+ name <- xmlName(node)
+ if(name %in% c("div","title")){NULL}else{node}
+ },
+ comment = function(node){NULL}
+)
> handlers<-h2
> ids = names(handlers)
> ids
[1]"startElement""comment"
> i = match(ids, XML:::GeneralHandlerNames)
> i
[1] NA NA
>?match
starting httpd help server ... done
>!is.na(i)#如果i中有NA,则返回F,没有NA则返回T
[1] FALSE FALSE
> prob = any(!is.na(i))#当i中的任意一个元素都不是NA的时候,prob才返回T
> prob
[1] FALSE
> testMatch<-c("a","c")
> testSet <- c(‘a‘,‘b‘,‘c‘)
> match(testMatch,testSet)
[1]13
x <-10
class(x)# "numeric"
oldClass(x)# NULL
#看了下文档,我个人觉得oldClass是S语言的余毒啊!!!
inherits(x,"a")#FALSE
class(x)<- c("a","b")
# x
# [1] 10
# attr(,"class")
# [1] "a" "b"
#即,x为10这个变量被赋予了两个class,分别名为"a"和 "b"
inherits(x,"a")#TRUE
inherits(x,"a", TRUE)# 1
inherits(x, c("a","b","c"), TRUE)# 1 2 0
#以下两行的结果都是integer(0)
#奇妙~,先保存原有的配置
oldValidate = XML:::xmlValidity()
#使用当前的配置
XML:::xmlValidity(validate)
#还原
on.exit(XML:::xmlValidity(oldValidate))
> XML:::xmlValidity()
integer(0)
> getAnywhere(xmlValidity)
A single object matching ‘xmlValidity’ was found
It was found in the following places
namespace:XML
with value
function (val = integer(0))
{
.Call("RS_XML_getDefaultValiditySetting",as.integer(val),
PACKAGE ="XML")
}
<environment: namespace:XML>
> opar <- par(bg=‘lightblue‘)
> on.exit(par(opar))
> plot(c(1,2,3),c(4,5,6))#蓝色背景
> plot(c(1,2,3,4,5),runif(5))#蓝色背景
#此时关闭绘图窗口
> plot(c(1,2,3,4,5),rnorm(5))
#白色背景
plot_with_big_margins <- function(...)
{
old_pars <- par(mar = c(10,9,9,7))
on.exit(par(old_pars))
plot(...)
}
plot_with_big_margins(with(cars, speed, dist))
#不关闭图像窗口,此时再运行如下语句
plot(c(1,2,3),c(4,5,6))
对比
plot_with_big_margins <- function(...)
{
par(mar = c(10,9,9,7))
plot(...)
}
plot_with_big_margins(with(cars, speed, dist))
#不关闭图像窗口,此时再运行:
plot(c(1,2,3),c(4,5,6))
old_pars <- par(mar = c(10,9,9,7))
on.exit(par(old_pars))
> old_pars <- par(mar = c(10,9,9,7))
> old_pars
$mar
[1]5.14.14.12.1
> op <- options(stringsAsFactors = FALSE)
> op
$stringsAsFactors
[1] TRUE
par() #得到的
mar是5.14.14.12.1plot_with_big_margins <- function(...)
{
old_pars <- par(mar = c(10,9,9,7)) #原参数被保存,新参数设置生效
print(par())
#得到的
mar是10,9,9,7
on.exit(par(old_pars)) #参数被还原为原来的参数
plot(...)
}
plot_with_big_margins(with(cars, speed, dist))
par() #得到的
mar是5.14.14.12.1
my_plot <- function()
{
with(cars, plot(speed, dist))
}
save_base_plot <- function(plot_fn, file)
{
png(file)
on.exit(dev.off())
plot_fn()
}
save_base_plot(my_plot,"testcars.png")
#replaceEntities的默认值是FALSE
old = XML:::setEntitySubstitution(replaceEntities)
#old的值是FALSE
on.exit(XML:::setEntitySubstitution(old), add = TRUE)
> XML:::setEntitySubstitution
function (val)
.Call("RS_XML_SubstituteEntitiesDefault",as.logical(val), PACKAGE ="XML")
<environment: namespace:XML>
.oldErrorHandler = XML:::setXMLErrorHandler(error)
#所以,这种点开头的命名是什么鬼?
# class(.oldErrorHandler)
# [1] "list"
on.exit(.Call("RS_XML_setStructuredErrorHandler",.oldErrorHandler,
PACKAGE ="XML"), add = TRUE)
> XML:::htmlErrorHandler
function (msg, code, domain, line, col, level, filename,class="XMLError")
{
e = makeXMLError(msg, code, domain, line, col, level, filename,
class)
dom = names(e$domain)
class(e)= c(names(e$code), sprintf("%s_Error", gsub("_FROM_",
"_", dom)),class(e))
if(e$code == xmlParserErrors["XML_IO_LOAD_ERROR"])
stop(e)
}
<environment: namespace:XML>
> XML:::setXMLErrorHandler
function (fun)
{
prev =.Call("RS_XML_getStructuredErrorHandler", PACKAGE ="XML")
sym = getNativeSymbolInfo("R_xmlStructuredErrorHandler",
"XML")$address
.Call("RS_XML_setStructuredErrorHandler", list(fun, sym),
PACKAGE ="XML")
prev
}
<environment: namespace:XML>
> getAnywhere(BOMRegExp)
A single object matching ‘BOMRegExp’ was found
It was found in the following places
namespace:XML
with value
[1]"(\\xEF\\xBB\\xBF|\\xFE\\xFF|\\xFF\\xFE)"
untar(download.packages(pkgs ="XML",
destdir =".",
type ="source")[,2])
R自动数据收集第二章HTML笔记2(主要关于htmlTreeParse函数)
原文:http://www.cnblogs.com/xuanlvshu/p/6238408.html