A record of my first attempt at scraping LetPub journal information with R; next time I will probably stick with Python.
library(XML)
library(RSelenium)
library(rvest)
library(stringr)
library(data.table)
library(magrittr)
library(xlsx)
letpub <- function(){
  # Do not read strings as factors
  options(stringsAsFactors = F)
  # Start the standalone Selenium server in the background (adjust the jar path to your machine)
  system("java -jar E:/study/python/selenium-server-standalone-2.50.1.jar", wait = F, invisible = T, minimized = T)
  # Configure the headless browser with a desktop user agent
  eCap <- list(phantomjs.page.settings.userAgent = "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:29.0) Gecko/20120101 Firefox/29.0")
  # remDr <- remoteDriver(browserName = "chrome", extraCapabilities = eCap)
  remDr <- remoteDriver(browserName = "phantomjs", extraCapabilities = eCap)
  # Open the browser and parse the first listing page
  remDr$open()
  u <- 'https://www.letpub.com.cn/index.php?page=journalapp&view=researchfield&fieldtag=&firstletter=&currentpage=1#journallisttable'
  remDr$navigate(u)
  opage <- remDr$getPageSource()[[1]] %>% read_html(encoding = "UTF-8") %>% htmlParse(encoding = 'UTF-8')
  # Total number of listing pages, taken from the jump-to-page form, then build all listing URLs
  nurl <- xpathSApply(opage, "//form[@name='jumppageform']", xmlValue) %>% str_extract('\\d+') %>% as.numeric()
  nurl <- 1:nurl
  aurl <- paste0('https://www.letpub.com.cn/index.php?page=journalapp&view=researchfield&fieldtag=&firstletter=&currentpage=', nurl, '#journallisttable')
  tn <- 1
  an <- length(aurl)
  atabl1 <- NULL
  tablcol <- c('ISSN', 'Journal', 'IF2021', 'Division', 'Category', 'Discipline', 'Is_SCI', 'OA', 'Employment', 'Refereeing', 'View')
  for(turl in aurl){
    remDr$navigate(turl)
    tpage <- remDr$getPageSource()[[1]] %>% read_html(encoding = "UTF-8") %>% htmlParse(encoding = 'UTF-8')
    # The second table with class 'table_yjfx' holds the journal list; drop its header row, its last row and column 11
    ttabl <- xpathSApply(tpage, "//table[@class='table_yjfx']")[[2]] %>% readHTMLTable(header = T) %>% set_colnames(.[1, ]) %>% data.table() %>% .[c(-1, -.N), -11]
    names(ttabl) <- tablcol
    tabbr <- ttabl$Journal
    # Each row carries two target='_blank' links; the odd-indexed ones are the journal-name links
    ttabl$Journal <- xpathSApply(tpage, "//table[@class='table_yjfx']/tbody/tr/td/a[@target='_blank']", xmlValue) %>% .[as.logical(1:length(.) %% 2)]
    ttabl$Abbr_Name <- str_remove(tabbr, ttabl$Journal)
    ttabl$Url <- xpathSApply(tpage, "//table[@class='table_yjfx']/tbody/tr/td/a[@target='_blank']/@href") %>% str_remove('./') %>% paste0('https://www.letpub.com.cn/', .) %>% .[as.logical(1:length(.) %% 2)]
    ttabl[Is_SCI == 'SCISCIE', Is_SCI := 'SCI/SCIE']
    atabl1 <- rbind(atabl1, ttabl)
    cat(sprintf("Listing page [%d] of %d crawled, [%.4f%%] complete", tn, an, tn/an*100), '\n')
    tn <- tn + 1
    # Pause between requests to avoid hammering the server
    Sys.sleep(10)
  }
  return(atabl1)
}
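One weak spot is that each page is fetched with a bare remDr$navigate() call, so a single timeout or network hiccup aborts the whole run. A minimal sketch of a retry wrapper is below; safe_navigate() and the 3-try limit are my own additions, not part of the script above.

# Sketch only: retry a navigation a few times before giving up.
safe_navigate <- function(remDr, url, tries = 3){
  for(i in seq_len(tries)){
    ok <- tryCatch({ remDr$navigate(url); TRUE }, error = function(e) FALSE)
    if(ok) return(invisible(TRUE))
    Sys.sleep(5)  # brief pause before retrying
  }
  warning(sprintf('failed to load %s after %d tries', url, tries))
  invisible(FALSE)
}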
dataset <- letpub()
write.csv(dataset, 'E:/dataset2.csv', row.names=F)
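Since the xlsx package is already loaded, the result could also be saved as an Excel workbook; this is just an example and the output path is arbitrary.

# Optional: keep an .xlsx copy as well
write.xlsx(dataset, 'E:/dataset2.xlsx', row.names = F)

Note also that remDr is local to letpub(), so the PhantomJS session is never closed; adding remDr$close() just before return(atabl1) would tidy that up.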
PS: This took far too much time and is more convoluted than doing it in Python; I'll just chalk it up as a learning exercise.