R语言爬虫——Letpub期刊信息—科研工具箱

请关注公众号【叨客学习资料】 在使用网站的过程中有疑问,请来公众号进行反馈哦

记录一下第一次用R语言爬Letpub期刊信息的代码,以后还是尽量用python吧。

library(XML)
library(RSelenium)
library(rvest)
library(stringr)
library(data.table)
library(magrittr)
library(xlsx)

letpub <- function() {
  # Scrape journal metadata from every summary page of letpub.com.cn and
  # return one combined data.table.
  #
  # Returns: a data.table with one row per journal and columns
  #   ISSN, Journal, IF2021, Division, Category, Discipline, Is_SCI,
  #   OA, Employment, Refereeing, View, Abbr_Name, Url.
  #
  # Requires: XML, RSelenium, rvest, stringr, data.table, magrittr loaded,
  # a local Java runtime, and the Selenium standalone jar at the path below.

  # Do not read strings as factors; restore the caller's options on exit
  # instead of leaving a global side effect behind.
  old_opts <- options(stringsAsFactors = FALSE)
  on.exit(options(old_opts), add = TRUE)

  # Launch the Selenium server in the background.
  # NOTE(review): hard-coded local jar path — confirm it exists on this machine.
  system("java -jar E:/study/python/selenium-server-standalone-2.50.1.jar",
         wait = FALSE, invisible = TRUE, minimized = TRUE)

  # Headless PhantomJS with a desktop Firefox user agent so the site serves
  # the normal page layout.
  eCap <- list(phantomjs.page.settings.userAgent =
                 "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:29.0) Gecko/20120101 Firefox/29.0")
  remDr <- remoteDriver(browserName = "phantomjs", extraCapabilities = eCap)

  # Open the first summary page and read the total page count out of the
  # jump-to-page form (first run of digits in its text).
  remDr$open()
  base <- "https://www.letpub.com.cn/index.php?page=journalapp&view=researchfield&fieldtag=&firstletter=&currentpage="
  remDr$navigate(paste0(base, 1, "#journallisttable"))
  opage <- remDr$getPageSource()[[1]] %>%
    read_html(encoding = "UTF-8") %>%
    htmlParse(encoding = "UTF-8")
  npage <- xpathSApply(opage, "//form[@name='jumppageform']", xmlValue) %>%
    str_extract("\\d+") %>%
    as.numeric()
  aurl <- paste0(base, seq_len(npage), "#journallisttable")

  tablcol <- c("ISSN", "Journal", "IF2021", "Division", "Category",
               "Discipline", "Is_SCI", "OA", "Employment", "Refereeing", "View")

  # Collect one data.table per page and bind them once at the end —
  # growing the result with rbind() inside the loop is O(n^2).
  pages <- vector("list", length(aurl))
  for (tn in seq_along(aurl)) {
    remDr$navigate(aurl[tn])
    tpage <- remDr$getPageSource()[[1]] %>%
      read_html(encoding = "UTF-8") %>%
      htmlParse(encoding = "UTF-8")

    # The second table_yjfx table holds the journal list. Drop the header
    # row, the trailing pager row (.N), and the 11th (button) column.
    ttabl <- xpathSApply(tpage, "//table[@class='table_yjfx']")[[2]] %>%
      readHTMLTable(header = TRUE) %>%
      set_colnames(.[1, ]) %>%
      data.table() %>%
      .[c(-1, -.N), -11]
    names(ttabl) <- tablcol

    # Each Journal cell concatenates full name + abbreviation; the clean
    # full names are the odd-numbered <a target="_blank"> texts, so keep
    # every other link and derive the abbreviation by string removal.
    tabbr <- ttabl$Journal
    links <- "//table[@class='table_yjfx']/tbody/tr/td/a[@target='_blank']"
    ttabl$Journal <- xpathSApply(tpage, links, xmlValue) %>%
      .[as.logical(seq_along(.) %% 2)]
    ttabl$Abbr_Name <- str_remove(tabbr, ttabl$Journal)
    ttabl$Url <- xpathSApply(tpage, paste0(links, "/@href")) %>%
      str_remove("./") %>%
      paste0("https://www.letpub.com.cn/", .) %>%
      .[as.logical(seq_along(.) %% 2)]

    # Normalise the merged flag text emitted by the site.
    ttabl[Is_SCI == "SCISCIE", Is_SCI := "SCI/SCIE"]

    pages[[tn]] <- ttabl
    cat(sprintf("Summary page, [%d] pages has been crawled, a total of %d, [%.4f%%] completed",
                tn, length(aurl), tn / length(aurl) * 100), "\n")
    Sys.sleep(10)  # throttle requests to be polite to the server
  }

  rbindlist(pages)
}


# Crawl the full Letpub journal list and export it as CSV.
# NOTE(review): hard-coded output path — confirm E:/ is writable here.
dataset <- letpub()
write.csv(dataset, "E:/dataset2.csv", row.names = FALSE)

PS:这个玩意太耗时间,相比较Python还复杂,就当是一次学习经历吧。

© 版权声明
THE END
喜欢就支持一下吧
点赞0 分享
评论 抢沙发
头像
请输入有效评论哦,肆意灌水或者乱打评论是不会通过的,会影响您评论后获得资源哦~~
提交
头像

昵称

取消
昵称表情

    暂无评论内容