目前,我正在处理从 Crossref API 检索到的一个大型数据集,其中包含我根据 DOI 搜索得到的科学论文信息。
这个大列表目前包含大约3500个元素。每个元素本身又是一个列表,由元数据“meta”、实际相关的数据“data”和一个不相关的列表“facets”组成。
这是基于两个DOI的两个列表的一个例子:
list(`10.1158/1055-9965.EPI-08-0303` = list(meta = NULL, data = structure(list(
alternative.id = "10.1158/1055-9965.EPI-08-0303", container.title = "Cancer Epidemiology Biomarkers & Prevention",
created = "2008-11-06", deposited = "2020-12-24", published.print = "2008-11",
published.online = "2008-11-06", doi = "10.1158/1055-9965.epi-08-0303",
indexed = "2021-10-17", issn = "1055-9965,1538-7755", issue = "11",
issued = "2008-11", member = "1086", page = "3216-3223",
prefix = "10.1158", publisher = "American Association for Cancer Research (AACR)",
score = "1", source = "Crossref", reference.count = "31",
references.count = "31", is.referenced.by.count = "50", subject = "Oncology,Epidemiology",
title = "20 Years into the Gambia Hepatitis Intervention Study: Assessment of Initial Hypotheses and Prospects for Evaluation of Protective Effectiveness Against Liver Cancer",
type = "journal-article", url = "http://dx.doi.org/10.1158/1055-9965.epi-08-0303",
volume = "17", language = "en", short.container.title = "Cancer Epidemiol Biomarkers Prev",
author = list(structure(list(given = c("Simonetta", "Patrizia",
"Ebrima", "Andrew J.", "Gregory D.", "Maimuna", "Ruggero",
"Amelie", "Omar", "Marianne", "Hilton", "Pierre"), family = c("Viviani",
"Carrieri", "Bah", "Hall", "Kirk", "Mendy", "Montesano",
"Plymoth", "Sam", "Van der Sande", "Whittle", "Hainaut"),
sequence = c("first", "additional", "additional", "additional",
"additional", "additional", "additional", "additional",
"additional", "additional", "additional", "additional"
)), class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA,
-12L))), link = list(structure(list(URL = "https://syndication.highwire.org/content/doi/10.1158/1055-9965.EPI-08-0303",
content.type = "unspecified", content.version = "vor",
intended.application = "similarity-checking"), class = c("tbl_df",
"tbl", "data.frame"), row.names = c(NA, -1L)))), class = c("tbl_df",
"tbl", "data.frame"), row.names = c(NA, -1L)), facets = NULL),
`10.1016/j.canlet.2007.10.044` = list(meta = NULL, data = structure(list(
alternative.id = "S0304383507005253", container.title = "Cancer Letters",
created = "2008-01-14", deposited = "2019-01-01", published.print = "2008-03",
doi = "10.1016/j.canlet.2007.10.044", indexed = "2021-10-07",
issn = "0304-3835", issue = "1", issued = "2008-03",
member = "78", page = "21-25", prefix = "10.1016", publisher = "Elsevier BV",
score = "1", source = "Crossref", reference.count = "20",
references.count = "20", is.referenced.by.count = "71",
subject = "Cancer Research,Oncology", title = "Detection of R337H, a germline TP53 mutation predisposing to multiple cancers, in asymptomatic women participating in a breast cancer screening program in Southern Brazil",
type = "journal-article", url = "http://dx.doi.org/10.1016/j.canlet.2007.10.044",
volume = "261", language = "en", short.container.title = "Cancer Letters",
author = list(structure(list(given = c("Edenir Inêz",
"Lavínia", "Maira", "Maria Isabel Waddington", "Magali",
"Ghyslaine", "Virginie", "Ernestina", "Juliana", "Ingrid Petroni",
"Roberto", "Pierre", "Patricia"), family = c("Palmero",
"Schüler-Faccini", "Caleffi", "Achatz", "Olivier", "Martel-Planche",
"Marcel", "Aguiar", "Giacomazzi", "Ewald", "Giugliani",
"Hainaut", "Ashton-Prolla"), sequence = c("first", "additional",
"additional", "additional", "additional", "additional",
"additional", "additional", "additional", "additional",
"additional", "additional", "additional")), class = c("tbl_df",
"tbl", "data.frame"), row.names = c(NA, -13L))), link = list(
structure(list(URL = c("https://api.elsevier.com/content/article/PII:S0304383507005253?httpAccept=text/xml",
"https://api.elsevier.com/content/article/PII:S0304383507005253?httpAccept=text/plain"
), content.type = c("text/xml", "text/plain"), content.version = c("vor",
"vor"), intended.application = c("text-mining", "text-mining"
)), class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA,
-2L))), license = list(structure(list(date = "2008-03-01",
content.version = "tdm", delay.in.days = 0L, URL = "https://www.elsevier.com/tdm/userlicense/1.0/"), class = c("tbl_df",
"tbl", "data.frame"), row.names = c(NA, -1L))), reference = list(
structure(list(key = c("10.1016/j.canlet.2007.10.044_bib1",
"10.1016/j.canlet.2007.10.044_bib2", "10.1016/j.canlet.2007.10.044_bib3",
"10.1016/j.canlet.2007.10.044_bib4", "10.1016/j.canlet.2007.10.044_bib5",
"10.1016/j.canlet.2007.10.044_bib6", "10.1016/j.canlet.2007.10.044_bib7",
"10.1016/j.canlet.2007.10.044_bib8", "10.1016/j.canlet.2007.10.044_bib9",
"10.1016/j.canlet.2007.10.044_bib10", "10.1016/j.canlet.2007.10.044_bib11",
"10.1016/j.canlet.2007.10.044_bib12", "10.1016/j.canlet.2007.10.044_bib13",
"10.1016/j.canlet.2007.10.044_bib14", "10.1016/j.canlet.2007.10.044_bib15",
"10.1016/j.canlet.2007.10.044_bib16", "10.1016/j.canlet.2007.10.044_bib17",
"10.1016/j.canlet.2007.10.044_bib18", "10.1016/j.canlet.2007.10.044_bib19",
"10.1016/j.canlet.2007.10.044_bib20"), doi.asserted.by = c("crossref",
"crossref", NA, NA, NA, "crossref", NA, NA, "crossref",
"crossref", "crossref", NA, NA, NA, "crossref", "crossref",
"crossref", "crossref", "crossref", NA), first.page = c("96",
"266", "1298", "877s", "1932", "12", "10", "608",
"2658", "1954", "133", "1703", "1365", "5358", "1215",
"607", "647", "607", "9330", "1213"), DOI = c("10.1016/j.canlet.2005.12.039",
"10.1159/000154228", NA, NA, NA, "10.1038/nsb730",
NA, NA, "10.1002/1097-0142(19901215)66:12<2658::AID-CNCR2820661232>3.0.CO;2-C",
"10.1038/sj.onc.1207305", "10.1016/0165-4608(93)90166-J",
NA, NA, NA, "10.1093/nar/16.3.1215", "10.1002/humu.10081",
"10.1590/S0004-27302004000500009", "10.1007/BF00202835",
"10.1073/pnas.161479898", NA), article.title = c("The TP53 mutation, R337H, is associated with Li–Fraumeni and Li–Fraumeni-like syndromes in Brazilian families",
"Is p53 polymorphism maintained by natural selection?",
"Prevalence and diversity of constitutional mutations in the p53 gene among 21 Li–Fraumeni families",
"Breast cancer screening in 10,000 women of an underserved population in South Brazil: The NMAMAPOA cohort",
"P53 germline mutations in childhood cancers and cancer risk for carrier individuals",
"A novel mechanism of tumorigenesis involving pH-dependent destabilization of a mutant p53 tetramer",
"Germline mutations in the TP53 gene", "Germ-line p53 mutations in 15 families with Li–Fraumeni syndrome",
"Choroid plexus tumors in the breast cancer-sarcoma syndrome",
"A TP53 polymorphism is associated with increased risk of colorectal cancer and with reduced levels of TP53 mRNA",
"Wilms’ tumor in the Li–Fraumeni cancer family syndrome",
"Simple sequence repeat polymorphism within the p53 gene",
"Rhabdomyosarcoma in children: epidemiologic study and identification of a familial cancer syndrome",
"A cancer family syndrome in twenty-four kindreds",
"A simple salting out procedure for extracting DNA from human nucleated cells",
"The IARC TP53 database: new online mutation analysis and recommendations to users",
"Founder effect for the highly prevalent R337H mutation of tumor suppressor p53 in Brazilian patients with adrenocortical tumors",
"Identification of a polymorphism in intron 2 of the p53 gene",
"An inherited p53 mutation that contributes in a tissue-specific manner to pediatric adrenal cortical carcinoma",
"Cancer in survivors of childhood soft tissue sarcoma and their relatives"
), volume = c("245", "44", "54", "23", "82", "9",
"25", "56", "66", "23", "67", "8", "43", "48", "16",
"19", "48", "93", "98", "79"), author = c("Achatz",
"Beckman", "Birch", "Caleffi", "Chompret", "DiGiammarino",
"Eeles", "Frebourg", "Garber", "Gemignani", "Hartley",
"Lazar", "Li", "Li", "Miller", "Olivier", "Pinto",
"Pleasants", "Ribeiro", "Strong"), year = c("2007",
"1994", "1994", "2005", "2000", "2002", "1995", "1995",
"1990", "2004", "1993", "1993", "1969", "1988", "1988",
"2002", "2004", "1994", "2001", "1987"), journal.title = c("Cancer Lett.",
"Hum. Hered.", "Cancer Res.", "J. Clin. Oncol.",
"Br. J. Cancer", "Nat. Struct. Biol.", "Cancer Surv.",
"Am. J. Hum. Genet.", "Cancer", "Oncogene", "Cancer Genet. Cytogenet.",
"Oncogene", "J. Natl. Cancer Inst.", "Cancer Res.",
"Nucleic Acids Res.", "Hum. Mutat.", "Arq. Bras. Endocrinol. Metabol.",
"Hum. Genet.", "Proc. Natl. Acad. Sci. USA", "J. Natl. Cancer Inst."
), issue = c(NA, "5", "5", "16 S", "12", "1", NA,
"3", "12", "10", "2", "6", "6", "18", "3", "6", "5",
"5", "16", "6")), class = c("tbl_df", "tbl", "data.frame"
), row.names = c(NA, -20L)))), class = c("tbl_df",
"tbl", "data.frame"), row.names = c(NA, -1L)), facets = NULL))“data”下的所有数据都与我相关,我想取消这些数据,用DOI在一列中构造一个大型数据,在“data”下构造另一列下的数据。
我尝试过直接调用 unnest(data),但这会导致错误:Error in UseMethod("unnest") : no applicable method for 'unnest' applied to an object of class "list"
有什么简单的方法吗?
发布于 2021-10-25 16:20:49
是像这样吗?注意——最好提供一个带有玩具数据集的最小可复现示例(reprex),而不是贴出您手头全部数据的快照。这样,问题就能更快地得到答案。
# Toy input: a named list keyed by DOI. Each element is itself a list whose
# second entry (`data`) is a one-row tibble, mimicking the nested structure
# described in the question.
# `tibble::` is spelled out on every call so the example runs even when
# the tibble package has not been attached with `library(tibble)`.
ll <- list(
  `10.1016/j.ejca.2017.11.029` = list(
    metadata = NULL,
    data = tibble::tibble(one = 1, two = 2)
  ),
  `10.1016/j.ejca.2017.12.500` = list(
    metadata = NULL,
    data = tibble::tibble(one = 3, two = 4)
  )
)

# The DOIs are carried by the names of the outer list.
nms <- names(ll)
# Extract the second element of every entry; `lapply()` keeps the names,
# so the result is a named list of tibbles.
vals <- lapply(ll, `[[`, 2)
# One row per DOI, with the per-DOI tibble stored in a list-column.
tibble::tibble(
  DOI = nms,
  data = vals
)

# or shorten to
tibble::tibble(
  DOI = names(ll),
  data = lapply(ll, `[[`, 2)
)
# A tibble: 2 x 2
DOI data
<chr> <named list>
1 10.1016/j.ejca.2017.11.029 <tibble [1 x 2]>
2 10.1016/j.ejca.2017.12.500 <tibble [1 x 2]>
在这里,外层列表的名称(DOI)本身就携带信息,而相关的数据是每个元素的第二个列表元素。lapply(ll, `[[`, 2) 相当于 list(ll[[1]][[2]], ll[[2]][[2]], ...),并且会保留原有的名称。
https://stackoverflow.com/questions/69711158
复制相似问题