小代码写了就扔,下次要用时又得花上一个小时从头写,真是头疼。
多少次百度“python+正则表达式”,
都记不清了……
引用牛博士的话说:“各有慧根”。
唉…书呆很呆。
批量从PubMed网站下载文献摘要、MeSH Terms有木有?
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
#coding: utf-8
"""
Function: Download literature based on PubMed ID
Author: Chu Yanshuo
Version: Python 2.7.5
Contact: chu at dustincys.github.io
"""
import sys
import xml.dom.minidom
from urllib import urlopen
import re
def getWebPage(url):
    """Fetch ``url`` and return the raw response body.

    Args:
        url: Address handed to ``urlopen``.

    Returns:
        Whatever ``read()`` on the opened response returns.
    """
    wp = urlopen(url)
    try:
        return wp.read()
    finally:
        # Always release the handle; the previous version never closed it,
        # which leaked one open handle per record during a batch download.
        wp.close()
def urlGenerator(id):
    """Build the PubMed URL that serves record ``id`` as XML wrapped in text.

    Args:
        id: PubMed identifier; it is inserted into the URL via ``str.format``.

    Returns:
        The request URL as a string.
    """
    template = "http://www.ncbi.nlm.nih.gov/pubmed/{}?report=xml&format=text"
    return template.format(id)
def htmlunquote(s):
    """Replace HTML character references in ``s`` with the characters they name.

    Args:
        s: Text that may contain references such as ``&lt;`` or ``&amp;``.

    Returns:
        The text with those references decoded.
    """
    try:
        # Python 3.4+: HTMLParser.unescape() was removed in Python 3.9.
        from html import unescape
    except ImportError:
        # Python 2: keep the original implementation unchanged.
        import HTMLParser
        unescape = HTMLParser.HTMLParser().unescape
    return unescape(s)
def getXMLBody(webpage):
    """Return the unescaped text found inside the first ``<pre>`` element.

    Args:
        webpage: Page content as returned by ``getWebPage``.

    Returns:
        The text between the first ``<pre>`` and the next ``</pre>``, with
        HTML character references decoded by ``htmlunquote``. An empty string
        is returned when the page has no ``<pre>`` tag; when the closing tag
        is missing, everything after ``<pre>`` is used.
    """
    if isinstance(webpage, bytes) and not isinstance(webpage, str):
        # Python 3 bytes: str() would give the "b'...'" repr instead of the
        # text. NOTE(review): assumes the response is UTF-8 -- confirm.
        page = webpage.decode("utf-8")
    else:
        # Python 2 (where bytes is str) and text input: original behaviour.
        page = str(webpage)

    opening = "<pre>"
    start = page.find(opening)
    if start == -1:
        # find() signals a miss with -1; the previous version then silently
        # sliced page[4:-1] and returned unrelated page content.
        return ""
    start += len(opening)

    # Search for the closing tag only after the opening one.
    end = page.find("</pre>", start)
    if end == -1:
        end = len(page)
    return htmlunquote(page[start:end])
def ncbifetch(id):
    """Download the page for PubMed record ``id`` and return its XML body.

    Chains ``urlGenerator``, ``getWebPage`` and ``getXMLBody``.
    """
    record_url = urlGenerator(id)
    raw_page = getWebPage(record_url)
    return getXMLBody(raw_page)
def generateXMLFile(idFilePath,xmlFilePath):
    """Download every PubMed ID listed in a file into its own XML file.

    Each run of digits found on a line of the ID file is treated as one ID.
    The record is written to ``xmlFilePath + <id> + ".xml"``, wrapped in a
    ``<root>`` element after an XML declaration.

    Args:
        idFilePath: Path of the text file that lists the IDs.
        xmlFilePath: Prefix for the output files. It is concatenated as-is,
            so include a trailing path separator to target a directory.
    """
    idPattern = re.compile(r"[0-9]+")
    # "with" closes the ID file; the previous version left it open.
    with open(idFilePath) as idFile:
        for line in idFile:
            # Echo the raw line as progress output; the parenthesised form
            # prints the same text on Python 2 and Python 3.
            print(line)
            # A blank line simply yields no IDs, so no special case is needed.
            for idItem in idPattern.findall(line):
                # Fetch before opening the output so that a failed download
                # does not leave a truncated file behind.
                xmlBody = ncbifetch(idItem)
                saveXmlFile = xmlFilePath + idItem + ".xml"
                with open(saveXmlFile, 'w') as xmlFile:
                    xmlFile.write('<?xml version="1.0" encoding="utf-8"?>\n<root>')
                    xmlFile.write(xmlBody)
                    xmlFile.write("</root>")
def main():
    """Run the batch download using paths taken from the command line.

    Expects exactly two arguments: the ID list file and the output path
    prefix, which are passed to ``generateXMLFile``.

    Returns:
        0 after the download ran, 2 when the argument count is wrong.
    """
    # The previous body was only a commented-out call with undefined names,
    # which left the function without any statement (a syntax error).
    if len(sys.argv) != 3:
        sys.stderr.write("usage: %s ID_FILE XML_PATH_PREFIX\n" % sys.argv[0])
        return 2
    generateXMLFile(sys.argv[1], sys.argv[2])
    return 0
if __name__ == '__main__':
    if sys.version_info[0] < 3:
        # Python 2 only: make UTF-8 the implicit str/unicode codec.
        # reload() restores sys.setdefaultencoding, which site.py deletes at
        # startup. Neither name exists on Python 3, so the hack is gated.
        reload(sys)
        sys.setdefaultencoding('utf-8')
    # Propagate main()'s return value (None counts as success) as exit status.
    sys.exit(main())