This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import urllib2 as ul2 | |
import urlparse as up | |
import re | |
import sys | |
import webbrowser as wb | |
# Site that is searched for JIS pages; relative links are resolved against it.
jissite = "http://kikakurui.com/"

# Download the front page once at import time and keep it as a list of lines.
_response = ul2.urlopen(jissite)
try:
    home = _response.read().split('\n')
finally:
    # Release the connection even if the read fails.
    _response.close()

# Matches lines holding an anchor whose href ends in index.html.
pat = re.compile(r'<a href=".*index\.html">')

# A real list (not a lazy filter object) so it can be scanned repeatedly.
linktojis = [line for line in home if pat.search(line) is not None]

# Cache of already-downloaded pages: URL -> list of lines.
linkdict = {}
def jis(cha, num):
    """Find the page for 'JIS <cha> <num>' on jissite.

    The front-page links in ``linktojis`` are scanned for an entry of the
    form ``<CHA> dddd-dddd</a>`` whose numeric range contains ``num``. The
    page behind that link is then downloaded (or taken from ``linkdict`` if
    it was fetched before) and searched for a line containing
    ``<CHA><num>``.

    Args:
        cha: Letter part of the standard's designation; upper-cased before use.
        num: Numeric part of the designation, as an int.

    Returns:
        The absolute URL of the matching page, or None when either lookup
        step finds nothing (a message is printed in that case).
    """
    cha = cha.upper()
    # re.escape keeps a stray metacharacter in the input from altering the
    # pattern; the doubled braces survive str.format as a literal {4}.
    rangepat = re.compile(r'{} (\d{{4}}-\d{{4}})</a>'.format(re.escape(cha)))

    path = None
    for p in linktojis:
        s = rangepat.search(p)
        if s is None:
            continue
        low, high = s.group(1).split('-')
        if int(low) <= num <= int(high):
            # The href value is taken as the 4th quote-delimited field.
            path = p.split('"')[3]
            break
    if path is None:
        print("Cannot find pages for JIS {} {}".format(cha, num))
        return None

    url1 = up.urljoin(jissite, path)
    try:
        nextpage = linkdict[url1]
    except KeyError:
        response = ul2.urlopen(url1)
        try:
            nextpage = response.read().split('\n')
        finally:
            # Do not leak the connection if the read fails.
            response.close()
        linkdict[url1] = nextpage

    key = '{}{}'.format(cha, num)
    for i, j in enumerate(nextpage):
        if key not in j:
            continue
        # The title is read from the line after the link; tolerate a match
        # on the last line or a following line without the expected markup.
        try:
            title = nextpage[i + 1].split('>')[1].split('<')[0].decode('utf-8')
        except IndexError:
            title = u''
        print(u"JIS {} {}: {}".format(cha, num, title))
        return up.urljoin(url1, j.split('"')[1])

    # The range matched on the front page but the standard itself is not listed.
    print("Cannot find pages for JIS {} {}".format(cha, num))
    return None
def parsepage(url):
    """Split the HTML document at ``url`` into pages, removing clutter.

    Not implemented yet: this is a placeholder that ignores ``url`` and
    returns None.
    """
    return None
if __name__ == '__main__':
    # Take the letter and number from the command line when both are given;
    # otherwise prompt for them.
    if len(sys.argv) >= 3:
        fig = sys.argv[1]
        numtext = sys.argv[2]
    else:
        fig = raw_input("Character: ")
        numtext = raw_input("Number: ")
    try:
        num = int(numtext)
    except ValueError:
        # Exit with a message and a non-zero status instead of silently.
        sys.exit("Number must be an integer: {!r}".format(numtext))
    url = jis(fig, num)
    # jis() returns None when no page was found; nothing to open then.
    if url is not None:
        wb.open(url)
文字と数字から当該規格のurlを取得。
取り敢えずそのurlを既定のブラウザで開くようにしている。
Usage: python jis.py Z 3321
=>
JIS Z 3321: 溶接用ステンレス鋼溶加棒,ソリッドワイヤ及び鋼帯
http://kikakurui.com/z3/Z3321-2010-01.html
将来的にはhtmlから広告部分等を取り除いてページ毎に分割し、
pdfで出力するようにしたい。
No comments:
Post a Comment