全球主要大学数据格式化工具
主要应用于地理位置选择界面,网上有个19兆的包含23万条数据的txt学校地址及信息数据库,以下这个Python脚本可以帮你把这个庞大的txt数据库通过分类格式化为较小的文件,提高索引效率。
# -*- coding: utf-8 -*-
import sys,os,codecs
from urllib import quote
# Python 2 only: make UTF-8 the interpreter-wide default encoding so that the
# byte strings read from allschool.txt can be passed to the UTF-8 codecs
# writer used in main() without an implicit ASCII decode error.
reload(sys)
sys.setdefaultencoding("utf-8")


def _ensure_dir(*parts):
    """Join *parts* into one path, create that directory if absent, return it."""
    path = os.path.join(*parts)
    if not os.path.exists(path):
        os.mkdir(path)
    return path


# Output tree: ./edu/schoollist/{campus,middleschool,primaryschool}
edudatas = _ensure_dir(os.getcwd(), "edu")
schoollist = _ensure_dir(edudatas, "schoollist")
campus = _ensure_dir(schoollist, "campus")
middleschool = _ensure_dir(schoollist, "middleschool")
primaryschool = _ensure_dir(schoollist, "primaryschool")

# Total number of input lines; main() uses it as the progress denominator.
# The file is opened in a `with` block so the handle is closed right away.
with open("allschool.txt") as _source:
    line_ttl = sum(1 for _ in _source)
# Number of input lines main() has consumed so far.
line_count = 0
def main():
    """Split allschool.txt into small per-region school list files.

    Every input line is one tab-separated record with this column layout:

        0  school id
        1  first letter of the school name
        2  school name
        3  school type: 1 primary school, 2 middle school,
           3 university / technical secondary school
        4  university sub-type: 0 junior college, 1 regular undergraduate,
           2 key undergraduate
        5  country code        6  country name
        7  province code       8  province name
        9  city code           10 city name
        11 county/town code    12 county/town name

    Each record is appended as ``name:id:LETTER,`` to the file
    ``<country>_<province>_<city>.list`` (name parts are URL-quoted and parts
    equal to "null" are left out) inside the directory that matches its
    school type. Records with any other school type are ignored, as are
    lines that are too short or whose type column is not an integer.

    A running percentage is written to stdout, and the module-level
    ``line_count`` is advanced once per input line.
    """
    global line_count
    type_dirs = {1: primaryschool, 2: middleschool, 3: campus}
    # Records are grouped per output file and written once at the end instead
    # of reopening the file for every single input line.
    pending = {}
    with open("allschool.txt", "r") as source:
        for line in source:
            line_count += 1
            percentage = 100 * float(line_count) / line_ttl
            sys.stdout.write("%%%2.2f" % percentage + "\r")
            # Drop the line terminator so it cannot end up in the last field.
            row = line.rstrip("\r\n").split("\t")
            # Columns 0..10 are needed below; skip blank or truncated lines
            # rather than aborting the whole run with an IndexError.
            if len(row) < 11:
                continue
            try:
                school_type = int(row[3])
            except ValueError:
                continue
            target_dir = type_dirs.get(school_type)
            if target_dir is None:
                continue
            school_id = row[0]
            school_name_start_letter = row[1]
            school_name = row[2]
            country_name = row[6]
            province_name = row[8]
            city_name = row[10]
            region_parts = [
                quote(name)
                for name in (country_name, province_name, city_name)
                if name != "null"
            ]
            file_path = os.path.join(target_dir, "_".join(region_parts) + ".list")
            record = ":".join(
                (school_name, school_id, school_name_start_letter.upper())
            ) + ","
            pending.setdefault(file_path, []).append(record)
    for file_path, records in pending.items():
        # Append mode is kept, so existing list files are extended.
        with codecs.open(file_path, "a+", "utf-8") as filehandler:
            filehandler.write("".join(records))
# Run the conversion only when this file is executed as a script.
if __name__ == "__main__":
    main()
标签: python
全球主要大学数据格式化工具
http://hellohtml5.blogspot.com/2011/03/school-list-all-over-world-generator.html
没有评论:
发表评论