您现在的位置是：首页 > 文章详情

python数据清洗excel

日期：2018-09-10点击：526收藏

用 Python 清洗 Excel 里的数据其实很简单，
这里举一个例子来说明。
这是原始数据，这里要处理的是地区和薪水两个字段。

清洗前数据

import codecs
import re

# Matches an integer or decimal number such as "8", "10.5" or ".5".
_NUMBER_RE = re.compile(r'\d*\.?\d+')


def _to_thousands_per_month(amount, salary):
    """Convert one parsed amount to thousands per month.

    The unit is detected from the characters present in *salary*, the full
    salary text the amount was extracted from.
    """
    value = float(amount)
    if u'万' in salary and u'年' in salary:
        # ten-thousands per year -> thousands per month
        return value / 12 * 10
    if u'万' in salary and u'月' in salary:
        # ten-thousands per month -> thousands per month
        return value * 10
    if u'元' in salary and u'天' in salary:
        # yuan per day -> thousands per month, counting 21 working days
        return value / 1000 * 21
    # Any other unit is returned as-is.
    return value


def get_salary(salary):
    """Extract the salary bounds from *salary*, normalised to thousands/month.

    Handles ranges such as "1-2万/月" or "10-20万/年" as well as single
    values such as "20万以上/年" or "100元/天".

    Args:
        salary: salary description text.

    Returns:
        A ``(low_salary, high_salary)`` tuple. Each element is a float, or
        the empty string when that bound is not present in the text (for
        example the upper bound of a single-value salary, or both bounds
        when the text contains no number at all).
    """
    numbers = _NUMBER_RE.findall(salary)
    if not numbers:
        return "", ""

    low_salary = _to_thousands_per_month(numbers[0], salary)
    # Only a range written with '-' carries an upper bound.
    if '-' in salary and len(numbers) > 1:
        high_salary = _to_thousands_per_month(numbers[1], salary)
    else:
        high_salary = ""
    return low_salary, high_salary


def open_xlsx(file):
    """Open the workbook *file* and return its '51' sheet and row count.

    Args:
        file: path of the Excel workbook.

    Returns:
        A ``(sheet, nrows)`` tuple.
    """
    # Imported here so the salary parsing above can be used without xlrd.
    # NOTE(review): xlrd 2.0+ only reads .xls files; reading an .xlsx
    # workbook needs xlrd < 2.0 -- confirm the installed version.
    import xlrd

    data = xlrd.open_workbook(file)
    table0 = data.sheet_by_name('51')
    return table0, table0.nrows


def main():
    """Clean '512.xlsx' and append tab-separated rows to '51jobanaly.xls'."""
    table, nrows = open_xlsx('512.xlsx')
    print('一共有{}行数据，开始清洗数据'.format(nrows))

    # Open the output once for the whole run. No encoding is passed, so the
    # platform default text encoding is used.
    with codecs.open('51jobanaly.xls', 'a+') as f:
        # Row 0 is skipped.
        for i in range(1, nrows):
            row = table.row_values(i)
            job, company, companytype = row[0], row[1], row[2]
            area = row[3][:2]  # keep only the first two characters (the city)
            experience, degree, salary = row[4], row[5], row[6]

            if salary:
                low_salary, high_salary = get_salary(salary)
            else:
                low_salary = high_salary = ""

            print('正在写入第{}条，最低工资是{}k,最高工资是{}k'.format(i, low_salary, high_salary))
            output = ('{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n').format(
                job, company, companytype, area, experience, degree,
                low_salary, high_salary)
            f.write(output)


if __name__ == '__main__':
    main()

主要是把薪资统一换算成以“千/月”为单位，地区只保留城市名（取前两个字）。
处理后的数据：