python 数据分析案例(一)

mac2024-05-21  29

源数据 https://github.com/chrisrui/pydata-book

import json

import pandas as pd
import matplotlib.pyplot as plt

# Load the JSON database from disk. The context manager closes the file
# handle deterministically instead of leaving it to the garbage collector.
# JSON text is UTF-8 by specification, so the encoding is stated explicitly
# rather than inherited from the platform locale.
with open('/home/baba/database.json', encoding='utf-8') as fh:
    db = json.load(fh)

# Notebook-style inspection: number of top-level entries, then the raw data.
len(db)
db

 

'group': 'Composition'}, {'value': 4.74, 'units': 'g', 'description': 'Carbohydrate, by difference', 'group': 'Composition'}, {'value': 0.8, 'units': 'g', 'description': 'Ash', 'group': 'Other'}, {'value': 63.0, 'units': 'kcal', 'description': 'Energy', 'group': 'Energy'}, {'value': 87.67, 'units': 'g', 'description': 'Water', 'group': 'Composition'}, {'value': 264.0, 'units': 'kJ', 'description': 'Energy', 'group': 'Energy'}, {'value': 0.0, 'units': 'g',

 

 

# Inspect the structure of the first record and of its first nutrient entry.
db[0].keys()
db[0]['nutrients'][0]

# Tabulate the first record's nutrient entries and preview the first 7 rows.
nutrients = pd.DataFrame(db[0]['nutrients'])
nutrients[:7]
# Output captured in the original post (it appears to belong to
# `nutrients[:7]`; it had been pasted into the middle of the next slice):
#      value units                  description        group
# 0    25.18     g                      Protein  Composition
# 1    29.20     g            Total lipid (fat)  Composition
# 2     3.06     g  Carbohydrate, by difference  Composition
# 3     3.28     g                          Ash        Other
# 4   376.00  kcal                       Energy       Energy
# 5    39.28     g                        Water  Composition
# 6  1573.00    kJ                       Energy       Energy

# Build a per-record table restricted to the descriptive fields only.
info_keys = ['description', 'group', 'id', 'manufacturer']
info_db = pd.DataFrame(db, columns=info_keys)
info_db[:5]
info_db.info()

 

# Ten most frequent values of the `group` column. `value_counts()` sorts by
# descending count by default, so the slice keeps the largest ten.
# The Series method is used because the top-level `pd.value_counts`
# function is deprecated in recent pandas releases.
info_db['group'].value_counts()[:10]

 

# Collect one nutrient table per record, tagging each row with the id of
# the record it came from, then stack them into a single long DataFrame.
nutrient_frames = []
for rec in db:
    fnuts = pd.DataFrame(rec['nutrients'])
    fnuts['id'] = rec['id']
    nutrient_frames.append(fnuts)
nutrients_list = pd.concat(nutrient_frames, ignore_index=True)
nutrients_list

# Count fully duplicated rows, then discard them.
nutrients_list.duplicated().sum()
nutrients_list = nutrients_list.drop_duplicates()

# Both tables have `description` and `group` columns; rename them so the
# merged table can tell the record-level columns from the nutrient-level ones.
col1 = {'description': 'food',
        'group': 'fgroup'}
info_db = info_db.rename(columns=col1)
info_db.info()

co2 = {'description': 'nutrient',
       'group': 'nutgroup'}
nutrients_list = nutrients_list.rename(columns=co2)
nutrients_list

# Join nutrient rows with the descriptive fields of their record.
ndata = pd.merge(nutrients_list, info_db, on='id', how='outer')
ndata.info()
ndata.iloc[30000]

# Median value per (nutrient, fgroup); plot the 'Zinc, Zn' medians as a
# horizontal bar chart, sorted ascending.
fig = plt.figure()
result = ndata.groupby(['nutrient', 'fgroup'])['value'].quantile(0.5)
result['Zinc, Zn'].sort_values().plot(kind='barh')

# NOTE: the misspelled name `by_nutrietn` is kept as-is in case other code
# refers to it.
by_nutrietn = ndata.groupby(['nutgroup', 'nutrient'])


def get_maximum(x):
    """Return the row of group *x* whose `value` is largest."""
    return x.loc[x.value.idxmax()]


def get_minimum(x):
    """Return the row of group *x* whose `value` is smallest."""
    return x.loc[x.value.idxmin()]


max_foods = by_nutrietn.apply(get_maximum)[['value', 'food']]

# Truncate the food names to 50 characters so the table stays readable.
# (The original assigned a slice of the whole DataFrame to the column.)
max_foods['food'] = max_foods['food'].str[:50]
max_foods.loc['Amino Acids']['food']

 

最新回复(0)