import pandas as pd
import numpy as np
# Find the '지역코드' (region code) whose total loan amount differs the most
# between rows with '성별' == 0 and rows with '성별' == 1.
df = pd.read_csv('/kaggle/input/bigdata-csvfile/loan_data.csv')

# Total loan per row is the sum of the two loan columns.
df['총대출액'] = df['대출A'] + df['대출B']

# Split the rows by the '성별' flag.
gender_0 = df[df['성별'] == 0]
gender_1 = df[df['성별'] == 1]

# Per-region totals for each group.
cond0 = gender_0.groupby('지역코드')['총대출액'].sum()
cond1 = gender_1.groupby('지역코드')['총대출액'].sum()

# A plain `cond0 - cond1` yields NaN for a region that appears in only one of
# the two groups, and idxmax() would then skip that region. Treat the missing
# side as 0 so the gap for such a region is the existing side's total.
result = cond0.sub(cond1, fill_value=0).abs()

# Region code with the largest absolute gap.
# (The original run on this dataset reported 1000009974.)
print(result.idxmax())
import pandas as pd
import numpy as np
# Alternative take on the same question: build a region x '성별' table of
# total loan amounts and rank the regions by the absolute gap between the
# two '성별' columns (0 and 1).
loans = pd.read_csv('/kaggle/input/bigdata-csvfile/loan_data.csv')
loans['총대출액'] = loans['대출A'] + loans['대출B']

# One total per (region, 성별) pair, then 성별 values become columns.
totals_by_region_gender = loans.groupby(['지역코드', '성별'])['총대출액'].sum()
df = totals_by_region_gender.unstack()

# Absolute gap between the two columns; largest gap first.
df['금액차이'] = (df[0] - df[1]).abs()
df = df.sort_values(by='금액차이', ascending=False)

# The first row of the printed table is the answer
# (the original run on this dataset reported 1000009974).
print(df)
import pandas as pd
import numpy as np
# For every year, find the crime column with the highest clearance rate
# (검거건수 / 발생건수) and add up that crime's 검거건수 across all years.
df = pd.read_csv('/kaggle/input/bigdata-csvfile/crime_data_2014_2020.csv')

# Per-year tables for each row kind. The '구분' label column is dropped
# before aggregating so only the remaining columns are summed.
발생건수 = df[df['구분'] == '발생건수'].drop(columns='구분').groupby('년도').sum()
검거건수 = df[df['구분'] == '검거건수'].drop(columns='구분').groupby('년도').sum()

# Element-wise rate (aligned on year and crime column), then the name of the
# column with the highest rate for each year.
검거율 = 검거건수 / 발생건수
검거율_max = 검거율.idxmax(axis=1)

# Accumulate the 검거건수 of each year's top-rate crime.
# (The loop body must be indented; without it the script does not parse.)
result = 0
for year in 검거건수.index:
    crime = 검거율_max[year]
    result += 검거건수.loc[year, crime]
print(result)
import pandas as pd
import numpy as np
# For each year, compute every crime's clearance rate (검거건수 divided by
# 발생건수), pick the crime with the highest rate, and sum that crime's
# 검거건수 over all years.
df = pd.read_csv('/kaggle/input/bigdata-csvfile/crime_data_2014_2020.csv')

검거건수 = df[df['구분'] == '검거건수']
발생건수 = df[df['구분'] == '발생건수']

# Drop the '구분' label column *before* aggregating so that sum() is not
# applied to a string column.
검거건수 = 검거건수.drop(columns='구분').groupby('년도').sum()
발생건수 = 발생건수.drop(columns='구분').groupby('년도').sum()

# Per-year name of the crime column with the highest rate.
# An earlier run on this dataset produced:
#   2014 범죄28, 2015 범죄24, 2016 범죄29, 2017 범죄10,
#   2018 범죄19, 2019 범죄19, 2020 범죄4
rate = (검거건수 / 발생건수)
rate = rate.idxmax(axis=1)

# Look the counts up in the 검거건수 table using the computed mapping instead
# of a hand-copied year -> crime list. The previous version took
# df.loc[year_mask, crime].max() over *all* rows of the year, which returns
# the 발생건수 value whenever it is larger than the 검거건수 value.
total = 0
for year in 검거건수.index:
    total += 검거건수.loc[year, rate[year]]
print(total)
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')  # silence warning messages

employees = pd.read_csv('/kaggle/input/bigdata-csvfile/employee_data_with_grades.csv')
df = employees

# 1. Missing-value handling
# '연봉' (salary): fill with the overall column mean.
salary_mean = df['연봉'].mean()
df['연봉'] = df['연봉'].fillna(salary_mean)

# '근속년수' (years of service): fill with the mean of the rows that share the
# same '부서명' and '근무평가등급'.
tenure_by_group = df.groupby(['부서명', '근무평가등급'])['근속년수']
df['근속년수'] = tenure_by_group.transform(lambda s: s.fillna(s.mean()))

# 2. Ratio features
df['연봉_근속'] = df['연봉'] / df['근속년수']
df['연봉_고객'] = df['연봉'] / df['고객만족점수']

# a: '근속년수' of the row ranked 3rd by '연봉_근속' (descending).
ranked_by_tenure_ratio = df.sort_values(by='연봉_근속', ascending=False)
a = ranked_by_tenure_ratio['근속년수'].iloc[2]

# b: '교육참가시간수' of the row ranked 2nd by '연봉_고객' (descending).
ranked_by_customer_ratio = df.sort_values(by='연봉_고객', ascending=False)
b = ranked_by_customer_ratio['교육참가시간수'].iloc[1]

# Report the sum truncated to an integer.
print(int(a + b))
import pandas as pd
import numpy as np
# OLS regression on house prices: fit on the first 800 rows, report the
# Pearson correlation between actual and fitted prices on the training rows,
# then the RMSE on the remaining rows.
df = pd.read_csv('/kaggle/input/bigdata-csvfile/real_estate_data.csv')
train = df.iloc[:800]
test = df.iloc[800:]
y = train['주택가격_만원']

# Full model, used to identify the significant independent variables.
from statsmodels.formula.api import ols
formula = ('주택가격_만원 ~ 주택면적_제곱미터 + 건축년도 + 방_수 + 화장실_수 + 층수')
model = ols(formula, data=train).fit()
# print(model.pvalues < 0.05)  # 2 significant: 주택면적_제곱미터, 건축년도

# Reduced model with only the two variables noted above.
formula2 = ('주택가격_만원 ~ 주택면적_제곱미터 + 건축년도')
model2 = ols(formula2, data=train).fit()
pred = model2.predict(train)

# Pearson correlation coefficient between actual and fitted training prices.
from scipy import stats
st, pv = stats.pearsonr(y, pred)
print(round(st, 3))

# RMSE on the held-out rows. `mean_squared_error(..., squared=False)` is
# deprecated in scikit-learn 1.4 and removed in 1.6, so take the square root
# of the MSE explicitly; this works on every scikit-learn version.
from sklearn.metrics import mean_squared_error
test_y = test['주택가격_만원']
pred_test = model2.predict(test)
rmse = np.sqrt(mean_squared_error(test_y, pred_test))
print(round(rmse, 4))
import pandas as pd
import numpy as np
from scipy import stats
from statsmodels.formula.api import logit
# Logistic regression of '대출승인여부' on five predictors, followed by three
# reported quantities.
df = pd.read_csv('/kaggle/input/bigdata-csvfile/loan_approval_data.csv')

formula = '대출승인여부 ~ 나이 + 소득_만원 + 근무시간_주당 + 자녀수 + 신용등급'
model = logit(formula, data=df).fit()

# 1. p-value of the '근무시간_주당' coefficient, rounded to 3 decimals.
hours_pvalue = model.pvalues['근무시간_주당']
print(round(hours_pvalue, 3))

# 2. Odds ratio for '자녀수': exp of its fitted coefficient.
children_coef = model.params['자녀수']
print(np.exp(children_coef))

# 3. Number of rows whose predicted probability exceeds 0.3.
pred = model.predict(df)
above_threshold = pred > 0.3
print(above_threshold.sum())