python_math_stat/practica/math/statistic/statistic_tests/ANOVA.py
2023-10-12 21:28:48 +03:00

91 lines
4.0 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# однофакторный дисперсионный анализ
# Исследователь набирает 30 студентов для участия в исследовании. Студентам случайным образом назначают использовать один из трех
# методов обучения в течение следующих трех недель для подготовки к экзамену. По истечении трех недель все студенты сдают одинаковый
# тест.Используйте следующие шаги, чтобы выполнить однофакторный дисперсионный анализ, чтобы определить, одинаковы ли средние баллы
# для всех трех групп.
#1
group1 = [85, 86, 88, 75, 78, 94, 98, 79, 71, 80]
group2 = [91, 92, 93, 85, 87, 84, 82, 88, 95, 96]
group3 = [79, 78, 88, 94, 92, 85, 83, 85, 82, 81]
from scipy.stats import f_oneway
print(f_oneway(group1, group2, group3)) # вывод: statistic = 2.3575 (F тест) pvalue = 0.1138
if (f_oneway(group1, group2, group3))[1] < 0.5:
print ('Не можем отвергнуть нулевую теорию')
else:
print (('Можем отвергнуть нулевую теорию'))
#2 Другая БД
mydata = pd.read_csv('c:/Users/admin/Documents/prog/hello/practica/DataBase/day.csv')
import os
import pandas
#Changing the current working directory
os.chdir("D:/Ediwsor_Project - Bike_Rental_Count")
BIKE = pandas.read_csv("day.csv")
BIKE['holiday']=BIKE['holiday'].astype(str)
BIKE['weekday']=BIKE['weekday'].astype(str)
BIKE['workingday']=BIKE['workingday'].astype(str)
BIKE['weathersit']=BIKE['weathersit'].astype(str)
BIKE['dteday']=pandas.to_datetime(BIKE['dteday'])
BIKE['season']=BIKE['season'].astype(str)
BIKE['yr']=BIKE['yr'].astype(str)
BIKE['mnth']=BIKE['mnth'].astype(str)
print(BIKE.dtypes)
import statsmodels.api as sm
from statsmodels.formula.api import ols
for x in categorical_col:
model = ols('cnt' + '~' + x, data = BIKE).fit() #Oridnary least square method
result_anova = sm.stats.anova_lm(model) # ANOVA Test
print(result_anova)
#3 Другая БД
import pandas as pd
# load data file
df = pd.read_csv("https://reneshbedre.github.io/assets/posts/anova/onewayanova.txt", sep="\t")
# reshape the d dataframe suitable for statsmodels package
df_melt = pd.melt(df.reset_index(), id_vars=['index'], value_vars=['A', 'B', 'C', 'D'])
# replace column names
df_melt.columns = ['index', 'treatments', 'value']
# generate a boxplot to see the data distribution by treatments. Using boxplot, we can
# easily detect the differences between different treatments
import matplotlib.pyplot as plt
import seaborn as sns
ax = sns.boxplot(x='treatments', y='value', data=df_melt, color='#99c2a2')
ax = sns.swarmplot(x="treatments", y="value", data=df_melt, color='#7d0013')
plt.show()
import scipy.stats as stats
# stats f_oneway functions takes the groups as input and returns ANOVA F and p value
fvalue, pvalue = stats.f_oneway(df['A'], df['B'], df['C'], df['D'])
print(fvalue, pvalue)
# 17.492810457516338 2.639241146210922e-05
# get ANOVA table as R like output
import statsmodels.api as sm
from statsmodels.formula.api import ols
# Ordinary Least Squares (OLS) model
model = ols('value ~ C(treatments)', data=df_melt).fit()
anova_table = sm.stats.anova_lm(model, typ=2)
anova_table
# output (ANOVA F and p value)
# sum_sq df F PR(>F)
# C(treatments) 3010.95 3.0 17.49281 0.000026
# Residual 918.00 16.0 NaN NaN
# ANOVA table using bioinfokit v1.0.3 or later (it uses wrapper script for anova_lm)
from bioinfokit.analys import stat
res = stat()
res.anova_stat(df=df_melt, res_var='value', anova_model='value ~ C(treatments)')
res.anova_summary
# output (ANOVA F and p value)
# df sum_sq mean_sq F PR(>F)
# C(treatments) 3.0 3010.95 1003.650 17.49281 0.000026
# Residual 16.0 918.00 57.375 NaN NaN