"""One-way analysis of variance (ANOVA) practice script.

Source file: practica/math/statistic/statistic_tests/ANOVA.py

The script walks through three independent examples:

1. Student scores. A researcher recruits 30 students and randomly assigns
   each of them to one of three study methods for the next three weeks.
   Afterwards all students take the same test. A one-way ANOVA checks
   whether the mean score is the same in all three groups.
2. Bike-rental data set (``day.csv``): one ANOVA table per categorical
   column against the rental count ``cnt``.
3. Four treatments (A-D) loaded from a tab-separated file: box plot,
   ``scipy`` F test, ``statsmodels`` ANOVA table and the ``bioinfokit``
   wrapper around it.

Run the file directly to execute all three examples in order.
"""

import pandas as pd
from scipy.stats import f_oneway

# Conventional significance level used for the reject / fail-to-reject call.
ALPHA = 0.05

# 1. Test scores of the three study-method groups (10 students each).
group1 = [85, 86, 88, 75, 78, 94, 98, 79, 71, 80]
group2 = [91, 92, 93, 85, 87, 84, 82, 88, 95, 96]
group3 = [79, 78, 88, 94, 92, 85, 83, 85, 82, 81]

# 2. Location of the bike-rental data set.
# NOTE(review): the original script also did
# os.chdir("D:/Ediwsor_Project - Bike_Rental_Count") and read "day.csv" from
# there; confirm which copy of the file is the intended one.
BIKE_CSV_PATH = 'c:/Users/admin/Documents/prog/hello/practica/DataBase/day.csv'

# Columns of the bike data that are category codes and are therefore cast to
# strings before being used as factors in the model formula.
BIKE_CATEGORICAL_COLUMNS = [
    'holiday',
    'weekday',
    'workingday',
    'weathersit',
    'season',
    'yr',
    'mnth',
]

# 3. Tab-separated table with one column per treatment (A, B, C, D).
TREATMENTS_URL = "https://reneshbedre.github.io/assets/posts/anova/onewayanova.txt"
TREATMENT_COLUMNS = ['A', 'B', 'C', 'D']


def null_hypothesis_verdict(pvalue, alpha=ALPHA):
    """Return the message describing the decision about the null hypothesis.

    The null hypothesis (all group means are equal) is rejected only when
    ``pvalue`` is strictly below ``alpha``.

    Args:
        pvalue: p-value produced by the ANOVA F test.
        alpha: significance level; defaults to ``ALPHA`` (0.05).

    Returns:
        The message text (in Russian, as printed by the script).
    """
    if pvalue < alpha:
        return 'Можем отвергнуть нулевую теорию'
    return 'Не можем отвергнуть нулевую теорию'


def student_scores_anova(alpha=ALPHA):
    """Run the one-way ANOVA on the three student groups and print the result.

    Args:
        alpha: significance level for the printed verdict.

    Returns:
        The ``scipy.stats.f_oneway`` result (``statistic`` and ``pvalue``).
    """
    # Computed once and reused for both the printout and the verdict.
    result = f_oneway(group1, group2, group3)
    print(result)  # statistic = 2.3575 (F test), pvalue = 0.1138
    print(null_hypothesis_verdict(result.pvalue, alpha))
    return result


def bike_rental_anova(csv_path=BIKE_CSV_PATH):
    """Print one ANOVA table per categorical column of the bike-rental data.

    Args:
        csv_path: path to ``day.csv``.
    """
    # Imported here so the other examples run without statsmodels installed.
    import statsmodels.api as sm
    from statsmodels.formula.api import ols

    bike = pd.read_csv(csv_path)
    for column in BIKE_CATEGORICAL_COLUMNS:
        bike[column] = bike[column].astype(str)
    bike['dteday'] = pd.to_datetime(bike['dteday'])
    print(bike.dtypes)

    for column in BIKE_CATEGORICAL_COLUMNS:
        model = ols('cnt' + '~' + column, data=bike).fit()  # ordinary least squares
        print(sm.stats.anova_lm(model))  # ANOVA test


def melt_treatments(df):
    """Reshape the wide treatments table into the long form statsmodels needs.

    Args:
        df: data frame with one column per treatment (A, B, C, D).

    Returns:
        Data frame with the columns ``index``, ``treatments`` and ``value``.
    """
    long_df = pd.melt(
        df.reset_index(),
        id_vars=['index'],
        value_vars=TREATMENT_COLUMNS,
    )
    long_df.columns = ['index', 'treatments', 'value']
    return long_df


def treatments_anova(url=TREATMENTS_URL):
    """Plot the treatments data and print its ANOVA results three ways.

    Args:
        url: location of the tab-separated treatments table.
    """
    # Plotting / modelling packages are only needed by this example.
    import matplotlib.pyplot as plt
    import seaborn as sns
    import statsmodels.api as sm
    from bioinfokit.analys import stat
    from statsmodels.formula.api import ols

    df = pd.read_csv(url, sep="\t")
    df_melt = melt_treatments(df)

    # A box plot shows the distribution per treatment, which makes the
    # differences between treatments easy to spot.
    sns.boxplot(x='treatments', y='value', data=df_melt, color='#99c2a2')
    sns.swarmplot(x="treatments", y="value", data=df_melt, color='#7d0013')
    plt.show()

    # f_oneway takes the groups as input and returns the ANOVA F and p value.
    fvalue, pvalue = f_oneway(df['A'], df['B'], df['C'], df['D'])
    print(fvalue, pvalue)
    # 17.492810457516338 2.639241146210922e-05

    # ANOVA table as R-like output, from an ordinary least squares model.
    model = ols('value ~ C(treatments)', data=df_melt).fit()
    anova_table = sm.stats.anova_lm(model, typ=2)
    print(anova_table)
    # output (ANOVA F and p value)
    #                 sum_sq    df         F    PR(>F)
    # C(treatments)  3010.95   3.0  17.49281  0.000026
    # Residual        918.00  16.0       NaN       NaN

    # ANOVA table using bioinfokit v1.0.3 or later (a wrapper for anova_lm).
    res = stat()
    res.anova_stat(df=df_melt, res_var='value', anova_model='value ~ C(treatments)')
    print(res.anova_summary)
    # output (ANOVA F and p value)
    #                  df   sum_sq   mean_sq         F    PR(>F)
    # C(treatments)   3.0  3010.95  1003.650  17.49281  0.000026
    # Residual       16.0   918.00    57.375       NaN       NaN


def main():
    """Run the three ANOVA examples in their original order."""
    student_scores_anova()
    bike_rental_anova()
    treatments_anova()


if __name__ == "__main__":
    main()