{ "nbformat": 4, "nbformat_minor": 0, "metadata": { "kernelspec": { "name": "python3", "display_name": "Python 3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.3" }, "colab": { "name": "Process_for_structure_study_pm.ipynb", "provenance": [], "collapsed_sections": [], "machine_shape": "hm", "include_colab_link": true }, "accelerator": "GPU" }, "cells": [ { "cell_type": "markdown", "metadata": { "id": "view-in-github", "colab_type": "text" }, "source": [ "\"Open" ] }, { "cell_type": "markdown", "metadata": { "id": "ogkc-BrPCjKK", "colab_type": "text" }, "source": [ "# Études *Pattern Mining*" ] }, { "cell_type": "markdown", "metadata": { "id": "YkwFVjTICjKO", "colab_type": "text" }, "source": [ "### Imports" ] }, { "cell_type": "code", "metadata": { "id": "tQ0Qo-WSCjKT", "colab_type": "code", "colab": {} }, "source": [ "import pandas as pd \n", "import numpy as np\n", "import os" ], "execution_count": 0, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "CUPSnJ-5lY0Y", "colab_type": "code", "colab": {} }, "source": [ "def group_age(ages):\n", " \"\"\"\n", " Groupe les âges par année et mois\n", " \"\"\"\n", " ages_group = []\n", " for i in range(len(ages)) :\n", " mois = int(ages[i][2:4])\n", " jours = int(ages[i][-2:])\n", " if jours < 15 :\n", " ages_group.append(ages[i][:4])\n", " else:\n", " mois += 1\n", " if mois < 10 :\n", " ages_group.append(ages[i][:2]+'0'+str(mois))\n", " else:\n", " ages_group.append(ages[i][:2]+str(mois))\n", " return ages_group\n", "#######################\n", "def group_an_age(ages):\n", " \"\"\"\n", " Groupe les âges par demi-année\n", " \"\"\"\n", " ages_group = []\n", " for i in range(len(ages)) :\n", " an = int(ages[i][0])\n", " mois = int(ages[i][2:4])\n", " if mois < 6 :\n", " ages_group.append(str(an)+'_00')\n", " else:\n", " ages_group.append(str(an)+'_06')\n", "\n", " return ages_group\n", "#######################\n", "def print_age(age):\n", " \"\"\"\n", " Permet de rendre l'âge plus facilement lisible dans les visus\n", " \"\"\"\n", " age_print = ''\n", " an = int(age[0])\n", " mois = int(age[2:4])\n", " if len(age) == 7:\n", " jours = int(age[-2:])\n", " elif len(age) == 4:\n", " jours = 0\n", " \n", " if an == 1:\n", " age_print +=str(an)+'an '\n", " elif an != 0:\n", " age_print +=str(an)+'ans '\n", " if mois !=0 :\n", " age_print +=str(mois)+'mois '\n", " if jours == 1:\n", " age_print +=str(jours)+'jour'\n", " elif jours != 0 :\n", " age_print +=str(jours)+'jours'\n", " return age_print" ], "execution_count": 0, "outputs": [] }, { "cell_type": "markdown", "metadata": { "id": "hgz92tZhCjKf", "colab_type": "text" }, "source": [ "### Recuperation et preparation des donnnees" ] }, { "cell_type": "code", "metadata": { "id": "P3ag6pRgCjKi", "colab_type": "code", "colab": {} }, "source": [ "def load_data(nom_enfant):\n", " \"\"\"\n", " Récupération et preparation des données pour un enfant donnée en paramètre\n", " \"\"\"\n", " #Chargement du jeu de donnees\n", " data_child = pd.read_csv('/content/data_'+nom_enfant+'_final.csv',\n", " sep = '\\t',\n", " encoding = 'utf-8',\n", " index_col = False)\n", "\n", " # Supression de la colonne qui duplique l'index\n", " data_child = data_child.drop(columns = 'Unnamed: 0') \n", "\n", " # Recuperation de la partie phonetique des intervention de l'enfant\n", " data_child = data_child.where(data_child['type']=='pho').dropna(subset=['type'])\n", "\n", " # Conservation des deux colonnes avec des informations pertinantes \n", " data_child = data_child[['age','contenu']]\n", "\n", " # Creer un indexage a partir de cet ordre\n", " data_child['index']=[ i for i in range(len(data_child))]\n", " data_child= data_child.set_index('index')\n", "\n", " # Creation d'un numpy utile par la suite pour les boucles\n", " ages = data_child.age.unique()\n", "\n", " # Indexage par l'age de l'enfant\n", " age = data_child['age']\n", "\n", " l=[i for i in range(len(age))]\n", " k=[age.values]\n", " k.append(l)\n", " tuples = list(zip(*k))\n", "\n", " index = pd.MultiIndex.from_tuples(tuples, names=['age', 'index'])\n", " data_child.set_index(index,inplace=True)\n", "\n", " # Supression de la colonne age car present en index\n", " data_child=data_child[['contenu']]\n", "\n", " data_child['contenu']=data_child['contenu'].apply(lambda x: str(x))\n", " ages = list(ages)\n", " \n", " return data_child,ages" ], "execution_count": 0, "outputs": [] }, { "cell_type": "markdown", "metadata": { "id": "awslGtRmCjKr", "colab_type": "text" }, "source": [ "### Demarche pour decomposer les mots en phonemes et les indexer" ] }, { "cell_type": "code", "metadata": { "id": "xdZO2W1fPHxq", "colab_type": "code", "colab": {} }, "source": [ "IPA = \"a,r,l,e,s,i,ɛ,ə,t,k,p,d,m,ɑ̃,n,u,v,o,y,ɔ̃,ʒ,ɔ,ɛ̃,f,b,j,w,ɥ,z,ø,ʃ,œ̃,œ,g,ɑ,ɲ,tr,kr,dr,gr,br,pr,fr\".split(',')\n", "ipa_1char = []\n", "ipa_2char = []\n", "for i in IPA :\n", " if len(i) !=1:\n", " ipa_2char.append(i)\n", " else:\n", " ipa_1char.append(i)\n", "\n", "def cut_pho(mot):\n", " \"\"\"\n", " Décortique le mot en phonème\n", " \"\"\"\n", " mot = mot.lower().replace('ː', '').replace('ʔ', '').replace('›', '').replace('ʁ','r')\n", " i = 0\n", " m = []\n", " while i < len(mot):\n", " if mot[i:i+2] in list(ipa_2char):\n", " #print('ATTENTION',mot[i:i+2])\n", " m.append(mot[i:i+2])\n", " i += 2\n", " elif mot[i] in list(ipa_1char):\n", " m.append(mot[i])\n", " i += 1\n", " else: # permet de virer les caractere ne fesant pas partie du dic IPA\n", " i += 1\n", " return m" ], "execution_count": 0, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "QJkxNVqFCPeW", "colab_type": "code", "colab": {} }, "source": [ "IPA_filtre = \"r,t,k,p,d,f,b,g,tr,kr,dr,gr,br,pr,fr\".split(',')\n", "def pho_filtre(mot):\n", " \"\"\"\n", " Décortique le mot en phonème et vérifie qu'il contient au moins un phoneme que l'on souhaite observer\n", " \"\"\"\n", " for i in cut_pho(mot):\n", " if i in IPA_filtre:\n", " return cut_pho(mot)" ], "execution_count": 0, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "f4RxpGNrURiQ", "colab_type": "code", "colab": {} }, "source": [ "def idx_pos(mot):\n", " \"\"\"\n", " Indexe la position du phonème dans le mot\n", " \"\"\"\n", " m = []\n", " for i in range(len(mot)): # On parcours chaque charactère du mot\n", " suff = '_d' # On considère que le 1er charactère lu est au début du mot\n", " if(i>0): # Ensuite on considère que tout les autres charactères sont en milieu de mot\n", " suff = '_m'\n", " if(i==len(mot)-1): # Sauf pour le dernier charactère lu\n", " suff = '_f' \n", " m.append(mot[i]+suff) # On ajouter le phonème indexé \n", " \n", " return m" ], "execution_count": 0, "outputs": [] }, { "cell_type": "markdown", "metadata": { "id": "Q_0_W7jsCjLK", "colab_type": "text" }, "source": [ "### Addaptation de la structure" ] }, { "cell_type": "code", "metadata": { "id": "A9itJNpJCjKu", "colab_type": "code", "colab": {} }, "source": [ "def structure_pm(data_child,ages):\n", " \"\"\"\n", " Décortique les données en mots puis en phonèmes, \n", " tout en respectant la structure nécessaire pour la librairie pymining\n", " \"\"\"\n", " # Sépare en mots au lieu de phrases\n", " mots = {}\n", " for i in list(ages):\n", " # recupere toutes les phrases a la i eme date\n", " phrases = data_child['contenu'].loc[i].values\n", " # creer une variable texte avec toutes les phrases\n", " phrase = ' '.join(phrases)\n", " # sépare en mot\n", " mots[i] = phrase.split()\n", " for j in range(len(mots[i])):\n", " #mots[i][j] = [mots[i][j]] # pour etude du mot\n", " mots[i][j] = idx_pos(cut_pho(mots[i][j])) # pour etude des phonemes dans un mot indexé par position \n", " #mots[i][j] = list(mots[i][j]) # pour etude des lettres dans un mot\n", " mots[i] = tuple(mots[i])\n", "\n", " return mots\n", "\n", " # se parcourt avec un age en parametre\n", " #mots[\"1_00_05\"]" ], "execution_count": 0, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "zWv6h_dQJFP1", "colab_type": "code", "colab": {} }, "source": [ "def structure_pm_filtre(data_child,ages):\n", " \"\"\"\n", " Décortique les données en mots puis en phonemes, \n", " tout en respectant la structure nécessaire pour la librairie pymining. \n", " En conservant uniquement les mots avec des phonèmes recherchés.\n", " \"\"\"\n", " # Sépare en mots au lieu de phrases\n", " mots = {}\n", " for i in list(ages):\n", " # recupere toutes les phrases a la i eme date\n", " phrases = data_child['contenu'].loc[i].values\n", " # creer une variable texte avec toutes les phrases\n", " phrase = ' '.join(phrases)\n", " # sépare en mot\n", " mots[i] = phrase.split()\n", " filtre = []\n", " for j in range(len(mots[i])):\n", " mots[i][j] = pho_filtre(mots[i][j])\n", " if mots[i][j] != None:\n", " filtre.append(idx_pos(mots[i][j]))\n", " mots[i] = tuple(filtre)\n", "\n", " return mots" ], "execution_count": 0, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "VBGlm0460Or7", "colab_type": "code", "colab": {} }, "source": [ "def structure_pm_age_group(data_child,ages):\n", " \"\"\"\n", " Décortique les données en mots puis en phonèmes avec des âges regroupés, \n", " tout en respectant la structure nécessaire pour la librairie pymining\n", " \"\"\"\n", " a_group = group_age(ages)\n", " age_unique = []\n", " rep_age = []\n", " for age in a_group:\n", " if age in age_unique:\n", " rep_age[age_unique.index(age)] +=1\n", " else:\n", " age_unique.append(age)\n", " rep_age.append(1)\n", " \n", " d = 0\n", " mots = {}\n", " ages = list(ages)\n", " for idx in range(len(rep_age)):\n", " fin = d+rep_age[idx]\n", " filtre_ages = ages[d:fin]\n", " a = age_unique[idx]\n", " mots[a] = []\n", " for i in filtre_ages : #groupe les ages\n", " # recupere toutes les phrases a la i eme date\n", " phrases = data_child['contenu'].loc[i].values\n", " # creer une variable texte avec toutes les phrases pour les annees regroupeées\n", " phrase = ' '.join(phrases)\n", " # sépare en mot\n", " mots[a] += phrase.split()\n", " for j in range(len(mots[a])):\n", " #mots[i][j] = [mots[i][j]] # pour etude du mot\n", " mots[a][j] = idx_pos(cut_pho(mots[a][j])) # pour etude des phonemes dans un mot indexé par position \n", " #mots[i][j] = list(mots[i][j]) # pour etude des lettres dans un mot\n", " mots[a] = tuple(mots[a])\n", " d = fin\n", "\n", " return mots,age_unique" ], "execution_count": 0, "outputs": [] }, { "cell_type": "markdown", "metadata": { "id": "vWHeFGdQCjLh", "colab_type": "text" }, "source": [ "### PATTERN MINING initialisation" ] }, { "cell_type": "code", "metadata": { "id": "w5ZyzbS9CjLj", "colab_type": "code", "colab": {} }, "source": [ "!pip install pymining\n", "from pymining import itemmining\n", "from pymining import assocrules\n", "from pymining import seqmining" ], "execution_count": 0, "outputs": [] }, { "cell_type": "markdown", "metadata": { "id": "XzYmebub2Kka", "colab_type": "text" }, "source": [ "### Itemmining etude 1 : motif sequentiel par age par enfant" ] }, { "cell_type": "markdown", "metadata": { "id": "T5S0OY1YESkf", "colab_type": "text" }, "source": [ "#### Itemmining" ] }, { "cell_type": "code", "metadata": { "id": "OstVi_SquzZ0", "colab_type": "code", "colab": {} }, "source": [ "min_supp = 170 # Nombre de répétitions minimums requises du phonème" ], "execution_count": 0, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "FekFMhun9myn", "colab_type": "code", "colab": {} }, "source": [ "def item_mining(ages,mots):\n", " tab_items = []\n", " ages_save = []\n", " for age in ages:\n", " relim_input = itemmining.get_relim_input(mots[age])\n", " item_sets = itemmining.relim(relim_input, min_support=min_supp)\n", " if len(item_sets) != 0:\n", " ages_save.append(age)\n", " tab_item = pd.DataFrame([item_sets]).T.reset_index()\n", " tab_item.columns = ['item','nbr']\n", " \n", " tab_items.append(tab_item)\n", " ages = ages_save\n", " return tab_items,ages" ], "execution_count": 0, "outputs": [] }, { "cell_type": "markdown", "metadata": { "id": "XWwugnd4X1aR", "colab_type": "text" }, "source": [ "#### Sunburst itemmining" ] }, { "cell_type": "markdown", "metadata": { "colab_type": "text", "id": "_0tE3woXVswS" }, "source": [ "##### Exectution pour tout les enfants" ] }, { "cell_type": "code", "metadata": { "id": "DJUVBTO-1da8", "colab_type": "code", "colab": {} }, "source": [ "enfants = ['adrien','anae','antoine','julie','madeleine','theophile']\n", "for enfant in enfants:\n", " globals()['data_'+enfant] , globals()['ages_'+enfant] = load_data(enfant)\n", " globals()['mots_'+enfant] = structure_pm(globals()['data_'+enfant],globals()['ages_'+enfant]) #sans filtre sur les pho\n", " #globals()['mots_'+enfant] = structure_pm_filtre(globals()['data_'+enfant],globals()['ages_'+enfant]) # avec filtre\n", " globals()['tab_items_'+enfant] , globals()['ages_'+enfant] = item_mining(globals()['ages_'+enfant],globals()['mots_'+enfant])\n", " \n", " # faire attention a que len('tab_items_'+enfant) soit au min de 2\n", " print(enfant,len(globals()['tab_items_'+enfant]),'ok')" ], "execution_count": 0, "outputs": [] }, { "cell_type": "markdown", "metadata": { "id": "dVXpkZ6MV9er", "colab_type": "text" }, "source": [ "##### Creation du fichier" ] }, { "cell_type": "code", "metadata": { "id": "PItAzPWfi3uO", "colab_type": "code", "colab": {} }, "source": [ "fichier = open(\"pattern.json\", \"w\")\n", "\n", "######################\n", "init = '{\"name\": \"Pattern Mining\",\\n\"children\": [\\n'\n", "fichier.write(init)\n", "\n", "######################\n", "for enfant in enfants:\n", " write_enfant ='{\"name\": \"' +enfant.capitalize()+ '\",\\n\"children\": [\\n'\n", " fichier.write(write_enfant)\n", " ages = globals()['ages_'+enfant]\n", " tab_items = globals()['tab_items_'+enfant]\n", " ######################\n", "\n", " idx = 0\n", " for age in ages:\n", " write_age ='{\"name\": \"' +print_age(age)+ '\",\\n\"children\": [\\n'\n", " fichier.write(write_age)\n", "\n", " tab_item = tab_items[idx]\n", " #print(idx,age,len(tab_item),len(tab_item) > 2)\n", "\n", " if len(tab_item[:-1]) != 0:\n", " for i in range(len(tab_item[:-1])):\n", " item,nbr = tab_item.loc[i]\n", " to_write = '{ \"name\": \"' +str(item)[11:-2]+ '\", \"value\": ' +str(nbr)+ ' },\\n'\n", " fichier.write(to_write)\n", "\n", " item,nbr = tab_item.loc[len(tab_item)-1] # -1 car l'incrementation commence a zero...\n", " if age != ages[-1]: # si ce n'est pas la derniere ligne du DERNIER age de cet enfant\n", " to_write = '{ \"name\": \"' +str(item)[11:-2]+ '\", \"value\": ' +str(nbr)+ '}\\n]\\n},\\n'\n", " fichier.write(to_write)\n", " idx += 1\n", " else: # si c'est la DERNIERE ligne du DERNIER age de cet enfant\n", " if enfant != enfants[-1]: # si c'est la DERNIERE ligne du DERNIER age mais pas du dernier enfant\n", " to_write = '{ \"name\": \"' +str(item)[11:-2]+ '\", \"value\": ' +str(nbr)+ '}\\n]\\n}\\n]\\n},\\n'\n", " fichier.write(to_write)\n", " else: # si c'est la DERNIERE ligne du DERNIER age du DERNIER enfant (donc la fin du doc)\n", " to_write = '{ \"name\": \"' +str(item)[11:-2]+ '\", \"value\": ' +str(nbr)+ '}\\n]\\n}\\n]\\n}\\n]\\n}'\n", " fichier.write(to_write)\n", "\n", "fichier.close()" ], "execution_count": 0, "outputs": [] }, { "cell_type": "markdown", "metadata": { "id": "63_dhWcx2bUR", "colab_type": "text" }, "source": [ "### Assocrules étude 2 : quel phoneme en entraine un autre ?" ] }, { "cell_type": "markdown", "metadata": { "id": "I9SisLeaUrLj", "colab_type": "text" }, "source": [ "#### Assocrules" ] }, { "cell_type": "code", "metadata": { "id": "AXth-jLaU0tf", "colab_type": "code", "colab": {} }, "source": [ "min_supp = 100 # Nombre de répétitions minimums requises du phonème\n", "conf = 0.8 # Probabilité minimum requise pour dire que le phonème 1 entraine le phonème 2" ], "execution_count": 0, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "dxH-1U-mU04n", "colab_type": "code", "colab": {} }, "source": [ "def assoc_rules(ages,mots):\n", " tab_items = []\n", " ages_save = []\n", " for age in ages:\n", " relim_input = itemmining.get_relim_input(mots[age])\n", " item_sets = itemmining.relim(relim_input, min_support=min_supp)\n", " if len(item_sets) != 0:\n", " rules = assocrules.mine_assoc_rules(item_sets, min_support=min_supp, min_confidence=conf)\n", " if len(rules) != 0:\n", " ages_save.append(age)\n", " tab_item = pd.DataFrame(rules)\n", " tab_item.columns = ['si_pho1','alors_pho2','nbr','pourcent']\n", " tab_items.append(tab_item)\n", " ages = ages_save\n", " return tab_items,ages" ], "execution_count": 0, "outputs": [] }, { "cell_type": "markdown", "metadata": { "id": "1sGqNsoLmBAd", "colab_type": "text" }, "source": [ "#### Sankey assocrules" ] }, { "cell_type": "markdown", "metadata": { "id": "inFT3YTr1dHc", "colab_type": "text" }, "source": [ "##### Exectution pour tout les enfants" ] }, { "cell_type": "code", "metadata": { "colab_type": "code", "id": "JKukGCJPWFEt", "colab": {} }, "source": [ "enfants = ['adrien','anae','antoine','julie','madeleine','theophile']\n", "for enfant in enfants:\n", " globals()['data_'+enfant] , globals()['ages_'+enfant] = load_data(enfant)\n", " #globals()['mots_'+enfant] = structure_pm(globals()['data_'+enfant],globals()['ages_'+enfant]) #sans filtre sur les pho\n", " globals()['mots_'+enfant] , globals()['ages_'+enfant] = structure_pm_age_group(globals()['data_'+enfant],globals()['ages_'+enfant]) #avec regroupement des ages\n", " globals()['tab_items_'+enfant] , globals()['ages_'+enfant] = assoc_rules(globals()['ages_'+enfant],globals()['mots_'+enfant])\n", " globals()['ages_'+enfant] = group_age(globals()['ages_'+enfant])\n", " \n", " # faire attention a que len('tab_items_'+enfant) soit au min de 2\n", " print(enfant,len(globals()['tab_items_'+enfant]),'ok')" ], "execution_count": 0, "outputs": [] }, { "cell_type": "markdown", "metadata": { "colab_type": "text", "id": "rAkM_oYzWTmq" }, "source": [ "##### Creation du fichier" ] }, { "cell_type": "code", "metadata": { "id": "PnUAG9XHxm9j", "colab_type": "code", "colab": {} }, "source": [ "enfants = ['adrien','anae','antoine','julie','madeleine','theophile',['adrien','anae','antoine','julie','madeleine','theophile']]\n", "enfants\n", "for enf in enfants:\n", " if enf != enfants[-1]:\n", " enf = [enf]\n", " source = []\n", " target = []\n", " value = []\n", " for enfant in enf:\n", " ages = globals()['ages_'+enfant]\n", " tab_items = globals()['tab_items_'+enfant]\n", " for i in range(len(tab_items)):\n", " source.append(enfant.capitalize())\n", " target.append(print_age(ages[i]))\n", " value.append(tab_items[i].nbr.sum())\n", " for j in range(len(tab_items[i])):\n", " source.append(print_age(ages[i]))\n", " target.append(str(tab_items[i].si_pho1[j])[11:-2]+' ')\n", " value.append(tab_items[i].nbr[j])\n", "\n", " source.append(str(tab_items[i].si_pho1[j])[11:-2]+' ')\n", " target.append(str(tab_items[i].alors_pho2[j])[11:-2])\n", " value.append(tab_items[i].nbr[j])\n", "\n", " data = pd.DataFrame([source,target,value]).T\n", " data.columns = ['source','target','value']\n", " #data.head()\n", "\n", " if enf != enfants[-1]:\n", " os.chdir('/content/SANKEY')\n", " data.to_csv('data_'+enf[0]+'.csv',\n", " encoding = 'utf-8',\n", " sep = ',',\n", " header = True)\n", " else:\n", " os.chdir('/content/SANKEY')\n", " data.to_csv('data_sankey.csv',\n", " encoding = 'utf-8',\n", " sep = ',',\n", " header = True)\n", " print(enf)" ], "execution_count": 0, "outputs": [] }, { "cell_type": "markdown", "metadata": { "id": "VyHD2enDPUMs", "colab_type": "text" }, "source": [ "### Itemmining etude 3 : ordre d'apparition des phonemes" ] }, { "cell_type": "markdown", "metadata": { "id": "x5iMxfUhY9im", "colab_type": "text" }, "source": [ "#### Structuration des données" ] }, { "cell_type": "code", "metadata": { "id": "QUMn2hhGY5yH", "colab_type": "code", "colab": {} }, "source": [ "def structure_pm_pho(data_child,ages):\n", " \"\"\"\n", " Décortique les données en mots puis en phonèmes indexé, \n", " et le ramène au niveau du phoneme pour la comparaison future, \n", " tout en respectant la structure nécessaire pour la librairie pymining\n", " \"\"\"\n", " # Sépare en mots au lieu de phrases\n", " phos = {}\n", " for i in list(ages):\n", " # recupere toutes les phrases a la i eme date\n", " phrases = data_child['contenu'].loc[i].values\n", " # creer une variable texte avec toutes les phrases\n", " phrase = ' '.join(phrases)\n", " # sépare en mot\n", " phos[i] = phrase.split()\n", " pho = []\n", " for j in range(len(phos[i])):\n", " pho += idx_pos(cut_pho(phos[i][j]))\n", " pho = ' '.join(pho)\n", " phos[i]=pho.split()\n", " pho = []\n", " for i in list(phos[i]):\n", " pho.append([i])\n", " phos[i]=tuple(pho)\n", "\n", " return phos\n", "###########################\n", "def crea_corpus():\n", " \"\"\"\n", " Concatène toutes les data des infos préalablement\n", " structurées avec la fonction structure_pm_pho\n", " \"\"\"\n", " data=[]\n", " for enfant in enfants:\n", " t = 160 # Permettra de creer 160*6enfants enfants ce qui fait 960seéquences\n", " a= len(globals()['ages_'+enfant])\n", " print('print1',enfant,a)\n", " \n", " for age in globals()['ages_'+enfant]:\n", " if len(globals()['phos_'+enfant][age])-t < t:\n", " del globals()['phos_'+enfant][age]\n", " idx = globals()['ages_'+enfant].index(age)\n", " del globals()['ages_'+enfant][idx]\n", " a -= 1\n", " print('print2',enfant,a)\n", " enfants_fictif = [[]]*t\n", " for age in globals()['ages_'+enfant]:\n", " for i in range(t):\n", " nbr = len(globals()['phos_'+enfant][age])%t\n", " if globals()['ages_'+enfant].index(age) == 0:\n", " enfants_fictif[i] = globals()['phos_'+enfant][age][:t]\n", " else :\n", " enfants_fictif[i] += globals()['phos_'+enfant][age][t*i:t*(i+1)]\n", " data += enfants_fictif\n", " return data" ], "execution_count": 0, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "PqJpfQA6HtmM", "colab_type": "code", "colab": {} }, "source": [ "enfants = ['adrien','anae','antoine','julie','madeleine','theophile']\n", "for enfant in enfants:\n", " globals()['data_'+enfant] , globals()['ages_'+enfant] = load_data(enfant)\n", " globals()['phos_'+enfant] = structure_pm_pho(globals()['data_'+enfant],globals()['ages_'+enfant])\n" ], "execution_count": 0, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "OwxUWlxerWkX", "colab_type": "code", "colab": {} }, "source": [ "data = crea_corpus()\n", "min_supp = 960\n", "freq_seqs = seqmining.freq_seq_enum(data,min_supp)\n", "\n", "print(len(freq_seqs))\n", "\n", "itemsets = []\n", "for seq in freq_seqs:\n", " a,b=seq\n", " itemsets.append(a)\n", "\n", "itemsets = pd.DataFrame(itemsets)\n", "itemsets = itemsets.reset_index()\n", "del itemsets['index']\n", "itemsets = itemsets.sort_values(by = [0,1,3,4])\n", "\n", "for i in range(len(itemsets.columns)):\n", " for j in range(len(itemsets)):\n", " if type(itemsets[i][j]) == str :\n", " itemsets[i][j] +=' '*i\n", "\n", "itemsets.head()" ], "execution_count": 0, "outputs": [] }, { "cell_type": "markdown", "metadata": { "id": "r1jsg_tuRCiu", "colab_type": "text" }, "source": [ "#### Visu 1 Tangled tree (echec car trop de donnnees)" ] }, { "cell_type": "code", "metadata": { "id": "8l9ksOf5t_Ay", "colab_type": "code", "colab": {} }, "source": [ "levels=[]\n", "for i in range(len(itemsets.columns)):\n", " level = []\n", " if i == 0:\n", " for j in range(len(itemsets[i])):\n", " if type(itemsets[i][j]) == str :\n", " level.append(\"{id:'\"+ itemsets[i][j] +\"'}\")\n", " else :\n", " for j in range(len(itemsets[i])):\n", " if type(itemsets[i][j]) == str :\n", " level.append(\"{id:'\"+ itemsets[i][j] +\"', parents:\"+ str(list(filter(None,list(itemsets.T[:i][j])))) +'}')\n", " levels.append(level)" ], "execution_count": 0, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "v8Ifq28NvGwa", "colab_type": "code", "colab": {} }, "source": [ "for i in range(len(levels)):\n", " levels[i]=sorted(set(levels[i]))" ], "execution_count": 0, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "ca96spqWyFQd", "colab_type": "code", "colab": {} }, "source": [ "levels = str(levels).replace('\"','').replace(\"}}\",\"}\").replace(\"{{\",\"{\")" ], "execution_count": 0, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "ARBQSat9FE3v", "colab_type": "code", "colab": {} }, "source": [ "fichier = open(\"visu.json\", \"w\")\n", "fichier.write(levels)\n", "fichier.close()" ], "execution_count": 0, "outputs": [] }, { "cell_type": "markdown", "metadata": { "id": "pGQF0yHURVrR", "colab_type": "text" }, "source": [ "#### Visu 2 Sankey" ] }, { "cell_type": "code", "metadata": { "id": "vlfrz2e3MOXO", "colab_type": "code", "colab": {} }, "source": [ "source = []\n", "target = []\n", "for i in range(1,len(itemsets.columns)):\n", " for j in range(len(itemsets)):\n", " if type(itemsets[i][j]) == str:\n", " source.append(itemsets[i-1][j])\n", " target.append(itemsets[i][j])\n", "\n", "value = [1]*len(source)\n", "data = pd.DataFrame([source,target,value]).T\n", "data.columns = ['source','target','value']" ], "execution_count": 0, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "YYlW0wNMOG4b", "colab_type": "code", "colab": {} }, "source": [ "os.chdir('/content/')\n", "data.to_csv('data_etude3_'+str(min_supp)+'_'+str(len(freq_seqs))+'.csv',\n", " encoding = 'utf-8',\n", " sep = ',',\n", " header = True)" ], "execution_count": 0, "outputs": [] } ] }