Койляк Евгений Андреевич

Size: px

Start display at page:

Download "Койляк Евгений Андреевич"

Samantha Carter
5 years ago
Views:

Работы победителя заключительного этапа командной инженерной олимпиады школьников Олимпиада Национальной технологической инициативы Профиль «Большие данные и машинное обучение» Койляк Евгений

1 Работы победителя заключительного этапа командной инженерной олимпиады школьников Олимпиада Национальной технологической инициативы Профиль «Большие данные и машинное обучение» Койляк Евгений Андреевич Класс: 11 Школа: ГБОУ Школа-интернат «Интеллектуал» Уникальный номер участника: 33 Параллель: класс Город: г. Москва Регион: г. Москва Команда на заключительном этапе: Кирпикла Результаты заключительного этапа: Индивидуальная часть Командный результат Математика Информатика Результат Итого 1-й день 2-й день 3-й день 1а 1б 1в

2 Индивидуальная часть Персональный лист участника с номером 33:

3 Лист 1: Математика

4 Лист 1 (обратная сторона): Оценка за задачу 1а: 1 Комментарий к решению: Задача решена верно. Оценка за задачу 1б: 3 Комментарий к решению: Задача решена верно. Оценка за задачу 1в: 0 Комментарий к решению: Задача решена неверно. Оценка за задачу 2: 0 Комментарий к решению: Задача решена неверно.

6 Информатика Задача 1 Задача решена (1 балл). Код программы на языке Python, написанный участником и решающий задачу: 1. n,m = map(int, input().split()) 2. a = list(map(int, input().split())) 3. b = list(map(int, input().split())) 4. a.sort() 5. a.reverse() 6. b.sort() 7. b.reverse() 8. for i in range(min(m,n)): 9. print(a[i], b[i]) 10. if m<n: 11. for i in range(m, n): 12. print(a[i], -1) Задача 2 Задача решена (1 балл). Код программы на языке Python, написанный участником и решающий задачу: import json 3. import math him = clas = 5 7. dop = vozr = n = int(input()) 11. ma = cou = su = ans = [] 15. for i in range(n): 16. v = input().split(', ') 17. b = [int(v[clas]), int(v[him]), int(v[dop]), int(v[vozr])] 18. if b[0] == 8 or b[0] == 9: 19. if b[1] > ma: 20. ma = b[1] 21. ans = v[0][2:-1] + ' ' + v[1][1:-1] 22. if b[2]!= 0: 23. su+= b[3] 24. cou+=1 25. if cou == 0: 26. print("no answer") 27. else: 28. print(su//cou) 29. if ma == -1: 30. print("no answer") 31. else: 32. print(ans)

7 Задача 3 Задача решена (3 балла). Код программы на языке Python, написанный участником и решающий задачу: 1. n,m = map(int, input().split()) 2. mas = [] 3. color = [0]*n 4. for i in range(n): 5. mas.append([]) 6. for i in range(m): 7. a,b = map(int, input().split()) 8. a-=1 9. b-=1 10. mas[a].append(b) 11. mas[b].append(a) def dfs(i, su2): 15. global color 16. color[i] = index 17. for j in mas[i]: 18. if color[j] == 0: 19. su2+= dfs(j, 0) 20. su2+=1 21. return su index = su = [] 25. for i in range(n): 26. if color[i] == 0: 27. index+=1 28. su.append(dfs(i, 0)) print(index) 31. for i in range(0, index): 32. print('['+str(su[i]+1) + ']', end = ' ') 33. for j in range(n): 34. if color[j] == i+1: 35. print(j+1, end=' ') 36. print() Задача 4 Задача решена (3 балла). Код программы на языке Python, написанный участником и решающий задачу: 1. k = int(input()) 2. m = list(map(int, input().split())) 3. alle = [0,0,0,0,0] 4. for i in range(100): 5. v = list(map(int, input().split())) 6. for j in range(5): 7. alle[j] += v[j] 8. def func(x, y): 9. if y == 1: 10. return x 11. else: 12. return 1-x 13. for i in range(5): 14. alle[i]/=

8 16. ans_vero = for i1 in range(2): 18. for i2 in range(2): 19. for i3 in range(2): 20. for i4 in range(2): 21. for i5 in range(2): 22. per = i1*m[0]+ i2*m[1] + i3*m[2] + i4*m[3] + i5*m[4] 23. if per >= k: 24. ans_vero+= func(alle[0], i1) *func(alle[1], i2) * func(alle[2], i3) * func(alle[3], i4) * func(alle[4], i5) 25. print(int(ans_vero* )) Задача 5 Задача решена (4 балла). Код программы на языке Python, написанный участником и решающий задачу: 1. color_dfs = [] 2. color = [] def dfs(i, t): 5. global color, color_dfs 6. color_dfs[i] = False 7. color[i] = t 8. for wqwe, x,y in vzat: 9. if x == i and color_dfs[y]: 10. dfs(y, t) 11. if y == i and color_dfs[x]: 12. dfs(x,t) n,m = list(map(int, input().split())) 15. sp_reb = [] for i in range(m): 18. a,b,w = list(map(int, input().split())) 19. a-=1 20. b-=1 21. sp_reb.append([w,a,b]) color = [0]*n 25. for i in range(n): 26. color[i] = i i = sp_reb.sort() 31. vzat = [] 32. index = while index < n-1: 34. if color[sp_reb[i][1]]!= color[sp_reb[i][2]]: 35. index+=1 36. vzat.append(sp_reb[i]) 37. color_dfs = [True]*n 38. dfs(sp_reb[i][1], color[sp_reb[i][1]]) 39. i+=1 40. su2 = for i in range(n-1): 42. su2 += vzat[i][0] 43. print(su2)

9 Командная часть Результаты были получены в рамках выступления команды: Кирпикла. Личный состав команды: Койляк Евгений Андреевич Коноваленко Даниил Дмитриевич Шумилов Кирилл Дмитриевич Результаты работы в командном этапе: Задача Задача 1 Задача 2 Задача 3 Результат Далее представлен код команды по задачам. Код был написан на языке Python, обрабатывал предоставленный организаторами набор данных. Задача 1 # coding: utf-8 # In[27]: from GraphParser import graphparser as gp from matplotlib import pyplot as plt import matplotlib import Task1Base as tb import os import pandas as pd # In[18]: graph_columns = ["from", "to", "links", "mask"] demography_columns = ["userid", "birth_date"] # In[19]: graph_folder_path = os.path.join("task1", "Task1", "graph") demography_folder_path = os.path.join("task1", "Task1", "traindemography") # In[20]: graph_lines = 5000 demography_lines = 5000 # In[21]: graph_fd = "" demography_fd = "" # In[22]:

# page 10 (marker carried over from the extracted document)
# Team Task 1, continuation of the notebook export: parse the graph and
# demography folders, build an age table and run the project's predictor.
graph_index_col = "from"
demography_index_col = "userid"

# In[24]:
# NOTE(review): `parsefolderbyschema` comes from the project module GraphParser;
# its exact spelling/casing could not be verified from this listing.
train_graph, graph_fd = gp.parsefolderbyschema(
    graph_folder_path, graph_lines, graph_fd,
    graph_index_col, graph_columns, True)

# In[25]:
train_demography, demography_fd = gp.parsefolderbyschema(
    demography_folder_path, demography_lines, demography_fd,
    demography_index_col, demography_columns, True)

# In[26]:
# `train_demography` is consumed through `.items()`, i.e. as a mapping; each
# (key, value) pair becomes one ("id", "age") row, indexed by "id".
# `pd.DataFrame` is the real class name; `pd.dataframe` raises AttributeError.
train_demography_df = pd.DataFrame(
    list(train_demography.items()), columns=["id", "age"])
train_demography_df.index = train_demography_df['id']
train_demography_df = train_demography_df[['age']]

# In[28]:
train_demography_df.head()

# In[29]:
from sklearn.linear_model import LinearRegression
linear_regression = LinearRegression()

# In[12]:
from Task1Base import prediction_function

# In[14]:
get_ipython().magic('load_ext autoreload')

# In[17]:
# The exported cell contained the bare IPython magic `autoreload 2`, which is
# not valid Python; route it through the same API as the cell above.
get_ipython().magic('autoreload 2')

# In[19]:
prediction_function(train_demography, train_graph)

# pages 11-13 (markers carried over from the extracted document)
# Team Task 2. The listing concatenates what appear to be three separate
# source files; the separators below mark where each one starts.

# --- script 1: per-user features aggregated over the user's friends --------
import pickle
from collections import Counter, defaultdict

import pandas as pd


def get_age(user_id, df):
    """Return ``df.loc[user_id]['birth_date']`` converted to ``int``."""
    return int(df.loc[user_id]['birth_date'])


def get_location(user_id, df):
    """Return ``df.loc[user_id]['id_location']`` converted to ``int``."""
    return int(df.loc[user_id]['id_location'])


def get_country(user_id, df):
    """Return ``df.loc[user_id]['id_country']`` converted to ``int``."""
    return int(df.loc[user_id]['id_country'])


def _friend_statistics(friend_ids, df):
    """Collect ages and location/country frequencies over ``friend_ids``.

    A friend missing from ``df`` (KeyError) or whose location cannot be
    converted (TypeError) is reported on stdout and contributes nothing
    further. Returns ``(ages, locations, countries)``; any of them may be empty.
    """
    ages = []
    locations = Counter()
    countries = Counter()
    for friend in friend_ids:
        try:
            ages.append(get_age(friend, df))
            try:
                locations[get_location(friend, df)] += 1
            except TypeError:
                print('check this guy', friend)
                continue
            countries[get_country(friend, df)] += 1
        except KeyError:
            print('check this guy', friend)
    return ages, locations, countries


# NOTE(security): pickle.load executes arbitrary code from the file; only
# load pickles produced by this project.
with open('graph_.pkl', 'rb') as fh:
    graph = pickle.load(fh)
with open('demog_pandas.pkl', 'rb') as fh:
    df = pickle.load(fh)
with open('know_loc_y.pkl', 'rb') as fh:
    know_loc_y = pickle.load(fh).iloc[:5000]
know_loc_ids = know_loc_y.index.values
print(len(know_loc_ids))

friends_of_y = dict()
for i in know_loc_ids:
    friends_of_y.setdefault(i, graph[i])

missed = list()
features = dict()
i = 0  # number of users for which a feature row was produced
for user, friends in friends_of_y.items():
    print(user)
    # A graph entry is either a list of friends or a single friend; treat
    # both uniformly. (The original single-friend branch looked up the
    # country of a stale loop variable `friend` instead of `friends`.)
    friend_ids = friends if isinstance(friends, list) else [friends]
    ages, frequent_location, frequent_country = _friend_statistics(friend_ids, df)
    if not (ages and frequent_location and frequent_country):
        # Nothing usable: previously ZeroDivisionError / IndexError for lists.
        missed.append(user)
        continue
    features.setdefault(user, [sum(ages) / len(ages),
                               frequent_location.most_common(1)[0][0],
                               frequent_country.most_common(1)[0][0]])
    print(i)
    i += 1

features = pd.DataFrame.from_dict(features, orient='index')
features.to_pickle('features.pkl')
with open('missed.txt', 'w') as fh:
    print(missed, file=fh)
print(know_loc_y)
print(features)

# --- script 2: list users whose location is unknown ------------------------
import pickle

with open('graph.pkl', 'rb') as fh:
    graph = pickle.load(fh)
with open('locs.pkl', 'rb') as fh:
    locs = pickle.load(fh)

with open('ids_to_predict.txt', 'w') as file:
    for user in graph:
        try:
            print('{} - {}'.format(user, locs[user]))
        except KeyError:
            print('{} - location does not exists'.format(user))
            print(user, file=file)

# --- script 3: weighted nearest-friend location prediction -----------------
from collections import Counter

from in_memory import get_friends, get_location

# Relationship name for each position of the string returned by mask_open().
_RELATIONSHIPS = {
    1: 'Love', 2: 'Spouse', 3: 'Parent', 4: 'Child', 5: 'Brother/Sister',
    6: 'Uncle/Aunt', 7: 'Relative', 8: 'Close friend', 9: 'Colleague',
    10: 'Schoolmate', 11: 'Nephew', 12: 'Grandparent', 13: 'Grandchild',
    14: 'College/University fellow', 15: 'Army fellow', 16: 'Parent in law',
    17: 'Child in law', 18: 'Godparent', 19: 'Godchild',
    20: 'Playing together', 21: '',
}


def mask_open(mask):
    """Return the binary digits of ``mask`` reversed, prefixed with ``'0'``.

    Character ``i`` (``i >= 1``) of the result is therefore bit ``i - 1`` of
    ``mask``; character 0 is always ``'0'``.
    """
    mask = int(mask)
    opened = bin(mask)[2:]
    opened = '0' + opened[-1::-1]
    return opened


def get_relationships(mask: int) -> set:
    """Return the set of relationship names whose bit is set in ``mask``.

    Bits beyond the known table are ignored (they used to raise KeyError).
    """
    return {
        _RELATIONSHIPS[index]
        for index, bit in enumerate(mask_open(mask))
        if bit == '1' and index in _RELATIONSHIPS
    }


def is_representative(mask: int):
    """Return the weight of a friendship described by relationship ``mask``.

    1 when no relationship is set, 0.1 when 'Army fellow' or 'Playing
    together' is present, otherwise 7 when any of 'Love', 'Spouse', 'Parent',
    'Child' is present, otherwise 1.
    """
    set_mask = get_relationships(mask)
    non_represent = {'Army fellow', 'Playing together'}
    very_represent = {'Love', 'Spouse', 'Parent', 'Child'}
    if not set_mask:
        return 1
    if non_represent & set_mask:
        return 0.1
    if very_represent & set_mask:
        return 7
    return 1


def _neighborhood(userid: int) -> set:
    """Return the set of first elements of every entry of get_friends(userid)."""
    return {entry[0] for entry in get_friends(userid)}


def jaccard_from_kailiak(userid_1: int, userid_2: int) -> float:
    """Return |common friends| / |friends of userid_1| (0 if user 1 has none).

    ``userid_1`` is the user for which a prediction is made.
    """
    neighborhood_1 = _neighborhood(userid_1)
    neighborhood_2 = _neighborhood(userid_2)
    c_friends = neighborhood_1 & neighborhood_2
    try:
        return len(c_friends) / len(neighborhood_1)
    except ZeroDivisionError:
        return 0


def jaccard_coefficient(userid_1: int, userid_2: int) -> float:
    """Return the Jaccard coefficient of the two friend sets (0.0 if both empty)."""
    neighborhood_1 = _neighborhood(userid_1)
    neighborhood_2 = _neighborhood(userid_2)
    c_friends = neighborhood_1 & neighborhood_2
    # The union operator was lost in the listing ("neighborhood_1 neighborhood_2").
    all_friends = neighborhood_1 | neighborhood_2
    if not all_friends:
        return 0.0
    return len(c_friends) / len(all_friends)


def list_nearest(user_id: int, friends: list) -> list:
    """Return ``(coefficient, location)`` for every friend, best first.

    ``friends`` is iterated as ``(friend_id, friend_mask)`` pairs.
    """
    answers = [
        ((1 + jaccard_from_kailiak(user_id, friend_id)) * is_representative(friend_mask),
         get_location(friend_id))
        for friend_id, friend_mask in friends
    ]
    answers.sort(reverse=True)
    return answers


def k_nearest(user_id, friends, k):
    """Return the most frequent location among the ``k`` best-ranked friends.

    Ties go to the location seen first in ranking order; -1 is returned when
    there is nothing to vote on. The original slice used ``max(k, len(...))``,
    which always selected every friend and made ``k`` meaningless.
    """
    ranked = list_nearest(user_id, friends)
    votes = Counter(location for _score, location in ranked[:k])
    if not votes:
        return -1
    return votes.most_common(1)[0][0]


def nearest(user_id, friends):
    """Return the location of the single best-ranked friend (``k_nearest`` with k=1)."""
    return k_nearest(user_id, friends, 1)


# --- next source file in the listing (its body continues on the next page) --
import pickle
import os.path

# page 14 (marker carried over from the extracted document)
# NOTE(security): pickle.load executes arbitrary code from the file; only
# load pickles produced by this project.
with open(os.path.join('..', 'res', 'graph.pkl'), 'rb') as _fh:
    pickled_graph = pickle.load(_fh)
with open(os.path.join('..', 'res', 'locs_.pkl'), 'rb') as _fh:
    pickled_demography = pickle.load(_fh)


def get_friends(user_id):
    """Return the graph entry of ``user_id`` (``[]`` when the user is absent).

    When the first element of the entry is not a list, the entry is wrapped
    into a one-element list. Entries that cannot be indexed (empty or not
    subscriptable) are returned unchanged.
    """
    result = pickled_graph.get(user_id, [])
    try:
        first = result[0]
    except (IndexError, KeyError, TypeError):
        return result
    return result if isinstance(first, list) else [result]


def get_location(user_id):
    """Return the stored location of ``user_id`` as ``int`` (0 when unknown)."""
    return int(pickled_demography.get(user_id, 0))


# Task 3 (team part) -- notebook export
# coding: utf-8

# In[55]:
import csv, gzip, os, glob
import pandas as pd
import numpy as np
import scipy as sp
from scipy.sparse import coo_matrix, csr_matrix


# In[56]:
def jaccard_score(common_friends_matrix: csr_matrix, from_user: int, to_user: int) -> float:
    """Return M[f, t] / (M[f, f] + M[t, t] - M[f, t]) for the given matrix.

    Presumably the diagonal holds each user's own friend count, which makes
    this a Jaccard coefficient -- verify against the data producer.
    Returns 0.0 when the denominator is zero or the ratio is not finite
    (the original compared against ``sp.inf`` only, so 0/0 leaked NaN).
    """
    common = common_friends_matrix[from_user, to_user]
    union = (common_friends_matrix[from_user, from_user]
             + common_friends_matrix[to_user, to_user]
             - common)
    if union == 0:
        return 0.0
    score = common / union
    return float(score) if np.isfinite(score) else 0.0


# In[75]:
def jaccard_feature(withdemography, commonfriends_csr):
    """Add a float32 ``"jaccard"`` column to ``withdemography`` in place.

    One score is computed per row from its ``"from"``/``"to"`` ids; progress
    is printed every 1000 rows.
    """
    # Read the id columns directly: DataFrame.iterrows() builds a Series per
    # row and does not preserve dtypes, so integer ids can arrive as floats,
    # which are not valid sparse-matrix indices.
    from_ids = withdemography["from"].values
    to_ids = withdemography["to"].values
    jaccard_array = np.zeros(len(from_ids), dtype=np.float32)
    for count, (from_id, to_id) in enumerate(zip(from_ids, to_ids)):
        if count % 1000 == 0:
            print(count)
        jaccard_array[count] = jaccard_score(commonfriends_csr, from_id, to_id)
    withdemography["jaccard"] = jaccard_array

# page 15 (marker carried over from the extracted document)

# In[76]:
def _equality_flags(frame, left, right):
    """Return a float32 array with 1.0 where ``frame[left] == frame[right]``."""
    return (frame[left].values == frame[right].values).astype(np.float32)


def dgender_feature(withdemography):
    """Add a float32 ``'dgender'`` column in place.

    The value is 1.0 where ``'gender'`` equals ``'gender_to'`` and 0.0
    elsewhere. The comparison is done on whole columns, so the per-row
    progress output of the former loop is gone.
    """
    withdemography['dgender'] = _equality_flags(withdemography, 'gender', 'gender_to')


# In[77]:
def dloc_feature(withdemography):
    """Add a float32 ``'dloc'`` column in place.

    The value is 1.0 where ``'id_country'`` equals ``'id_country_to'`` and
    0.0 elsewhere. The original looped over the global
    ``withdemography_train`` instead of its argument, so other frames were
    filled from the wrong data (or failed with NameError).
    """
    # NOTE(review): the demography loader below names this column
    # "ID_country"; confirm which spelling the merged frame really carries.
    withdemography['dloc'] = _equality_flags(withdemography, 'id_country', 'id_country_to')


# In[78]:
# Gather every demography part into a single data frame. Parts are collected
# first and concatenated once, instead of re-copying the growing frame per file.
_core_parts = []
for f in glob.glob("./coredemography/part*"):
    print(f)
    _core_parts.append(
        # read_csv handles both the tab-separated layout and gzip natively.
        pd.read_csv(
            f,
            sep="\t",
            compression='gzip',
            # Index by user id for fast lookup.
            index_col=["userid"],
            names=["userid", "create_date", "birth_date", "gender",
                   "ID_country", "ID_Location", "loginregion"],
            dtype={
                "userid": np.int32,
                "create_date": np.int64,
                "birth_date": np.float16,
                "gender": np.int8,
                "ID_country": np.int64,
                "ID_Location": np.int64,
                # The field may be empty, so use float: blanks become NaN.
                "loginregion": np.float32,
            },
        ))
# pd.concat rejects an empty list, so keep the original empty-frame fallback.
core = pd.concat(_core_parts) if _core_parts else pd.DataFrame()

# page 16 (marker carried over from the extracted document)

# In[79]:
datapath = "./"


# In[80]:
def _load_csr(file_name):
    """Rebuild a CSR matrix from the data/indices/indptr/shape arrays of an .npz file in ``datapath``."""
    with np.load(os.path.join(datapath, file_name)) as loaded:
        return csr_matrix((loaded['data'], loaded['indices'], loaded['indptr']),
                          shape=loaded['shape'])


# Load the training sample of common friends.
commonfriends_train_csr = _load_csr('commonfriends_train.npz')
commonfriends_train = commonfriends_train_csr.tocoo()

# Load the training sample of links.
train_markup = _load_csr('train_markup.npz').tocoo()

# `pd.DataFrame` is the real class name; `pd.dataframe` raises AttributeError.
train_markup_df = pd.DataFrame(data={"from": train_markup.row,
                                     "to": train_markup.col,
                                     "label": train_markup.data})
train_common_friends_df = pd.DataFrame(data={"from": commonfriends_train.row,
                                             "to": commonfriends_train.col,
                                             "common_neighbors": commonfriends_train.data})

# In[81]:
# Positives: labelled pairs that also appear in the common-friends table.
positives_train = pd.merge(
    train_markup_df,
    train_common_friends_df,
    how='inner',
    left_on=["from", "to"],
    right_on=["from", "to"])

# Negatives: a 1% sample (with replacement) of common-friend pairs, keeping
# those whose label is not 1; pairs without markup get label 0.0.
sample_neg_train = train_common_friends_df.sample(frac=0.01, replace=True)
del train_common_friends_df
pre_negatives_train = pd.merge(
    sample_neg_train,
    train_markup_df,
    how='left',
    left_on=["from", "to"],
    right_on=["from", "to"])
negatives_train = pre_negatives_train[pre_negatives_train["label"] != 1].replace(float("NaN"), 0.0)
dataset_train = pd.concat([positives_train, negatives_train])

# In[82]:
# Free the intermediates; only `dataset_train` is needed from here on.
del positives_train
del negatives_train
del sample_neg_train
del train_markup_df

# In[83]:

# pages 17-19 (markers carried over from the extracted document)
# The original cells used several names before they were assigned (or after
# they were deleted), deleted names that never existed, labelled the test
# markup column "common_neighbors", and built the validation frame from the
# training matrix. The helpers below apply the training recipe to each split
# in a valid order.


def _load_csr(file_name):
    """Rebuild a CSR matrix from the data/indices/indptr/shape arrays of an .npz file in ``datapath``."""
    with np.load(os.path.join(datapath, file_name)) as loaded:
        return csr_matrix((loaded['data'], loaded['indices'], loaded['indptr']),
                          shape=loaded['shape'])


def _attach_demography(dataset):
    """Join ``core`` onto both ends of every pair and drop the creation dates.

    Columns describing the ``"to"`` user get the ``"_to"`` suffix.
    """
    merged = pd.merge(
        pd.merge(dataset, core, how='inner', left_on=["from"], right_index=True),
        core, how='inner', left_on=["to"], right_index=True,
        suffixes=['', "_to"])
    del merged['create_date']
    del merged['create_date_to']
    return merged


def _build_split(common_friends_file, markup_file):
    """Return ``(common_friends_csr, withdemography)`` for one data split.

    Positives are labelled pairs present in the common-friends table;
    negatives are a 1% sample (with replacement) of common-friend pairs whose
    label is not 1, with missing labels set to 0.0.
    """
    common_csr = _load_csr(common_friends_file)
    common = common_csr.tocoo()
    markup = _load_csr(markup_file).tocoo()
    markup_df = pd.DataFrame(data={"from": markup.row,
                                   "to": markup.col,
                                   "label": markup.data})
    common_df = pd.DataFrame(data={"from": common.row,
                                   "to": common.col,
                                   "common_neighbors": common.data})
    positives = pd.merge(markup_df, common_df, how='inner',
                         left_on=["from", "to"], right_on=["from", "to"])
    sample_neg = common_df.sample(frac=0.01, replace=True)
    pre_negatives = pd.merge(sample_neg, markup_df, how='left',
                             left_on=["from", "to"], right_on=["from", "to"])
    negatives = pre_negatives[pre_negatives["label"] != 1].replace(float("NaN"), 0.0)
    return common_csr, _attach_demography(pd.concat([positives, negatives]))


# In[83]-In[84]: training pairs joined with demography.
withdemography_train = _attach_demography(dataset_train)

# In[85]:
jaccard_feature(withdemography_train, commonfriends_train_csr)
dgender_feature(withdemography_train)
dloc_feature(withdemography_train)

# In[86], In[24]: test split, with the same features as the training frame.
commonfriends_test_csr, withdemography_test = _build_split(
    'commonfriends_test.npz', 'test_markup.npz')
jaccard_feature(withdemography_test, commonfriends_test_csr)
dgender_feature(withdemography_test)
dloc_feature(withdemography_test)

# Validation split; its feature columns are added by the cells that follow.
# NOTE(review): both file names are kept exactly as listed ("commonfriedns",
# "marup"); they look like typos -- confirm against the files on disk.
commonfriends_valid_csr, withdemography_valid = _build_split(
    'commonfriedns_validation.npz', 'validation_marup.npz')

# page 20 (marker carried over from the extracted document)
# Validation features. jaccard_feature takes the common-friends matrix as its
# second argument; the original call omitted it.
jaccard_feature(withdemography_valid, commonfriends_valid_csr)
dgender_feature(withdemography_valid)
dloc_feature(withdemography_valid)

withdemography_train.to_pickle('withdemography_train.pkl')
withdemography_test.to_pickle('withdemography_test.pkl')
withdemography_valid.to_pickle('withdemography_valid.pkl')

# `fields` must exist before it is used to slice the frames.
fields = ['jaccard']

y_train, X_train = withdemography_train['label'].values, withdemography_train[fields].values
y_test, X_test = withdemography_test['label'].values, withdemography_test[fields].values
# The original took the validation arrays from the training frame.
y_valid, X_valid = withdemography_valid['label'].values, withdemography_valid[fields].values

from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier()
# The listing referenced lowercase `x_train` / `x_test`, which are never defined.
clf.fit(X_train, y_train)
clf.score(X_test, y_test)

# `sklearn.externals.joblib` was removed from recent scikit-learn releases;
# prefer the standalone package and fall back to the bundled one.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib
joblib.dump(clf, 'randforest.pkl')

import pickle

# From here on X_valid is whatever 'x_valid.pkl' holds (later cells index it
# by column name), replacing the array computed above.
# NOTE(security): pickle.load executes arbitrary code from the file; only
# load pickles produced by this project.
with open('x_valid.pkl', 'rb') as _fh:
    X_valid = pickle.load(_fh)

# page 21 (marker carried over from the extracted document)


def _probability_table(frame, probabilities):
    """Return the from/to pairs of ``frame`` with both class probabilities.

    Column 1 of ``probabilities`` becomes ``'is_friend'`` and column 0
    ``'not_friend'``; rows are sorted by ``'from'``, then ``'not_friend'``.
    The pair columns are copied so the new columns are not written into a
    slice of ``frame``.
    """
    table = frame[['from', 'to']].copy()
    table['is_friend'] = np.array([row[1] for row in probabilities])
    table['not_friend'] = np.array([row[0] for row in probabilities])
    return table.sort_values(['from', 'not_friend'])


# Random forest probabilities for the validation pairs. The listing
# referenced lowercase `x_valid`, which is never defined.
answer = clf.predict_proba(X_valid[fields].values)
val = _probability_table(X_valid, answer)
val[val['from'] == 2405]  # notebook inspection cell; no effect in a script

from sklearn.neighbors import KNeighborsClassifier

clf1 = KNeighborsClassifier(n_neighbors=40, weights='distance')
# NOTE(review): the listing calls clf1.fit(x, y) and scores on x_test, none of
# which is defined in the visible code; the arrays used for the random forest
# are assumed here -- confirm.
clf1.fit(X_train, y_train)
clf1.score(X_test, y_test)

answer = clf1.predict_proba(X_valid[fields].values)
val = _probability_table(X_valid, answer)

# Best candidates first within every user: ascending 'from', descending 'jaccard'.
X_valid = X_valid.sort_values(['from', 'jaccard'], ascending=[True, False])

# page 22 (marker carried over from the extracted document)
X_valid  # notebook inspection cell; no effect in a script
clf.score(X_test, y_test)

# Every distinct source user, in ascending order.
ids = np.array(sorted(set(X_valid['from'].values)))

import csv

# The original tested `X_valid['label'] == 1` (a whole Series) inside the
# loop, which raises "truth value of a Series is ambiguous".
# NOTE(review): presumably the intent was to leave out pairs already labelled
# as links; that reading is implemented here -- confirm.
candidates = X_valid[X_valid['label'] != 1]

# Group once instead of scanning the whole frame for every user; row order
# inside each group is the current order of X_valid.
top_by_user = {
    user_id: group['to'].tolist()
    for user_id, group in candidates.groupby('from', sort=False)
}

# One tab-separated line per user: the user id followed by candidate ids.
# newline='' is what the csv module expects for files it writes to.
with open('result.csv', 'w', newline='') as f:
    writer = csv.writer(f, delimiter='\t')
    for user_id in ids:
        writer.writerow([user_id] + top_by_user.get(user_id, []))

Русинович Андрей Сергеевич

Работа победителя заключительного этапа командной инженерной олимпиады школьников Олимпиада Национальной технологической инициативы Профиль «Водные робототехнические системы» Русинович Андрей Сергеевич