TeorinKim / NetworkAnalysis

0 stars 0 forks source link

TourAPI 데이터 가져오기 #15

Open TeorinKim opened 2 years ago

TeorinKim commented 2 years ago

import pandas as pd import numpy as np

from SPARQLWrapper import SPARQLWrapper, JSON

sparql = SPARQLWrapper("http://data.visitkorea.or.kr/sparql") def getresult1(r2): prefix = """ PREFIX skos: http://www.w3.org/2004/02/skos/core# PREFIX rdf: http://www.w3.org/1999/02/22-rdf-syntax-ns# PREFIX rdfs: http://www.w3.org/2000/01/rdf-schema# PREFIX dc: http://purl.org/dc/elements/1.1/ PREFIX owl: http://www.w3.org/2002/07/owl# PREFIX xsd: http://www.w3.org/2001/XMLSchema# PREFIX vi: http://www.saltlux.com/transformer/views# PREFIX kto: http://data.visitkorea.or.kr/ontology/ PREFIX ktop: http://data.visitkorea.or.kr/property/ PREFIX ids: http://data.visitkorea.or.kr/resource/ PREFIX wgs: http://www.w3.org/2003/01/geo/wgs84_pos# PREFIX foaf: http://xmlns.com/foaf/0.1/ PREFIX geo: http://www.saltlux.com/geo/property# PREFIX pf: http://www.saltlux.com/DARQ/property# """

select_state = """
                    SELECT ?label ?location ?mapx ?mapy ?address ?cat ?subcat ?category ?descript
                    WHERE {
                      ?id rdfs:label ?label ;
                          ktop:location ?location ;
                          wgs:long ?mapx ;
                          wgs:lat ?mapy ;
                          ktop:address ?address ;
                          ktop:category ?category ;
                          dc:description ?descript.

                     ?location skos:broader ?upper_loc .
                     ?upper_loc skos:prefLabel ?upper_name .
                     FILTER langMatches( lang(?upper_name), "KO") .  
                     FILTER (contains( ?upper_name, '""" + r2 + """')) .

                     ?upper_loc skos:broader ?top_loc .
                     ?top_loc skos:prefLabel ?top_name .
                     FILTER langMatches (lang(?top_name), "KO" ) .
                     FILTER (contains( ?top_name, '""" + '제주' + """')) .

                    ?category skos:broader ?upper_category .
                    ?category rdfs:label ?subcat.
                    ?upper_category skos:broader ?top_category .
                    ?top_category skos:prefLabel ?cat .
                    FILTER langMatches( lang(?cat), "KO" )  .

                      } 
    """

sparql.setQuery(prefix + select_state)
sparql.setReturnFormat(JSON)
result = sparql.query().convert()

return result

result1 = getresult1('제주') result2 = getresult1('서귀포')

len(result2['results']['bindings'])

result2

tourapi1=[] data = result1['results']['bindings']

for i in range(len(data)): temp=[] templabel = data[i]['label']['value'] x=float(data[i]['mapx']['value']) y=float(data[i]['mapy']['value']) temp.append(templabel) temp.append(data[i]['cat']['value']) temp.append(data[i]['subcat']['value']) temp.append(y) temp.append(x) temp.append(data[i]['address']['value']) temp.append(data[i]['descript']['value']) check = str(templabel) + " " + str(x) + " " + str(y) temp.append(check) tourapi1.append(temp)

tourapi2=[] data = result2['results']['bindings']

for i in range(len(data)): temp=[] templabel = data[i]['label']['value'] x=float(data[i]['mapx']['value']) y=float(data[i]['mapy']['value']) temp.append(templabel) temp.append(data[i]['cat']['value']) temp.append(data[i]['subcat']['value']) temp.append(y) temp.append(x) temp.append(data[i]['address']['value']) temp.append(data[i]['descript']['value']) check = str(templabel) + " " + str(x) + " " + str(y) temp.append(check) tourapi1.append(temp)

api_base = tourapi1 + tourapi2

len(api_base)

api_base

col=['name','cat1','cat3','mapy','mapx','address','description' , 'gps_key']

apibase=pd.DataFrame(api_base,columns=col)

apibase

apibase.to_excel('D:/ML/prepost/apibase_0910.xlsx', index=False)

pp_raw=pd.read_excel('D:/ML/prepost/에디트립여행정보데이터.xlsx')

pp_raw

pp_raw_list = pp_raw.values.tolist()

pp_raw_list

key_pp_raw = [] key1_pp = [] key2_pp = []

for i in range(len(pp_raw)): name = pp_raw['지점명'][i] name = name.replace(" ","") address = pp_raw['신주소'][i] y = float(pp_raw['좌표(lat)'][i]) x = float(pp_raw['좌표(lon)'][i]) if address.find("제주특별자치도 ") == -1: address2 = "제주특별자치도 " + address else: address2 = address key1 = name + " " + str(round(y,8)) + " " + str(round(x,8)) key2 = name + " " + address2 key1_pp.append(key1) key2_pp.append(key2) temp = [key1, key2] key_pp_raw.append(temp)

pp_raw['key_add'] = key2_pp pp_raw['key_gps'] = key1_pp

pp_raw

matched_pp = [] matched_pp_time = [] matched_pp_closed = [] matched_pp_park = [] matched_pp_descripion = [] m_check_pp = []

for gps, add in key_pp_raw: timetemp = "" closedtemp = "" parktemp = "" descriptiontemp = "" matching_check = "not matched"

for name, cat, address, mapy, mapx, time, closed, park, description, address_key, gps_key in api_add:

    if gps==gps_key:
        timetemp = time
        closedtemp = closed
        parktemp = park
        descriptiontemp = description
        matching_check = "matched"
    elif add==address_key:
        timetemp = time
        closedtemp = closed
        parktemp = park
        descriptiontemp = description
        matching_check = "matched"

temp = [gps, add, timetemp, closedtemp, parktemp, descriptiontemp]

m_check_pp.append(matching_check)
matched_pp.append(temp)

m_check_pp

matched_api = [] m_check_api = []

for name, cat, address, mapy, mapx, time, closed, park, description, address_key, gps_key in api_add:

timetemp = ""
closedtemp = ""
parktemp = ""
descriptiontemp = ""
matching_check = "not matched"

for gps, add in key_pp_raw:

    if gps==gps_key:
        timetemp = time
        closedtemp = closed
        parktemp = park
        descriptiontemp = description
        matching_check = "matched"
    elif add==address_key:
        timetemp = time
        closedtemp = closed
        parktemp = park
        descriptiontemp = description
        matching_check = "matched"

temp = [gps_key, address_key, timetemp, closedtemp, parktemp, descriptiontemp]

m_check_api.append(matching_check)
matched_api.append(temp)

m_check_api

apibase['m_check']=m_check_api

pp_raw['m_check']=m_check_pp

apibase

api_m = apibase[apibase['m_check']=='not matched']

api_m

##########################

kc_493=pd.read_csv('D:/ML/prepost/KC_493_DMSTC_MCST_CAFE_DSRT_2020.csv') kc_495=pd.read_csv('D:/ML/prepost/KC_495_LLR_ATRCTN_2020.csv') kc_505=pd.read_csv('D:/ML/prepost/KC_505_DMSTC_MCST_RSTRT_2020.csv') kc_507=pd.read_csv('D:/ML/prepost/KC_507_LLR_HISTST_2020.csv')

kc_493.insert(0, "구분", "kc_493") kc_495.insert(0, "구분", "kc_495") kc_505.insert(0, "구분", "kc_505") kc_507.insert(0, "구분", "kc_507")

df = pd.concat([kc_493, kc_495, kc_505, kc_507])

df2 = df[df['sido_nm']=='제주특별자치도']

df2

name_map = {label:idx for idx, label in enumerate(np.unique(list(df2['id_poi'])))} len(name_map)

num=[]

for i in range(len(df2)): num.append(i)

지점명, 구주소, 신주소, 좌표(lat), 좌표(lon), 영업시간, 휴무일, 주차, 평균소요시간, 태그, 개요, key_add, key_gps

bc_data = df2

bc_data = bc_data.loc[:,['구분','poi_nm', 'sido_nm', 'sgg_nm','bemd_nm','ri_nm', 'beonji','rd_nm','bld_num','y','x','mcate_nm']]

bc_data

mcate_nm 태그로 넣기

bc_list = bc_data.values.tolist()

col2 = ['구분','poi_nm', 'sido_nm', 'sgg_nm','bemd_nm','ri_nm', 'beonji','rd_nm','bld_num','y','x','mcate_nm']

bc_data2 = pd.DataFrame(bc_list, columns = col2)

bc_data3 = bc_data2.fillna("")

bc_data4 = bc_data3.loc[:, ['poi_nm', 'y', 'x']]

key_gps = []

for i in range(len(bc_data4)): name = bc_data4['poi_nm'][i] y = round(bc_data4['y'][i],8) x = round(bc_data4['x'][i],8) key = name+ " " + str(y) + " " + str(x) key_gps.append(key)

bc_data3['old_add'] = bc_data3['sido_nm'].str.cat(bc_data3['sgg_nm'], sep=" ")

bc_data3['new_add'] = bc_data3['old_add'].str.cat(bc_data3['rd_nm'], sep=" ")

bc_data3['new_add'] = bc_data3['new_add'].str.cat(bc_data3['bld_num'], sep=" ")

bc_data3['old_add'] = bc_data3['old_add'].str.cat(bc_data3['bemd_nm'], sep=" ")

bc_data3['old_add'] = bc_data3['old_add'].str.cat(bc_data3['ri_nm'], sep=" ")

bc_data3['old_add'] = bc_data3['old_add'].str.cat(bc_data3['beonji'], sep=" ")

bc_data3['key_gps'] = key_gps

bc_data3

dup=bc_data3[bc_data3.duplicated(['key_gps'], keep='first')]

dup

dup2=bc_data3[bc_data3.duplicated(['key_gps'], keep='last')] dup2

bc_data3=bc_data3.drop_duplicates('key_gps')

bc_df = bc_data3.loc[:, ['구분','poi_nm','old_add','new_add','y','x','mcate_nm','key_gps']]

bc_df

bc_df.loc[bc_df.poi_nm== bc_df.mcate_nm, 'mcate_nm'] = np.nan

bc_df.insert(6, "영업시간", np.nan) bc_df.insert(7, "휴무일", np.nan) bc_df.insert(8, "주차가능여부", np.nan) bc_df.insert(9, "평균소요시간", np.nan) bc_df.insert(11, "개요", np.nan) bc_df.insert(12, "key_add", np.nan) bc_df.insert(14, "m_check", np.nan)

col_name = ['구분', '지점명', '구주소', '신주소', '좌표(lat)', '좌표(lon)', '영업시간', '휴무일', '주차가능여부(1-true, 0-false)', '평균소요시간(분)', '태그', '개요', 'key_add', 'key_gps', 'm_check']

bc_df.columns = ['구분', '지점명', '구주소', '신주소', '좌표(lat)', '좌표(lon)', '영업시간', '휴무일', '주차가능여부(1-true, 0-false)', '평균소요시간(분)', '태그', '개요', 'key_add', 'key_gps', 'm_check']

pp_bc = pd.concat([pp_raw, bc_df])

pp_bc

api_m.insert(0,"구분", "TourAPI") api_m.insert(3,"구주소", np.nan) api_m.insert(10,"소요시간", np.nan)

api_m.columns = ['구분', '지점명', '태그', '구주소', '신주소', '좌표(lat)', '좌표(lon)', '영업시간', '휴무일', '주차가능여부(1-true, 0-false)', '평균소요시간(분)', '개요', 'key_add', 'key_gps', 'm_check']

api_m=api_m[['구분','지점명', '구주소', '신주소', '좌표(lat)', '좌표(lon)', '영업시간', '휴무일', '주차가능여부(1-true, 0-false)', '평균소요시간(분)', '태그', '개요', 'key_add', 'key_gps', 'm_check']] # 프리포스트 순서와 동일하게 컬럼 순서 변경

pp_bc_api = pd.concat([pp_bc, api_m])

listall = pp_bc_api.values.tolist()

df_all = pd.DataFrame(listall, columns = col_name)

df_final = df_all.loc[:, ['구분','지점명', '구주소','신주소', '좌표(lat)', '좌표(lon)', '영업시간', '휴무일', '주차가능여부(1-true, 0-false)', '평균소요시간(분)', '태그', '개요']]

df_final

park_map = {label:idx for idx, label in enumerate(np.unique(list(df_final['주차가능여부(1-true, 0-false)'])))} park_map

df_final.insert(9, "park_raw", np.nan)

df_final['park_raw']=df_final['주차가능여부(1-true, 0-false)']

df_final.loc[ df_final['주차가능여부(1-true, 0-false)'] == "", '주차가능여부(1-true, 0-false)'] = np.nan df_final.loc[ df_final['주차가능여부(1-true, 0-false)'] == "가게앞 3대", '주차가능여부(1-true, 0-false)'] = 1 df_final.loc[ df_final['주차가능여부(1-true, 0-false)'] == "가능", '주차가능여부(1-true, 0-false)'] = 1 df_final.loc[ df_final['주차가능여부(1-true, 0-false)'] == "공용주차장", '주차가능여부(1-true, 0-false)'] = 1 df_final.loc[ df_final['주차가능여부(1-true, 0-false)'] == "있음", '주차가능여부(1-true, 0-false)'] = 1 df_final.loc[ df_final['주차가능여부(1-true, 0-false)'] == "가능", '주차가능여부(1-true, 0-false)'] = 1 df_final.loc[ df_final['주차가능여부(1-true, 0-false)'] == "있음 (약 소형 150대, 대형10대 주차가능)", '주차가능여부(1-true, 0-false)'] = 1 df_final.loc[ df_final['주차가능여부(1-true, 0-false)'] == "있음(100대)", '주차가능여부(1-true, 0-false)'] = 1 df_final.loc[ df_final['주차가능여부(1-true, 0-false)'] == "주차 가능", '주차가능여부(1-true, 0-false)'] = 1 df_final.loc[ df_final['주차가능여부(1-true, 0-false)'] == "주차가능", '주차가능여부(1-true, 0-false)'] = 1

park_map2 = {label:idx for idx, label in enumerate(np.unique(list(df_final['주차가능여부(1-true, 0-false)'])))} park_map2

df_final.to_excel('D:/ML/prepost/df_final.xlsx', index=False)

dup=pp_bc_api[pp_bc_api.duplicated(['key_gps'], keep='first')] dup2=pp_bc_api[pp_bc_api.duplicated(['key_gps'], keep='last')]

dup

dup2