Beijing_data_vis_2

Data Process

接下来开始处理数据

Loading Lib

1
2
3
4
5
6
import json
import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

Tourism Attractions data clean

1
# Load the Beijing tourist-attraction (hot spot) table into a DataFrame.
hotSpot = pd.read_csv(
    filepath_or_buffer="bj_hotspots.csv",
    encoding="utf-8",
)

hotSpots

剔除area那列多余的文字以及符号

1
2
3
4
5
6
7
8
9
10
# Function to clean the attraction area label
def Clean_areas(area):
    """Return the last ``·``-separated part of an attraction's area label.

    Labels that contain the separator ``·`` keep only the text after the
    last separator; labels without it are returned unchanged.

    Args:
        area: Raw value from the ``area`` column. Non-string values (for
            example ``NaN`` for an empty cell) are passed through untouched
            instead of raising ``TypeError``.

    Returns:
        The cleaned label, or ``area`` itself when no clean-up applies.
    """
    if not isinstance(area, str):
        return area
    # rpartition() yields the whole string as its last item when "·" is
    # absent, so both cases are covered by one expression.
    return area.rpartition("·")[-1]
1
# Overwrite the "area" column with the result of Clean_areas for every row.
hotSpot.loc[:, "area"] = hotSpot.loc[:, "area"].apply(Clean_areas)

计算各个区域内热门景点数量

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
# Count the attractions in each area. groupby() already returns its groups
# in sorted key order, so sorting the rows beforehand is unnecessary.
num_spots = hotSpot.groupby("area").size()
num_spots

area
东城区 19
丰台区 5
大兴区 1
奥林匹克公园 1
宣武区 1
密云县 7
平谷区 4
延庆县 6
怀柔区 7
房山区 7
昌平区 7
明十三陵 3
朝阳区 7
海淀区 18
石景山区 2
西城区 15
门头沟区 5

将景点区域归并到对应的行政区

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
# rearrange spot's area matching with airbnb dataset
def sort_spots_area(data):
    """Fold three non-district area labels into their parent district.

    The counts stored under '奥林匹克公园', '明十三陵' and '宣武区' are added
    to '朝阳区', '昌平区' and '西城区' respectively, and the original keys
    are removed.

    Args:
        data: Mutable mapping from area name to a count. It must support
            ``in``, ``get``, item assignment and ``del`` (a ``dict`` does).
            NOTE(review): the script presumably passes a pandas Series of
            counts here -- confirm against the caller.

    Returns:
        The same ``data`` object, modified in place.
    """
    parent_district = {
        '奥林匹克公园': '朝阳区',
        '明十三陵': '昌平区',
        '宣武区': '西城区',
    }
    for area, district in parent_district.items():
        if area not in data:
            continue
        # get(..., 0) lets the merge work even when the parent district has
        # no entry of its own yet, instead of raising KeyError.
        data[district] = data.get(district, 0) + data[area]
        del data[area]
    return data

area
东城区 19
丰台区 5
大兴区 1
密云县 7
平谷区 4
延庆县 6
怀柔区 7
房山区 7
昌平区 10
朝阳区 8
海淀区 18
石景山区 2
西城区 16
门头沟区 5

Airbnb Lists data clean

1
2
3
4
5
6
7
8
9
10
11
# Open the listingForVis table
bjList = pd.read_csv("listingForVis.csv", encoding="utf-8", low_memory=False)

# Open the listings table
bj_Lists = pd.read_csv("listings.csv", encoding="utf-8", low_memory=False)

# Take the columns needed from the listings table
ex_col = bj_Lists.loc[:, ["id", "property_type", "number_of_reviews", "review_scores_rating", "reviews_per_month"]]

# Merge the two tables. No `on=` is given, so pandas joins on every column
# name that both frames have in common.
bjListV2 = bjList.merge(ex_col)

listsNaInfo

移除价格异常值数据,控制在百分之2至百分之98之间。

1
2
3
4
5
# Remove outliers: keep only the listings whose price lies strictly between
# the 2nd and the 98th percentile of all prices.
outliers_low = bjListV2["price"].quantile(0.02)
outliers_hi = bjListV2["price"].quantile(0.98)
# Listings without price outliers. .copy() makes the result an independent
# frame, so assigning to its columns later does not raise
# SettingWithCopyWarning.
bjListV2_filtered = bjListV2[
    (bjListV2["price"] < outliers_hi) & (bjListV2["price"] > outliers_low)
].copy()

清理neighbourhood列中的文字格式。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
# Function to clean the neighbourhood
def Clean_names(neighbourhood):
    """Return the part of a neighbourhood label before its first whitespace.

    Args:
        neighbourhood: Raw value from the ``neighbourhood`` column.
            Non-string values (for example ``NaN`` for an empty cell) are
            passed through untouched instead of raising ``TypeError``.

    Returns:
        Everything up to, but not including, the first whitespace
        character; the unchanged value when it contains no whitespace. A
        label that starts with whitespace therefore becomes ``""``.
    """
    if not isinstance(neighbourhood, str):
        return neighbourhood
    # Only the position of the first whitespace character matters, so the
    # pattern is searched once and written as a raw string.
    first_space = re.search(r'\s', neighbourhood)
    if first_space is None:
        return neighbourhood
    return neighbourhood[:first_space.start()]

# Replace each neighbourhood label with its cleaned form. assign() returns a
# new frame instead of writing into the filtered slice, which avoids pandas'
# SettingWithCopyWarning.
bjListV2_filtered = bjListV2_filtered.assign(
    neighbourhood=bjListV2_filtered["neighbourhood"].apply(Clean_names)
)

区域数量统计

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
# count lists by areas
num_by_area = bjListV2_filtered.groupby("neighbourhood")["id"].count()
# Show the stored result rather than running the same aggregation twice.
num_by_area

neighbourhood
东城区 2780
丰台区 1026
大兴区 292
密云县 1010
平谷区 112
延庆县 1195
怀柔区 763
房山区 193
昌平区 425
朝阳区 5620
海淀区 1575
石景山区 175
西城区 1208
通州区 355
门头沟区 67
顺义区 566
Name: id, dtype: int64

区域房型统计

1
# Count the non-null ids for every (neighbourhood, room_type) pair; the
# resulting single column is named "id".
type_ares = (
    bjListV2_filtered
    .groupby(["neighbourhood", "room_type"])["id"]
    .agg([("id", "count")])
)

评论统计,首先清理评论为空的数据

1
2
3
4
5
6
# Keep only the listings that have a review score, and drop the
# neighbourhood_group and minimum_nights columns.
score_List = (
    bjListV2_filtered
    .dropna(subset=["review_scores_rating"])
    .drop(columns=["neighbourhood_group", "minimum_nights"])
)

# Number of listings that remain (non-null ids)
score_List["id"].count()
10098

筛选后共有 10098 条带评分的房源。接下来按区域分组,计算各区域的平均评论数量和平均评分

1
2
# average scores by areas
# The two columns are selected with a list: tuple-style ["a", "b"] selection
# on a groupby object was removed in pandas 2.0 and now raises an error.
mean_score = (
    score_List
    .groupby("neighbourhood")[["review_scores_rating", "number_of_reviews"]]
    .mean()
    .reset_index()
)

把区域热点景点数据与区域评论数据合并

1
2
3
4
5
6
7
8
9
10
11
12
13
# 通州区 and 顺义区 have no hotspot data. They are picked by name instead of
# by the hard-coded row numbers 13 and 15, which are only correct while the
# set of districts stays exactly the same.
no_spot_areas = ["通州区", "顺义区"]
sorted_scores = mean_score.sort_values(by="neighbourhood")
has_no_spots = sorted_scores["neighbourhood"].isin(no_spot_areas)
temp = sorted_scores[has_no_spots]

# Set those two districts aside for now and give the remaining ones a new
# 'num_spot' column holding their number of hotspots.
add_spot = sorted_scores[~has_no_spots].copy()
# NOTE(review): spots_list is not defined in the visible source; presumably
# it holds one hotspot count per remaining district, in the same sorted
# order -- confirm where it is built.
add_spot["num_spot"] = spots_list

# Put the two set-aside districts back
final_spot = pd.concat([add_spot, temp], sort=True)

# Missing values (the districts without hotspot data) become 0
final_spot = final_spot.fillna(0).sort_values(by="neighbourhood")
final_spot

计算区域平均日房租,并加入到区域表格中

1
2
3
4
5
6
7
# mean rent price by area
mean_rent_area = score_List.groupby(["neighbourhood"])["price"].mean().reset_index()
# Show the stored result rather than running the same aggregation twice.
mean_rent_area

# add average rent to each area, matched on the neighbourhood name instead of
# relying on both frames happening to carry the same row labels
final_spot['rent'] = final_spot['neighbourhood'].map(
    mean_rent_area.set_index('neighbourhood')['price']
)
final_spot

读取链家房价数据,并将其加入到区域表格中

1
2
3
4
5
6
7
# Housing price by areas
bjPrice = pd.read_csv("dataset/housing_price_bj.csv", encoding="utf-8")
# Sort by name, discard the rows labelled 9 and 17, then renumber the rows
# from 0 and throw away the old labels.
bjPrice = (
    bjPrice
    .sort_values(by=["name"])
    .drop(index=[9, 17])
    .reset_index()
    .drop(columns='index')
)
# NOTE(review): `neighbourhood` is not defined in the visible source --
# presumably the sorted district names used by the other tables; confirm.
bjPrice["name"] = neighbourhood

# add Housing price to each area (pandas pairs the rows by index label)
final_spot['housingPrice'] = bjPrice['transPrice']

将区域Airbnb数量加入区域表格中

1
2
3
# add Number of lists to each area
# NOTE(review): num_list is not defined in the visible source -- confirm
# where it is built before relying on this column.
final_spot['count'] = num_list['id']
# Give the two averaged columns names that say they are per-area means.
final_spot = final_spot.rename(
    columns={
        'review_scores_rating': 'mean_scores',
        'number_of_reviews': 'mean_reviews',
    }
)

保留一位小数并输出为 csv

1
# Round to one decimal place and write the table to CSV without the row
# index. `index` is documented as a boolean, so False is passed, not None.
final_spot.round(1).to_csv('beijing_area_stat.csv', encoding="utf-8", index=False)

计算相关性

correlation

  • 版权声明: 本博客所有文章除特别声明外,均采用 Apache License 2.0 许可协议。转载请注明出处!
  • © 2020 Ruoyu Wang
  • PV: UV:

请我喝杯咖啡吧~

支付宝
微信