Beijing_data_vis_2

2020-04-30

Data Process

接下来开始处理数据

Loading Lib

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import re
import json

Tourism Attractions data clean

# Load the tourist-attraction (hot spot) table.
hotSpot = pd.read_csv("bj_hotspots.csv", encoding="utf-8")

# Show the loaded frame (bare expression, notebook style).
hotSpot

剔除area那列多余的文字以及符号

# Function to clean the area column
def Clean_areas(area):
    """Return the part of *area* that follows its last "·" separator.

    Strings without a "·" are returned unchanged. Non-string values
    (for example ``None`` or the float NaN used for missing cells) are
    returned unchanged as well instead of raising ``TypeError``.
    """
    if not isinstance(area, str):
        return area
    # Splitting once from the right isolates the text after the last "·";
    # when there is no separator the only piece is the original string.
    return area.rsplit("·", 1)[-1]

# Replace every "area" value with its cleaned form.
hotSpot.loc[:, "area"] = hotSpot["area"].apply(Clean_areas)

计算各个区域内热门景点数量

# Count the rows of hotSpot that fall under each "area" value.
num_spots = (
    hotSpot
    .sort_values(by="area")
    .groupby("area")
    .size()
)
num_spots

area
东城区       19
丰台区        5
大兴区        1
奥林匹克公园     1
宣武区        1
密云县        7
平谷区        4
延庆县        6
怀柔区        7
房山区        7
昌平区        7
明十三陵       3
朝阳区        7
海淀区       18
石景山区       2
西城区       15
门头沟区       5

覆盖为行政区域

# rearrange spot's area matching with airbnb dataset
def sort_spots_area(data):
    """Fold three non-district area labels into their parent district.

    The counts stored under '奥林匹克公园', '明十三陵' and '宣武区' are added
    to '朝阳区', '昌平区' and '西城区' respectively, and the source keys are
    then removed.

    Args:
        data: Mutable mapping from area name to count that supports ``in``,
            item get/set and ``del`` (for example a ``dict``).

    Returns:
        The same ``data`` object, modified in place.
    """
    merge_into = {
        '奥林匹克公园': '朝阳区',
        '明十三陵': '昌平区',
        '宣武区': '西城区',
    }
    for source, target in merge_into.items():
        if source not in data:
            continue
        count = data[source]
        # A target district that is absent starts from the folded count
        # rather than raising KeyError.
        data[target] = data[target] + count if target in data else count
        del data[source]
    return data

area
东城区     19
丰台区      5
大兴区      1
密云县      7
平谷区      4
延庆县      6
怀柔区      7
房山区      7
昌平区     10
朝阳区      8
海淀区     18
石景山区     2
西城区     16
门头沟区     5

Airbnb Lists data clean

# Load the listingForVis table.
bjList = pd.read_csv("listingForVis.csv", encoding="utf-8", low_memory=False)

# Load the listings table.
bj_Lists = pd.read_csv("listings.csv", encoding="utf-8", low_memory=False)

# Take the columns needed from the listings table.
ex_col = bj_Lists.loc[:, ["id", "property_type", "number_of_reviews", "review_scores_rating", "reviews_per_month"]]

# Merge the two tables. No `on=` is given, so pandas joins on every column
# name the two frames have in common.
bjListV2 = bjList.merge(ex_col)

listsNaInfo

移除价格异常值数据，控制在百分之2至百分之98之间。

# remove outliers: keep prices strictly between the 2nd and 98th percentiles
# (the lower bound is quantile(0.02); 0.2 would cut at the 20th percentile)
outliers_low = bjListV2["price"].quantile(0.02)
outliers_hi = bjListV2["price"].quantile(0.98)
# lists without outliers in price; .copy() gives an independent frame so
# later column assignments do not write into a slice of bjListV2
bjListV2_filtered = bjListV2[
    (bjListV2["price"] < outliers_hi) & (bjListV2["price"] > outliers_low)
].copy()

清理neighbourhood列中的文字格式。

# Function to clean the neighbourhood
def Clean_names(neighbourhood):
    """Return *neighbourhood* cut off at its first whitespace character.

    Everything from the first whitespace character onwards is dropped.
    Strings that contain no whitespace are returned unchanged, and so are
    non-string values (for example ``None`` or the float NaN used for
    missing cells), which would otherwise make ``re.search`` raise
    ``TypeError``.
    """
    if not isinstance(neighbourhood, str):
        return neighbourhood

    # Raw string: "\s" in a plain literal is an invalid escape sequence.
    first_space = re.search(r"\s", neighbourhood)
    if first_space is None:
        # no clean up needed, return the same name
        return neighbourhood

    # keep only the text before the first whitespace character
    return neighbourhood[:first_space.start()]
        
# Overwrite the neighbourhood column with the output of Clean_names for every row.
bjListV2_filtered.loc[:,"neighbourhood"] = bjListV2_filtered["neighbourhood"].apply(Clean_names)

区域数量统计

# Non-null listing ids per neighbourhood; the bare name then displays the result.
listings_per_area = bjListV2_filtered.groupby("neighbourhood")
num_by_area = listings_per_area["id"].count()
num_by_area

neighbourhood
东城区     2780
丰台区     1026
大兴区      292
密云县     1010
平谷区      112
延庆县     1195
怀柔区      763
房山区      193
昌平区      425
朝阳区     5620
海淀区     1575
石景山区     175
西城区     1208
通州区      355
门头沟区      67
顺义区      566
Name: id, dtype: int64

区域房型统计

# Count listing ids for every (neighbourhood, room_type) pair; the result column is named "id".
type_ares = bjListV2_filtered.groupby(["neighbourhood", "room_type"])["id"].agg([("id", "count")])

评论统计，首先清理评论为空的数据

# Keep only rows that have a review score, then drop the
# neighbourhood_group and minimum_nights columns.
score_List = (
    bjListV2_filtered
    .dropna(subset=["review_scores_rating"])
    .drop(columns=["neighbourhood_group", "minimum_nights"])
)

# How many rows (non-null ids) are left.
score_List["id"].count()
10098

总共有10098家，通过区域数量计算出区域平均评论数量和评分

# average scores by areas
# The two columns are selected with a list ([[...]]): selecting several
# columns from a groupby with a bare tuple is not accepted by current pandas.
mean_score = (
    score_List
    .groupby("neighbourhood")[["review_scores_rating", "number_of_reviews"]]
    .mean()
    .reset_index()
)

把区域热点景点数据与区域评论数据合并

# Set aside the rows at positions 13 and 15 of mean_score sorted by neighbourhood.
# NOTE(review): these are meant to be 通州区 & 顺义区, which have no hotspots data; the
# positions are hard-coded, so re-check them if the set of neighbourhoods changes.
temp = mean_score.sort_values(by="neighbourhood").iloc[[13,15],:]

# Drop the rows labelled 13 and 15 (the two areas with no spots info), temporarily,
# then add a new column 'num_spot' with the number of spots.
# NOTE(review): `spots_list` is not defined in the visible code — presumably the output of
# sort_spots_area(...); confirm its length/order (or index) lines up with add_spot.
add_spot = mean_score.sort_values(by="neighbourhood").drop([13,15],axis=0)
add_spot["num_spot"] = spots_list

# Append the two set-aside rows back below the rest.
final_spot = pd.concat([add_spot, temp],sort=True)

# Fill missing values with 0 (the re-added rows have no num_spot) and sort by neighbourhood again.
final_spot = final_spot.fillna(0).sort_values(by="neighbourhood")
final_spot

计算区域平均日房租，并加入到区域表格中

# Average price per neighbourhood, as a frame with columns neighbourhood and price.
price_by_area = score_List.groupby(["neighbourhood"])["price"]
mean_rent_area = price_by_area.mean().reset_index()
mean_rent_area

# Attach the averages as a 'rent' column; the assignment aligns on index labels.
final_spot['rent'] = mean_rent_area['price']
final_spot

读取链家房价数据，并将其加入到区域表格中

# Housing price by areas
bjPrice = pd.read_csv("dataset/housing_price_bj.csv",encoding="utf-8")
# Sort by name, drop the rows labelled 9 and 17, and renumber the index from 0.
# NOTE(review): which areas labels 9 and 17 refer to is not visible here — confirm against the CSV.
bjPrice = bjPrice.sort_values(by=["name"]).drop([9,17],axis=0).reset_index().drop('index',axis=1)
# NOTE(review): `neighbourhood` is not defined in the visible code — presumably the list of
# district names in the same order as the sorted rows; confirm before relying on it.
bjPrice["name"] = neighbourhood

# add Housing price to each area (assignment aligns on index labels)
final_spot['housingPrice'] = bjPrice['transPrice']

将区域Airbnb数量加入区域表格中

1
2
3

# add Number of lists to each area
# NOTE(review): `num_list` is not defined in the visible code — presumably
# num_by_area.reset_index(); confirm where it comes from.
final_spot['count'] = num_list['id']
# Rename the two averaged columns to mean_reviews / mean_scores.
final_spot = final_spot.rename(columns={'number_of_reviews': 'mean_reviews','review_scores_rating':'mean_scores'})

处理百分位输出csv

# Round to one decimal place and write the area table to CSV without the index column.
final_spot.round(1).to_csv('beijing_area_stat.csv', encoding="utf-8", index=False)

计算相关性

correlation

版权声明： 本博客所有文章除特别声明外，均采用 Apache License 2.0 许可协议。转载请注明出处！