データサイエンスの課題をやってみた


            
            import pandas as pd # データ分析に用いるライブラリ
            import matplotlib.pyplot as plt # グラフ表示に用いるライブラリ
            # Let pandas account for full-width (East Asian) characters when it
            # aligns printed tables, which reduces column misalignment.
            pd.options.display.unicode.east_asian_width = True
            # Font family used for all text drawn in the plots.
            plt.rcParams.update({'font.family': 'IPAexGothic'})
            # Load the listing data; "utf-8-sig" also accepts a leading BOM.
            data_path = "./data.csv"
            df_data = pd.read_csv(data_path, encoding="utf-8-sig")
            # Debug helpers, kept disabled:
            # print(df_data["合計時間"].describe())
            # print(df_data.groupby(["間取り"]).count())
            listing_count = len(df_data)
            print("物件数", listing_count, "件")
            # --- Overall rent distribution of one-room listings ---------------
            # Keep only the rows whose "間取り" (floor plan) is "ワンルーム".
            is_one_room = df_data["間取り"] == "ワンルーム"
            df_selected_roomtype = df_data.loc[is_one_room]
            # 40 bins over the range 0-20; the visible x-range is then cut to 0-16.
            rent_ax = df_selected_roomtype["家賃"].hist(bins=40, range=(0, 20))
            rent_ax.set_xlabel("ワンルーム家賃(万円)")  # x-axis label
            rent_ax.set_ylabel("件数")  # y-axis label
            rent_ax.set_xlim(0, 16)
            rent_ax.set_title("ワンルーム家賃のヒストグラム")  # plot title
            plt.show()
            # -------------------------------------------------------------
            # --- One-room rent distribution per railway line ------------------
            # Draws one histogram panel per distinct value of the "路線" column,
            # with shared x/y axes so the panels can be compared directly.
            # NOTE: the original author hit an error here and adopted this form
            # on the advice of their instructor (Prof. Yamazaki).
            axes = df_selected_roomtype.loc[:, "家賃"].hist(by=df_selected_roomtype.loc[:, "路線"],
                                                             range=(0, 10), bins=30,
                                                             figsize=(14, 7),
                                                             sharex=True, sharey=True)
            # pandas returns an array of Axes, but may return a bare Axes when
            # there is only a single group; normalise both to a flat sequence.
            axes_flat = axes.ravel() if hasattr(axes, "ravel") else [axes]
            for ax in axes_flat:
                # Pass the flag positionally: the ``b=`` keyword was renamed to
                # ``visible=`` in Matplotlib 3.5 and later removed, so
                # ``grid(b=True)`` fails on current releases.
                ax.grid(True)  # show the grid
                ax.set_xlabel("家賃(万円)")  # x-axis label
                ax.set_ylabel("件数")  # y-axis label
                # ax.set_ylim(0, 120)  # optional: fix the y-axis range
            plt.suptitle("路線ごとのワンルーム家賃のヒストグラム")  # figure-level title
            plt.subplots_adjust(top=0.92,  # leave room for the figure title
                                hspace=0.3)  # vertical spacing between panels
            plt.show()
            # ------------------------------------------------------
            # --- Narrow down by desired rent (unit: 10,000 yen) ---------------
            def _ask_rent(prompt):
                """Prompt repeatedly until a finite, non-negative number is entered.

                Whole numbers are returned as ``int`` so they are formatted
                exactly as before (e.g. "5"); decimals such as "5.5", which
                previously crashed ``int()``, are returned as ``float``.
                """
                while True:
                    try:
                        value = float(input(prompt))
                    except ValueError:
                        value = float("nan")
                    # ``nan`` fails this comparison, so it is rejected together
                    # with negative and infinite values.
                    if 0 <= value < float("inf"):
                        return int(value) if value.is_integer() else value
                    print("0以上の数値で入力してください")

            # The upper bound is asked first, then the lower bound.
            rent_upper = _ask_rent("希望家賃の上限は？(万円)")
            rent_lower = _ask_rent("希望家賃の下限は？(万円)")
            if rent_lower > rent_upper:
                # Bounds entered the wrong way round: swap them instead of
                # silently filtering every listing out.
                rent_lower, rent_upper = rent_upper, rent_lower
            # Keep listings whose rent lies within [rent_lower, rent_upper].
            mask = (df_selected_roomtype['家賃'] <= rent_upper) & (df_selected_roomtype['家賃'] >= rent_lower)
            # The filtered listings are stored in filter_df.
            filter_df = df_selected_roomtype[mask]
            print("家賃で絞り込んだ結果", len(filter_df), "件")
            # --- Commute-time distribution of the rent-filtered listings ------
            # 120 bins over the range 0-120, i.e. one bin per unit of "合計時間".
            commute_ax = filter_df["合計時間"].hist(bins=120, range=(0, 120))
            commute_ax.set_xlabel("通学時間(分)")  # x-axis label
            commute_ax.set_ylabel("件数")  # y-axis label
            commute_ax.set_xlim(0, 120)
            # The title shows the rent range that was used for filtering.
            commute_ax.set_title(f"{rent_lower}-{rent_upper}万円のワンルーム：通学時間ヒストグラム")
            plt.show()
            #----------------------------------------------------------
            # --- Narrow down by maximum commute time (minutes) ----------------
            # Re-prompt on invalid input instead of crashing with ValueError.
            commuting_time = None
            while commuting_time is None:
                try:
                    entered_minutes = float(input("希望通学時間上限は?(分)"))
                except ValueError:
                    entered_minutes = float("nan")
                # ``nan`` compares False here, so it is rejected together with
                # negative values; either would silently drop every listing.
                if entered_minutes >= 0:
                    commuting_time = entered_minutes
                else:
                    print("0以上の数値で入力してください")
            mask = (filter_df['合計時間'] <= commuting_time)
            # Further narrow filter_df by total time (filter_df is overwritten).
            filter_df = filter_df[mask]
            print("家賃,通学時間で絞り込んだ結果", len(filter_df), "件")
            # --- Narrow down by whether taking a bus is acceptable ------------
            bus = input('バスは使ってもいいですか？　y or n')
            # Answering 'n' caps the "バス" column at 0; any other answer uses a
            # deliberately unrealistic cap (per the original author's note).
            bus_num = 0 if bus == 'n' else 10000
            # Apply the cap on top of the rent and commute-time filters.
            mask = filter_df['バス'] <= bus_num
            filter_df = filter_df.loc[mask]
            print("家賃,通学時間、バス有無で絞り込んだ結果", len(filter_df), "件")
            # --- Recommend stations and list their listings -------------------
            # Count the matching listings per station and keep the 10 stations
            # with the most matches; these are the "recommended stations".
            filter_top10_df = filter_df.groupby(['駅'])['駅'].count().sort_values(ascending=False).head(10)
            # Restrict the filtered listings to the recommended stations. This
            # does not change between iterations, so it is computed once.
            mask = filter_df['駅'].isin(filter_top10_df.index)
            filter_top10_station_df = filter_df[mask]
            # Print every row and column of the result tables without truncation.
            pd.set_option('display.max_rows', None)
            pd.set_option('display.max_columns', None)
            # Loop-control flag. With no matching listing the grouped histogram
            # below would raise, so report that and skip the loop entirely.
            flag = not filter_top10_station_df.empty
            if not flag:
                print("条件に合う物件が見つかりませんでした")
            while flag:
                print('あなたにおすすめの駅は')
                print(filter_top10_df)
                # Histogram of rent per recommended station -------------------
                # NOTE: the original author hit an error here and adopted this
                # form on the advice of their instructor (Prof. Yamazaki).
                axes = filter_top10_station_df.loc[:, "家賃"].hist(by=filter_top10_station_df.loc[:, "駅"],
                                                                 range=(rent_lower, rent_upper), bins=10,
                                                                 figsize=(10, 8),
                                                                 sharex=True, sharey=True)
                # pandas returns an array of Axes, but may return a bare Axes
                # when only one station is left; normalise both cases.
                axes_flat = axes.ravel() if hasattr(axes, "ravel") else [axes]
                for ax in axes_flat:
                    # Pass the flag positionally: the ``b=`` keyword was renamed
                    # to ``visible=`` in Matplotlib 3.5 and later removed.
                    ax.grid(True)  # show the grid
                    ax.set_xlabel("家賃(万円)")  # x-axis label
                    ax.set_ylabel("件数")  # y-axis label
                plt.suptitle("おすすめ駅：家賃のヒストグラム（家賃制限内）")  # figure-level title
                plt.subplots_adjust(top=0.92,  # leave room for the figure title
                                    hspace=0.3)  # vertical spacing between panels
                plt.show()
                # ------------------------------------------------------
                print("おすすめ駅(10駅)物件数", len(filter_top10_station_df))
                station = input('駅名を入力してください')
                # Narrow the recommended-station listings to the chosen station.
                mask = (filter_top10_station_df['駅'] == station)
                filter_selected_station_df = filter_top10_station_df[mask]
                print(station + "の合致物件数", len(filter_selected_station_df), "件")
                # Show station, line, rent, name, total time and URL of each
                # listing at the chosen station, cheapest first.
                print(filter_selected_station_df[["駅", "路線","家賃", "名称", "合計時間", "URL"]].sort_values("家賃").to_string(index=False))
                print("----------------------------------------------------------------")
                # Only an exact 'n' ends the loop; any other answer continues.
                loop_input = input("続けますか？ y or n")
                if loop_input == 'n':
                    flag = False
道用も山崎先生の課題をやってみた

ワンルーム家賃相場感

路線毎に違いはあるか？？

家賃を絞って通学時間を見てみる

通学時間、バスの有無でも物件を絞る→おすすめ駅を抽出

物件の表示

ソースコード

使い方動画