问题描述:

I'm using the following code:

import matplotlib.pyplot as pyplot

import pandas as pandas

from datetime import datetime

# Read the whole log; the first row of the CSV holds the column names.
dataset = pandas.read_csv("HugLog_17.01.11.csv", sep=",", header=0)

print('filter data for SrcAddr')
# Keep only the rows whose SrcAddr column equals '0x1FD3'.
dataset_filtered = dataset.loc[dataset['SrcAddr'] == '0x1FD3']

print('get Values')
millivolts = dataset_filtered['Battery_Millivolt'].values
raw_timestamps = dataset_filtered['Timestamp'].values

print('Convert the date-strings in date-objects.')
dates_list = []
for stamp in raw_timestamps:
    dates_list.append(datetime.strptime(stamp, '%y-%m-%d %H:%M:%S'))

# Single axes holding one bar per row of the filtered data.
fig = pyplot.figure()
ax1 = fig.add_subplot(111)
ax1.set_xlabel('Time')
ax1.set_ylabel('Millivolt')
ax1.bar(dates_list, millivolts)

pyplot.locator_params(axis='x', nbins=10)
pyplot.show()

The problem I have is that it's a large data collection with 180k data points.

pyplot displays all of the points on the graph, which makes it slow and causes the bars to overlap. Is there a way to set a maximum limit on how many data points are displayed in one "view"?

What I mean by that is: as soon as the graph is rendered there should be only 50 data points, and when I zoom in I should again get a maximum of 50 data points.

网友答案:

Resampling can be done with the resample function from pandas.

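Note that the resample syntax changed between versions 0.17 and 0.19 of pandas. The example below uses the old style, resample(t, how="mean"); in newer versions the how keyword was deprecated and later removed, so write resample(t).mean() instead. See e.g. this tutorial for the new style.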

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Synthetic demo data: one value per second for a whole day (86400 rows).
times = pd.date_range(start='2017-01-11', periods=86400, freq='1S')
df = pd.DataFrame(index=times)
n_samples = len(df.index)
# Descending sorted integers plus uniform noise scaled to [0, 100).
trend = np.sort(np.random.randint(low=1300, high=1600, size=n_samples))[::-1]
noise = np.random.rand(n_samples) * 100
df['data'] = trend + noise

# Resample to the mean over one hour ("H"); try "T" for minutes as well.
t = "H"
# Bar width is expressed in days: 1/24 for hourly bars,
# try width=1./(24*60) for minutes.
width = 1. / 24
hourly_mean = df.data.resample(t, how="mean")
df_resampled = pd.DataFrame()
df_resampled['data'] = hourly_mean

fig = plt.figure()
ax = fig.add_subplot(111)

# Plotting the original data is too slow:
#ax.bar(df.index, df['data'], width=1./(24*60*60))
ax.bar(df_resampled.index, df_resampled['data'], width=width)
ax.xaxis_date()

plt.show()

Automatically adapting the resampling when zooming does indeed require some manual work. There is a resampling example on the matplotlib event handling page, which does not work out of the box but can be adapted accordingly.

This is what it would look like:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import datetime
import matplotlib.dates

class Sampler(object):
    """Re-bin a time-indexed DataFrame to suit the visible x-range of a plot.

    The frame passed in must have a datetime index and a ``data`` column.
    The wider the requested range, the coarser the resampling rule, so the
    number of bars drawn stays manageable at every zoom level.
    """

    # (minimum visible span, resample rule, bar width in days), coarsest
    # first. Lower-case offset aliases are used because the upper-case
    # "H"/"T"/"S" spellings are deprecated in recent pandas releases.
    # See http://pandas.pydata.org/pandas-docs/stable/timeseries.html#offset-aliases
    _LEVELS = (
        (datetime.timedelta(hours=5), "h", 1. / 24),
        (datetime.timedelta(minutes=60), "15min", 15. / (24. * 60)),
        (datetime.timedelta(minutes=5), "min", 1. / (24. * 60)),
        (datetime.timedelta(seconds=60), "15s", 15. / (24. * 60 * 60)),
    )
    # Used when the visible span is 60 seconds or less.
    _FINEST = ("s", 1. / (24. * 60 * 60))

    def __init__(self, df):
        """Store the full-resolution frame that every resample starts from."""
        self.df = df

    def _to_index_time(self, value):
        """Return *value* in a form comparable with the frame's index.

        Numeric limits are taken to be matplotlib date numbers and are
        converted to datetimes; anything else is returned unchanged.
        """
        if not isinstance(value, (int, float, np.number)):
            return value
        moment = matplotlib.dates.num2date(value, tz=matplotlib.dates.UTC)
        # num2date returns a tz-aware datetime, which cannot be compared
        # with a tz-naive index, so drop the zone in that case.
        if getattr(self.df.index, "tz", None) is None:
            moment = moment.replace(tzinfo=None)
        return moment

    def resample(self, limits):
        """Resample ``self.df.data`` and clip it to *limits*.

        Parameters:
            limits: two-element sequence ``[lower, upper]``, either
                timestamps or matplotlib date numbers (floats, in days).

        Returns:
            ``(index, data, width)``: the index and mean values of the bins
            lying inside *limits* (inclusive), and the bar width in days
            that matches the chosen bin size.
        """
        print(limits)
        lower, upper = limits[0], limits[1]
        span = upper - lower
        # pd.Timedelta subclasses datetime.timedelta; any other result is a
        # plain number of days (difference of two date numbers).
        if not isinstance(span, datetime.timedelta):
            span = datetime.timedelta(days=span)
        print(span)

        rule, width = self._FINEST
        for threshold, level_rule, level_width in self._LEVELS:
            if span > threshold:
                rule, width = level_rule, level_width
                break

        # The `how="mean"` keyword was removed from pandas; chain .mean().
        self.resampled = pd.DataFrame(
            {'data': self.df.data.resample(rule).mean()})
        print("%s %s" % (rule, len(self.resampled['data'])))
        if len(self.resampled.index):
            print("indextype %s" % (type(self.resampled.index[0]),))
        print("limitstype %s" % (type(upper),))

        lower = self._to_index_time(lower)
        upper = self._to_index_time(upper)
        index = self.resampled.index
        inside = (index >= lower) & (index <= upper)
        self.resampled = self.resampled.loc[inside]
        return self.resampled.index, self.resampled['data'], width

    def update(self, ax):
        """Redraw *ax* with data resampled for its current x-limits.

        Intended as an ``xlim_changed`` callback: it reads the current view
        limits, clears the axes, draws the re-binned bars and restores the
        limits.
        """
        print("update")
        start, stop = ax.viewLim.intervalx
        ax.clear()
        x, y, width = self.resample([start, stop])
        ax.bar(x, y, width=width)
        ax.set_xlim([start, stop])
        # Re-register after clear(); presumably clearing the axes drops the
        # previously connected callbacks (confirm against matplotlib docs).
        ax.callbacks.connect('xlim_changed', self.update)
        ax.figure.canvas.draw()




# Synthetic demo data: one value per second for a whole day (86400 rows).
times = pd.date_range(start='2017-01-11', periods=86400, freq='1S')
df = pd.DataFrame(index=times)
n_samples = len(df.index)
# Descending sorted integers plus uniform noise scaled to [0, 500).
trend = np.sort(np.random.randint(low=1300, high=1600, size=n_samples))[::-1]
noise = np.random.rand(n_samples) * 500
df['data'] = trend + noise

# Initial view: resample for the full extent of the data.
sampler = Sampler(df)
full_range = [df.index[0], df.index[-1]]
x, y, width = sampler.resample(full_range)

fig = plt.figure()
ax = fig.add_subplot(111)
ax.bar(x, y, width=width)
ax.xaxis_date()

# Re-run the resampling whenever the x-limits change (zoom / pan).
ax.callbacks.connect('xlim_changed', sampler.update)

plt.show()
网友答案:

One thing you can do is plot a random subset of the data by using the sample method on your pandas DataFrame. Use the frac argument to determine the fraction of points you want to use. It ranges from 0 to 1.

After you get your dataset_filtered DataFrame, take a sample of it like this

# Keep a random 0.1% of the filtered rows.
sample_fraction = .001
dataset_filtered_sample = dataset_filtered.sample(frac=sample_fraction)
相关阅读:
Top