我需要读取一个很长的文件(其中包含以秒为单位的时间戳),并使用numpy或scipy绘制其CDF图。我已经尝试过numpy,但输出似乎不是应有的结果。代码如下,欢迎任何建议。
import numpy as np
import matplotlib.pyplot as plt
# Load the values from the text file into an array.
data = np.loadtxt('Filename.txt')
# Sort the values in ascending order.
sorted_data = np.sort(data)
# NOTE(review): this is a running sum of the sorted *values*, not a running
# fraction of the number of samples, so the curve plotted below is not a CDF;
# this is presumably why the output looks wrong to the asker.
cumulative = np.cumsum(sorted_data)
# Plot the running sum against the array index.
plt.plot(cumulative)
plt.show()
答案 0 :(得分:19)
您有两种选择:
1:您可以先对数据进行分箱(binning)。这可以使用numpy.histogram
函数来完成:
import numpy as np
import matplotlib.pyplot as plt

data = np.loadtxt('Filename.txt')

# Choose how many bins you want here
num_bins = 20

# Use the histogram function to bin the data.
# `density=True` replaces the removed `normed=True` keyword; it returns a
# probability *density* per bin, not a probability.
counts, bin_edges = np.histogram(data, bins=num_bins, density=True)

# Now find the cdf: density * bin width is the probability mass of each bin,
# so the running sum ends at 1 regardless of the bin width.
cdf = np.cumsum(counts * np.diff(bin_edges))

# And finally plot the cdf against the right-hand edge of each bin
plt.plot(bin_edges[1:], cdf)
plt.show()
2:不使用numpy.cumsum
,而是直接以sorted_data
数组为横轴,以数组中小于各元素的项所占的比例为纵轴进行绘制(更多细节请参阅此答案:https://stackoverflow.com/a/11692365/588071):
import numpy as np
import matplotlib.pyplot as plt

data = np.loadtxt('Filename.txt')
sorted_data = np.sort(data)

# Rank of each sorted sample scaled to [0, 1]: i / (n - 1) for i = 0..n-1.
n_samples = len(sorted_data)
fractions = np.arange(n_samples) / float(n_samples - 1)

plt.plot(sorted_data, fractions)
plt.show()
答案 1 :(得分:5)
为了完整起见,您还应该考虑:
您可以使用numpy.histogram
,并设置区间(bin)的边界,使每个区间只收集同一个取值的所有出现次数。
您应该保留density=False
,因为根据文档:
请注意,除非选择单位宽度的区间,否则直方图值的总和不会等于1
您可以将每个bin中的元素数量除以数据的大小,从而对其进行归一化。
import numpy as np
import matplotlib.pyplot as plt
def cdf(data):
    """Plot the empirical CDF of *data* with one point per distinct value.

    Args:
        data: One-dimensional collection of numbers.

    Returns:
        None. The figure is shown with ``plt.show()``.
    """
    n_samples = len(data)

    # One bin per distinct value: the edges are the distinct values plus one
    # extra right-hand edge so the largest value gets a bin of its own.
    distinct = sorted(set(data))
    edges = np.append(distinct, distinct[-1] + 1)

    # Raw counts (density=False), normalised by the sample size by hand.
    hist, edges = np.histogram(data, bins=edges, density=False)
    fractions = hist.astype(float) / n_samples

    # Running sum of the per-value fractions is the CDF.
    cumulative = np.cumsum(fractions)

    # Each point sits on the left edge of its bin, i.e. on the value itself.
    plt.plot(edges[:-1], cumulative, linestyle='--', marker="o", color='b')
    plt.ylim((0, 1))
    plt.ylabel("CDF")
    plt.grid(True)
    plt.show()
例如,使用以下数据:
# Sample input. The list below is presumably the sample printed in ascending
# order; the array built here is the same 14 values, unsorted.
#[ 0. 0. 0.1 0.1 0.2 0.2 0.3 0.3 0.4 0.4 0.6 0.8 1. 1.2]
data = np.concatenate((np.arange(0,0.5,0.1),np.arange(0.6,1.4,0.2),np.arange(0,0.5,0.1)))
# Plot the empirical CDF of the sample.
cdf(data)
你会得到:
您还可以对cdf进行插值以获得连续函数(使用线性插值或三次样条插值):
import numpy as np
import matplotlib.pyplot as plt
from scipy.interpolate import interp1d
def cdf(data):
    """Plot the empirical CDF of *data* with linear and cubic interpolants.

    Args:
        data: One-dimensional collection of numbers. The cubic interpolant
            presumably needs at least four distinct values (see the SciPy
            ``interp1d`` documentation).

    Returns:
        None. The figure is shown with ``plt.show()``.
    """
    data_size = len(data)

    # Set bin edges: one bin per distinct value, plus one extra right-hand
    # edge so the largest value gets a bin of its own.
    data_set = sorted(set(data))
    bins = np.append(data_set, data_set[-1] + 1)

    # Use the histogram function to bin the data, then normalise by hand.
    counts, bin_edges = np.histogram(data, bins=bins, density=False)
    counts = counts.astype(float) / data_size

    # Find the cdf
    x = bin_edges[0:-1]
    y = np.cumsum(counts)

    f = interp1d(x, y)
    f2 = interp1d(x, y, kind='cubic')

    # Evaluate only inside [min(x), max(x)]. Starting the grid at 0 raised
    # ValueError in interp1d whenever every sample was greater than 0, and
    # cut the curve short whenever samples were below 0.
    xnew = np.linspace(min(x), max(x), num=1000, endpoint=True)

    # Plot the cdf
    plt.plot(x, y, 'o', xnew, f(xnew), '-', xnew, f2(xnew), '--')
    plt.legend(['data', 'linear', 'cubic'], loc='best')
    plt.title("Interpolation")
    plt.ylim((0, 1))
    plt.ylabel("CDF")
    plt.grid(True)
    plt.show()
答案 2 :(得分:2)
快速回答,
# Pair each sorted sample with an evenly spaced fraction in [0, 1].
# (The original line was missing its closing parenthesis.)
plt.plot(sorted_data, np.linspace(0, 1, sorted_data.size))
应该得到你想要的东西
答案 3 :(得分:2)
以下是我实施的步骤:
1.对您的数据进行排序
2.计算每个'x'
的累积概率public function pay(){
// Fills in the PayPal form fields on $this->paypal_lib and then calls
// paypal_auto_form() (presumably renders an auto-submitting form - confirm
// against the library).
// NOTE(review): product, user id, item number and amount below are hard-coded
// values - presumably placeholders; confirm before real use.
//Set variables for paypal form
// NOTE(review): the URL segments built here (Success / Cancelled / Notification)
// do not match the handler names visible nearby (success / cancel / ipn) -
// confirm the routing.
$returnURL = site_url(AGENT_ROLE.$this->data['controller']."/Success"); //payment success url
$cancelURL = site_url(AGENT_ROLE.$this->data['controller']."/Cancelled");//payment cancel url
$notifyURL = site_url(AGENT_ROLE.$this->data['controller']."/Notification"); //ipn url
//get particular product data
$product = "test product";
$userID = 1; //current user id
$logo = base_url().'assets/images/logo/logo.png';
// The user id is passed in the 'custom' field; ipn() reads it back from there.
$this->paypal_lib->add_field('return', $returnURL);
$this->paypal_lib->add_field('cancel_return', $cancelURL);
$this->paypal_lib->add_field('notify_url', $notifyURL);
$this->paypal_lib->add_field('item_name', $product);
$this->paypal_lib->add_field('custom', $userID);
$this->paypal_lib->add_field('item_number', 1);
$this->paypal_lib->add_field('amount', 100.00);
$this->paypal_lib->image($logo);
$this->paypal_lib->paypal_auto_form();
}
/**
 * Debug handler for the PayPal "success" return.
 *
 * Dumps the result of a TLS connection attempt to the PayPal sandbox and the
 * raw POST data. Nothing is stored or rendered in a view.
 */
function success(){
    //get the transaction data
    // $paypalInfo = $this->input->get();
    // $paypalInfo2 = $this->input->post();
    // $data['item_number'] = $paypalInfo['item_number'];
    // $data['txn_id'] = $paypalInfo["tx"];
    // $data['payment_amt'] = $paypalInfo["amt"];
    // $data['currency_code'] = $paypalInfo["cc"];
    // $data['status'] = $paypalInfo["st"];
    // $item_name = $_POST['item_name'];
    // $item_number = $_POST['item_number'];
    // $payment_status = $_POST['payment_status'];
    // $payment_amount = $_POST['mc_gross'];
    // $payment_currency = $_POST['mc_currency'];
    // $txn_id = $_POST['txn_id'];
    // $receiver_email = $_POST['receiver_email'];
    // $payer_email = $_POST['payer_email'];
    //pass the transaction data to view

    // fsockopen() expects a bare host with an optional transport prefix
    // (ssl:// or tls://), not an "https://" URL with a trailing slash; the
    // URL form cannot open a socket.
    $socket = fsockopen('ssl://www.sandbox.paypal.com', 443, $errno, $errstr, 30);
    var_dump($socket);
    // Release the connection once it has been reported.
    if (is_resource($socket)) {
        fclose($socket);
    }
    var_dump($_POST);
}
/**
 * Handler for a cancelled payment: prints a plain "Cancelled" message.
 */
function cancel(){
    // $this->load->view('paypal/cancel');
    print "Cancelled";
}
/**
 * PayPal IPN handler.
 *
 * Reads the posted notification, sends it back to PayPal through
 * paypal_lib->curlPost() and dumps "IPN SUCCESS" when the reply contains
 * "VERIFIED" (case-insensitive).
 */
function ipn(){
    //paypal return transaction details array
    $paypalInfo = $this->input->post();

    // Transaction record for the (currently disabled) database insert below.
    $data = array(
        'user_id'        => $paypalInfo['custom'],
        'product_id'     => $paypalInfo["item_number"],
        'txn_id'         => $paypalInfo["txn_id"],
        'payment_gross'  => $paypalInfo["mc_gross"],
        'currency_code'  => $paypalInfo["mc_currency"],
        'payer_email'    => $paypalInfo["payer_email"],
        'payment_status' => $paypalInfo["payment_status"],
    );

    // $paypalURL = 'https://www.sandbox.paypal.com/cgi-bin/webscr';
    $verification = $this->paypal_lib->curlPost($this->paypal_lib->paypal_url, $paypalInfo);

    //check whether the payment is verified
    if (!preg_match("/VERIFIED/i", $verification)) {
        return;
    }

    //insert the transaction data into the database
    // $this->product->insertTransaction($data);
    var_dump("IPN SUCCESS");
}
示例:
import numpy as np
import matplotlib.pyplot as plt
def cdf(data):
    """Return the empirical CDF of *data* as a pair of arrays.

    Args:
        data: One-dimensional sequence of numbers.

    Returns:
        A tuple ``(x, y)`` where ``x`` is *data* sorted in ascending order and
        ``y[i] == (i + 1) / n`` is the cumulative probability at ``x[i]``.
        Both arrays are empty when *data* is empty.
    """
    n = len(data)
    x = np.sort(data)  # sort your data
    # float(n) keeps this a true division on every Python version, so the
    # probabilities cannot be floored to 0.
    y = np.arange(1, n + 1) / float(n)  # calculate cumulative probability
    return x, y
# NOTE(review): `your_data` is not defined in this snippet - presumably a
# placeholder for the caller's own sample array.
x_data, y_data = cdf(your_data)
# Plot the cumulative probabilities against the sorted values.
plt.plot(x_data, y_data)
答案 4 :(得分:1)
如果存在许多重复值(这是因为我们只需要对唯一值进行排序),这里的实现效率会更高一些。它将CDF绘制为阶梯函数,严格来说就是这样。
import sys
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter
def read_data(fp):
    """Parse one number per line from an iterable of text lines.

    Args:
        fp: Iterable of strings, e.g. an open text file or ``sys.stdin``.

    Returns:
        A list of floats in input order.

    Raises:
        ValueError: If a non-blank line is not a valid float.
    """
    # Blank lines (such as a trailing empty line at the end of the file) are
    # skipped; float() already ignores surrounding whitespace and newlines.
    return [float(line) for line in fp if line.strip()]
def main(script, filename=None):
    """Plot the empirical CDF of the numbers in *filename* as a step function.

    Args:
        script: Program name (``sys.argv[0]``); not used.
        filename: Path of a text file with one number per line. When ``None``
            the numbers are read from standard input.
    """
    if filename is None:
        t = read_data(sys.stdin)
    else:
        # The context manager closes the file as soon as it has been read.
        with open(filename) as fp:
            t = read_data(fp)

    # Sort the (value, count) pairs together. Sorting only the keys left the
    # counts in dictionary order, so the cumulative sums did not line up with
    # their x values; dict views also have no .sort() on Python 3.
    pairs = sorted(Counter(t).items())
    xs = [value for value, _ in pairs]
    counts = [count for _, count in pairs]

    # Running count of samples, normalised so the last step reaches 1.
    ys = np.cumsum(counts).astype(float)
    ys /= ys[-1]

    plt.step(xs, ys, where='post', linewidth=3, alpha=0.5)
    plt.xlabel('Values')
    plt.ylabel('CDF')
    plt.show()


if __name__ == '__main__':
    main(*sys.argv)
答案 5 :(得分:0)
import seaborn as sns
import numpy as np
import pandas as pd  # needed for pd.read_csv below; it was never imported
import matplotlib.pyplot as plt

# Read the space-separated file; header=None because it has no header row.
data = pd.read_csv('Filename.txt', sep=" ", header=None)

plt.figure()
# cumulative=True draws the cumulative curve of the kernel density estimate,
# i.e. a smoothed CDF rather than the exact empirical step function.
sns.kdeplot(data, cumulative=True)
plt.show()