Situation:
I am trying to read a single value from a .hdf5 file.

System:

My code:

//File Path
hid_t H5_hid_RESULTS = H5Fcreate (V_FIn_HDF5_Path.absoluteFilePath().toUtf8().constData(), H5F_ACC_RDONLY, H5P_DEFAULT, H5P_DEFAULT);
//Status (Error Output?)
herr_t status;
//read dataset "heigth"
int32_t heigth[1];
hid_t H5_hid_heigth = H5Dopen1(H5_hid_RESULTS, "heigth");
status = H5Dread(H5_hid_heigth, H5T_NATIVE_INT, H5S_ALL, H5S_ALL, H5P_DEFAULT, heigth);
qDebug() << "heigth" << heigth[0];
status = H5Dclose(H5_hid_heigth);
//Close: file
status = H5Fclose (H5_hid_RESULTS);

Target .hdf5 file (in a viewer):
(screenshot not reproduced here; per the viewer, the dataset holds the value 512 as a 32-bit integer)
Result:
qDebug prints a random number (e.g. 104610208) instead of the expected 512.

Question:

Things I have tried:
- int32_t[1] instead of int[1]: same result.
- int instead of int32_t[1]: D:\...\xyz.cpp:47: Error: C2664: "herr_t H5Dread(hid_t,hid_t,hid_t,hid_t,hid_t,void *)": cannot convert argument 6 from "int" to "void *" (illustrated in the sketch after this list).
- int* instead of int32_t[1]: qDebug prints 0x7fe00000001, which I guess is the address of the value. If I try to print *heigth instead of heigth, it crashes when the function is called.
- "heigth" instead of "/heigth": same result.
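
For reference, the failing variant above comes down to the last H5Dread parameter being a void *. A minimal compile-only sketch of which buffer forms it accepts; the dset handle and variable names are illustrative, not from the actual project:

#include <hdf5.h>
#include <stdint.h>

// Compile-only sketch: valid and invalid buffer arguments for H5Dread.
// 'dset' stands in for an already-opened dataset handle.
void read_variants(hid_t dset)
{
    int32_t buf_array[1];
    int32_t buf_scalar;
    H5Dread(dset, H5T_NATIVE_INT32, H5S_ALL, H5S_ALL, H5P_DEFAULT, buf_array);   // OK: array decays to a pointer
    H5Dread(dset, H5T_NATIVE_INT32, H5S_ALL, H5S_ALL, H5P_DEFAULT, &buf_scalar); // OK: address of a scalar
    // H5Dread(dset, H5T_NATIVE_INT32, H5S_ALL, H5S_ALL, H5P_DEFAULT, buf_scalar); // C2664: "int" is not "void *"
}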

Answer 0 (score: 0)
As your file viewer shows, the datatype of the 512 value is H5T_NATIVE_INT32, not H5T_NATIVE_INT. In other words, you are trying to read a 64-bit integer from what is only a 32-bit integer. This should work for you:
status = H5Dread(H5_hid_heigth, H5T_NATIVE_INT32,
                 H5S_ALL, H5S_ALL, H5P_DEFAULT, heigth);
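
If it is not obvious from the viewer what the file actually stores, the C API can report the dataset's on-disk datatype directly. A minimal sketch, reusing the handle names from the question's code (an assumption, since only fragments of that code are shown):

// Sketch: query the stored datatype to confirm its size and class.
hid_t dtype = H5Dget_type(H5_hid_heigth);   // datatype as stored in the file
size_t size = H5Tget_size(dtype);           // e.g. 4 bytes for a 32-bit integer
H5T_class_t cls = H5Tget_class(dtype);      // e.g. H5T_INTEGER
qDebug() << "size (bytes):" << (int)size << "class:" << (int)cls;
H5Tclose(dtype);                            // release the datatype handle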

Answer 1 (score: 0)
Inspired by this example, I solved the problem with a completely different but very simple approach. It is basically the same as the code in the link, but greatly simplified so that a newcomer to C++ and HDF5 can understand it:
//open file (my path is censored; project from work)
const H5std_string H5_Path_Results("C:/.../results.hdf5");
H5File H5_File_Results(H5_Path_Results, H5F_ACC_RDONLY);
//open set
const H5std_string H5_Nam_Height("height");
DataSet H5_Set_Height = H5_File_Results.openDataSet(H5_Nam_Height);
//read set
int height[1];
H5_Set_Height.read(height, PredType::NATIVE_INT, H5S_ALL, H5S_ALL);
qDebug() << height[0];
//close set
H5_Set_Height.close();
//close file
H5_File_Results.close();
This gives the 512 output I wanted to read.
Of course #include <hdf5.h> and #include <H5Cpp.h> are required, the libraries must be added to the project, and the .hdf5 file must exist.
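
A closing note on the C++ API used above: it reports failures through exceptions rather than return codes, so a try/catch turns a missing file or a misspelled dataset name into a readable message instead of a crash. A minimal sketch under the same assumptions as the code above (censored file path, dataset "height"):

#include <H5Cpp.h>
#include <QDebug>
using namespace H5;

// Sketch: the same read as above, wrapped in the C++ API's exception handling.
int readHeight()
{
    try {
        H5File file("C:/.../results.hdf5", H5F_ACC_RDONLY); // censored path from the answer
        DataSet set = file.openDataSet("height");
        int height = 0;
        set.read(&height, PredType::NATIVE_INT);            // dataspaces default to H5S_ALL
        return height;                                      // file and dataset close via RAII
    } catch (const Exception &e) {
        qDebug() << "HDF5 error:" << e.getCDetailMsg();
        return -1;
    }
}

Because H5File and DataSet close themselves in their destructors, the explicit close() calls above are optional rather than required.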