我正在使用pandas从csv文件中读取一列,并将其中的数据存入数组。但是,许多单元格为空,并以“nan”的形式保存在数组中。我想识别这些空单元格,以便跳过它们或将它们全部从数组中删除。类似于以下的伪代码:
if df.row(column number) == nan
skip
或
if df.row(column number) != nan
do stuff
基本上,我想知道如何确定csv文件中的单元格是否为空?
答案 0 :(得分:0)
最好是在加载后摆脱掉#include <stdlib.h>
#include <vector>
#include <string>
#include <cstring>
#include <iostream>
#include <algorithm>
using namespace std;
//Forward declarations of the helper functions used by this lexer.
//Lexes one input line and prints a line of output per lexeme found.
void lexEme(string str);
//Returns the token code (as a string) for a lexeme.
string getTokenID(string str);
//NOTE(review): no definition of tokenize is visible in this part of the file.
vector <string> tokenize(string str);
//Returns str with occurrences of from replaced by to.
string ReplaceAll(string str, string from, string to);
//True when s is non-empty and made up only of digits.
bool is_number(string s);
//NOTE(review): as defined in this file, only the first character of s is examined.
bool isalphanum(string s);
//Tests a single character against the operator symbols.
bool isOperator(char str);
//Splits str on token and post-processes the pieces (see the definition).
vector<string> split(string str, string token);
//Splits str on token without any post-processing of the pieces.
vector<string> simpleSplit(string str, string token);
//Checks whether str looks like a decimal or float literal.
static bool is_decimal(string str);
//Combines the strings in x into one string (see the definition).
string merge(vector<string> x);
//Return a lower-cased / upper-cased copy of str.
string toLower(string str);
string toUpper(string str);
//Tests whether the character token occurs in str.
bool contain(string str , char token);
//NOTE(review): no definition of betweenQuotes is visible in this part of the file.
vector<string> betweenQuotes(string str);
//Store alphabet we are matching
//Program entry point: prompts, collects input lines from stdin until the first
//empty line (or end of input), then runs the lexical analysis on each
//collected line in the order it was read.
int main(){
    cout<< "Enter string" << endl;

    vector<string> lines;
    for (string line; getline(cin , line) && !line.empty(); ) {
        lines.push_back(line);
    }

    for (const string& line : lines) {
        lexEme(line);
    }

    cout << "done" << endl;
    return 0;
}
//Prints the report line for a single lexeme: its text, its length and the
//token code returned by getTokenID.
static void printLexeme(const string& lexeme){
    cout << "lexeme: |" + lexeme + "| length:" + to_string(lexeme.size()) + " token: "
         << getTokenID(lexeme) << endl;
}

//This handles the lexical analysis of one input line.
//
//str is first padded so that '(' and ')' are surrounded by spaces and any
//newline becomes a space. The line is then handled in one of two ways:
//  * no space in the line ("base" case): the line is a single token. A token
//    that starts with a digit is separated into its non-alphabetic part and
//    its alphabetic part (e.g. 123abc -> 123, abc); any other token is
//    reported as is.
//  * otherwise the line is broken into tokens: on double quotes first when the
//    line contains one, or on single spaces when it does not.
//Every lexeme is reported on stdout through printLexeme. Empty input and
//empty/blank tokens produce no output.
void lexEme(string str){
    //We handle some preparsing to avoid potential errors when reading lines with ( and )
    str = ReplaceAll(str, "(", " ( ");
    str = ReplaceAll(str, ")", " ) ");
    str = ReplaceAll(str , "\n" , " ");
    if (str.empty()) {
        return; //nothing to analyse; also keeps str[0] below meaningful
    }

    const bool hasStrings = str.find('"') != string::npos;
    const bool base = str.find(' ') == string::npos;

    if (base) {
        if (isalphanum(str) && isdigit(static_cast<unsigned char>(str[0]))) {
            //Digit-led token: letters go to one lexeme, everything else to another.
            string letters;
            string rest;
            for (char ch : str) {
                if (isalpha(static_cast<unsigned char>(ch))) {
                    letters += ch;
                } else {
                    rest += ch;
                }
            }
            printLexeme(rest);
            //A pure number has no alphabetic part; do not report an empty lexeme
            //(and do not report the number a second time).
            if (!letters.empty()) {
                printLexeme(letters);
            }
        } else {
            //Covers identifiers/keywords and single tokens such as ';' or '+'.
            printLexeme(str);
        }
        return;
    }

    //We store our broken up string in here
    vector<string> tokens;
    if (hasStrings) {
        //Even-indexed pieces are treated as code outside quotes and are split
        //further on spaces; odd-indexed pieces are treated as quoted text.
        //NOTE(review): split() discards empty pieces, so this even/odd
        //assumption can be off when a line starts with a quote - confirm.
        const vector<string> pieces = split(str, "\"");
        for (size_t i = 0; i < pieces.size(); ++i) {
            if (i % 2 == 0) {
                const vector<string> words = split(pieces[i], " ");
                tokens.insert(tokens.end(), words.begin(), words.end());
            } else {
                tokens.push_back(ReplaceAll(pieces[i], "?", "\""));
            }
        }
    } else {
        tokens = simpleSplit(str , " ");
    }

    //Here we iterate and print out our results
    for (const string& token : tokens) {
        if (token.empty() || token == " ") {
            continue; //produced by consecutive or trailing separators
        }
        printLexeme(token);
    }
}
//Reports whether the character token occurs anywhere in str.
//Returns true when at least one character of str equals token and false
//otherwise (including when str is empty).
bool contain(string str , char token){
    return str.find(token) != string::npos;
}
//Splits str at every occurrence of token and returns the pieces in order.
//Pieces are returned untouched: consecutive separators yield empty strings,
//and a separator at the very end of str yields one trailing empty string.
//An empty str yields an empty vector. An empty token cannot be searched for,
//so str is then returned as a single piece (nothing when str is empty too).
vector<string> simpleSplit(string str, string token) {
    vector<string> result;
    if (token.empty()) {
        if (!str.empty()) {
            result.push_back(str);
        }
        return result;
    }

    size_t start = 0;
    while (start < str.size()) {
        const size_t hit = str.find(token, start);
        if (hit == string::npos) {
            result.push_back(str.substr(start)); //last piece, no separator left
            break;
        }
        result.push_back(str.substr(start, hit - start));
        start = hit + token.size();
        if (start == str.size()) {
            result.push_back(""); //str ended with the separator
        }
    }
    return result;
}
//Splits str on token and then post-processes the pieces.
//Step 1 cuts str at every occurrence of token (a trailing separator adds an
//empty piece). Step 2 walks the pieces, skipping empty and single-space ones,
//and appends extra entries for some of them before appending the piece itself.
//Step 3 copies the surviving entries into the vector that is returned.
vector<string> split(string str, string token){
vector<string>result;
vector<string> finalResults;
//Step 1: raw split on token
while(str.size()){
int index = str.find(token);
if(index!=string::npos){
result.push_back(str.substr(0,index));
str = str.substr(index+token.size());
//str ended with the separator: record the empty remainder
if(str.size()==0)result.push_back(str);
}else{
result.push_back(str);
str = "";
}
}
//Step 2: post-process each non-empty, non-blank piece
for (int i = 0; i < result.size();i++){
if (!result[i].empty() && result[i] != " " && result[i].length() > 0){
//Digit-led pieces like 123abc: alphabetic characters are collected in num,
//all other characters in t, and both are appended (t first).
if(isalphanum(result[i]) && isdigit(result[i][0])){
int iterate = 0;
string num;
string t;
while (iterate < result[i].length()) {
if (isalpha(result[i][iterate])) {
num += result[i][iterate];
} else {
t += result[i][iterate];
}
iterate++;
}
finalResults.push_back(t);
finalResults.push_back(num);
}else if(isalphanum(result[i])){
//One entry is appended per character: the character itself when it is
//a non-alphanumeric operator, otherwise an empty string. tmp is reset on
//every iteration, so the length check below always passes.
for(int i2 = 0; i2 < result[i].length(); i2++){
string tmp ="";
if (!isalnum(result[i][i2]) && isOperator(result[i][i2]))
if(tmp.length() < 2) {
tmp += result[i][i2];
}
finalResults.push_back(tmp);
tmp = "";
}
}else if(i != result.size() - 1){
//NOTE(review): the index used here equals length(), which for std::string
//reads the terminating null character, so this comparison with '"' looks
//like it can never succeed; the last character was presumably intended.
if(result[i + 1][result[i + 1].length()] == '"'){
result[i] = result[i] + " " + result[i + 1];
result[i + 1] = " ";
finalResults.push_back(result[i]);
i++;
}
}
//The piece itself is appended after whichever branch ran above, so a
//digit-led piece appears both as its parts and as a whole.
finalResults.push_back(result[i]);
}
}
//Step 3: keep the non-empty entries.
//NOTE(review): the second test compares the length with the character code
//of ' ' (32), so entries that are exactly 32 characters long are dropped;
//a comparison of the entry with " " was presumably intended.
vector<string> reclean;
for(int i = 0; i < finalResults.size(); i++){
if (finalResults[i].length() > 0 && finalResults[i].length() != ' '){
//NOTE(review): ReplaceAll returns the new string and this result is
//discarded, so the entry is stored unchanged.
ReplaceAll(finalResults[i] , "?" , "");
reclean.push_back(finalResults[i]);
}
}
return reclean;
}
//This function handles encoding tokens.
//Returns the token code of the lexeme str as a string:
//  * an exact match with a known keyword, datatype, punctuation mark,
//    operator or abstraction name returns the code listed in the table;
//  * a lexeme wrapped in double quotes returns "4003";
//  * an all-digit lexeme returns "4001", a decimal/float lexeme "4002";
//  * a lexeme with no lower-case letters returns "6000" when it has no
//    letters at all (and does not start with EOF), otherwise "5001";
//  * anything else returns "4000".
//NOTE(review): "4003" is both the table code for "End of file" and the code
//returned for quoted strings - confirm the intended numbering.
string getTokenID(string str){
    //Each entry pairs the text of a lexeme with its encoding.
    struct Entry {
        const char* lexeme;
        const char* code;
    };
    static const Entry kKnownLexemes[] = {
        //keywords
        {"if", "1001"}, {"else", "1002"}, {"for", "1003"}, {"while", "1004"},
        {"print", "1005"}, {"return", "1006"}, {"continue", "1007"}, {"break", "1008"},
        {"debug", "1009"}, {"read", "1010"}, {"let", "1011"},
        //datatypes
        {"int", "1100"}, {"float", "1101"}, {"string", "1102"},
        //punctuations
        {";", "2000"}, {"(", "2001"}, {")", "2002"}, {"[", "2003"},
        {"]", "2004"}, {"{", "2005"}, {"}", "2006"}, {",", "2007"},
        //operators
        {"+", "3000"}, {"-", "3001"}, {"*", "3002"}, {"/", "3003"},
        {":=", "3004"}, {"==", "3005"}, {"<", "3006"}, {">", "3007"},
        {"<>", "3008"}, {"and", "3009"}, {"or", "3010"}, {"not", "3011"},
        {"length", "3012"},
        //abstractions
        {"identifier", "4000"}, {"integer literal", "4001"},
        {"floating-point literal", "4002"}, {"End of file", "4003"},
        {"Unknown lexeme", "5000"},
    };

    //Now we run through and determine where our cases match.
    for (const Entry& entry : kKnownLexemes) {
        if (str == entry.lexeme) {
            return entry.code;
        }
    }

    //Special conditions for strings, decimals and integers are handled below
    //The closing quote is the LAST character; indexing at length() would read
    //the terminating null character instead and never match.
    if (str.size() >= 2 && str.front() == '"' && str.back() == '"') {
        return "4003";
    }
    if (is_number(str)) {
        return "4001";
    }
    if (is_decimal(str)) {
        return "4002";
    }
    if (str == toUpper(str)) {
        //No lower-case letters. If lower-casing changes nothing either, the
        //lexeme contains no letters at all.
        if (str == toLower(str) && str[0] != EOF) {
            return "6000";
        }
        return "5001";
    }
    return "4000";
}
//Returns a copy of str in which every character has been passed through
//::tolower.
string toLower(string str)
{
    for (char& ch : str) {
        ch = static_cast<char>(::tolower(ch));
    }
    return str;
}
//Reports whether str starts with an alphanumeric character.
//Only the first character is examined; an empty str returns false.
//NOTE(review): the name suggests an all-characters check, but callers in this
//file depend on the first-character behaviour (they look for operator
//characters inside strings accepted here), so that behaviour is kept.
bool isalphanum(string str){
    if (str.empty()) {
        return false;
    }
    //isalnum requires a value representable as unsigned char
    return isalnum(static_cast<unsigned char>(str[0])) != 0;
}
//Returns a copy of str in which every character has been passed through
//::toupper.
string toUpper(string str)
{
    for (char& ch : str) {
        ch = static_cast<char>(::toupper(ch));
    }
    return str;
}
//Returns true when s is non-empty and every character of s is a digit;
//a sign or decimal point makes the result false.
bool is_number(string s)
{
    if (s.empty()) {
        return false;
    }
    for (char ch : s) {
        if (!std::isdigit(ch)) {
            return false;
        }
    }
    return true;
}
//Combines all strings in x, in order, and returns them as one string.
//Returns an empty string when x is empty.
string merge(vector<string> x){
    string combined;
    for (const string& piece : x) {
        combined += piece;
    }
    return combined;
}
//Checks to see if str is a decimal or float.
//Accepted form: an optional leading '+' or '-', then digits with at most one
//'.', optionally followed by a final 'f' that is only allowed once a '.' has
//been seen. At least one digit is required, so "", "-", "." and ".f" are
//rejected. Plain digit strings such as "12" are accepted.
static bool is_decimal(string str){
    size_t pos = 0;
    if (!str.empty() && (str[0] == '-' || str[0] == '+')) {
        ++pos; //skip the sign
    }

    bool seenPoint = false;
    bool seenDigit = false;
    for (; pos < str.size(); ++pos) {
        const char ch = str[pos];
        if (ch == '.') {
            if (seenPoint) {
                return false; //second decimal point
            }
            seenPoint = true;
        } else if (isdigit(static_cast<unsigned char>(ch))) {
            seenDigit = true;
        } else if (ch == 'f' && pos + 1 == str.size() && seenPoint) {
            //float suffix in final position: accepted, nothing to record
        } else {
            return false;
        }
    }
    return seenDigit;
}
//Returns a copy of str in which every non-overlapping occurrence of from has
//been replaced by to, scanning left to right. Replacement text is never
//rescanned. An empty from matches nothing, so str is returned unchanged.
string ReplaceAll(string str, string from, string to) {
    if (from.empty()) {
        return str; //an empty pattern would match at every position forever
    }
    size_t start_pos = 0;
    while((start_pos = str.find(from, start_pos)) != std::string::npos) {
        str.replace(start_pos, from.length(), to);
        start_pos += to.length(); // Skip the inserted text so a 'to' that contains 'from' is not replaced again
    }
    return str;
}
//Reports whether the character str is one of the operator symbols
//+ - / * % ^ > <. Returns false for every other character.
bool isOperator(char str){
    switch (str) {
        case '+':
        case '-':
        case '/':
        case '*':
        case '%':
        case '^':
        case '>':
        case '<':
            return true;
        default:
            return false;
    }
}
//Scans str from left to right and returns the operator lexemes it contains,
//in order of appearance. An operator character that is immediately followed
//by another operator character other than '<' or '>' is returned together
//with it as a two-character lexeme; any other operator character is returned
//on its own. Non-operator characters are skipped. Returns an empty vector
//when str contains no operator characters.
//NOTE(review): returning single operators on their own is an interpretation;
//the earlier version of this function never returned, so confirm the
//intended pairing rules.
vector<string> operatorExtractor(string str){
    vector<string> ans;
    size_t index = 0;
    while (index < str.length()) {
        if (!isOperator(str[index])) {
            ++index;
            continue;
        }
        string lexeme(1, str[index]);
        const size_t next = index + 1;
        if (next < str.length() && isOperator(str[next])
            && str[next] != '<' && str[next] != '>') {
            lexeme += str[next];
            ++index; //the second character has been consumed as well
        }
        ans.push_back(lexeme);
        ++index;
    }
    return ans;
}
NaN 行,方法是建立索引:
df = df[df['column_to_check'].notnull()]
例如,要摆脱以下数据框中第3列中的 NaN 值:
答案 1 :(得分:0)
pd.isnull() 和 pd.notnull() 是检查单个null值的标准方法,适用于您按照上面代码中的建议逐行迭代DataFrame并按列索引的情况。然后,您可以根据该表达式的结果对该值进行任何操作。
示例:
import pandas as pd
import numpy as np
a = np.nan
pd.isnull(a)
Out[4]: True
pd.notnull(a)
Out[5]: False
如果要处理DataFrame中的所有(或某些)NaN值:在处理表格数据时,缺失数据的处理是一个很大的主题,并且有许多方法可以做到这一点。我建议阅读 this book 中的第7章。以下是该章的内容:
第一部分与您的问题最相关。
答案 2 :(得分:0)
如果您只想排除缺失的值,则可以使用pd.DataFrame.dropna()
下面是一个基于@sacul描述的示例:
>>> import pandas as pd
>>> df
0 1 2 3 4
0 0.0 1.0 NaN 1.0 1.0
1 1.0 NaN 1.0 1.0 1.0
2 NaN NaN NaN NaN NaN
3 NaN 1.0 1.0 NaN NaN
4 1.0 NaN NaN 1.0 1.0
>>> df.dropna(axis=0, subset=['3'])
0 1 2 3 4
0 0.0 1.0 NaN 1.0 1.0
1 1.0 NaN 1.0 1.0 1.0
4 1.0 NaN NaN 1.0 1.0
axis=0 表示排除包含NaN的行。subset=['3'] 表示仅考虑列“3”。有关详细信息,请参见上面的链接。