如何使用 pandas 识别 CSV 文件中的空单元格

时间:2018-10-01 19:10:42

标签: python pandas file csv file-io

我正在从 csv 文件中获取一列,并使用 pandas 将其中的数据读入数组。但是,许多单元格为空,并以“nan”的形式保存在数组中。我想识别这些空单元格,以便跳过它们或将它们全部从数组中删除。类似于以下的伪代码:

if df.row(column number) == nan
    skip

if df.row(column number) != nan
    do stuff

基本上,我想知道如何确定 csv 文件中的某个单元格是否为空?

3 个答案:

答案 0 :(得分:0)

// Article text that preceded this listing on the same source line: 最好是在加载后摆脱掉
#include <stdlib.h>
#include <cctype>
#include <vector>
#include <string>
#include <cstring>
#include <iostream>
#include <algorithm>

using namespace std;

void lexEme(string str);
string getTokenID(string str);
vector <string> tokenize(string str);
string ReplaceAll(string str, string from, string to);
bool is_number(string s);
bool isalphanum(string s);
bool isOperator(char str);
vector<string> split(string str, string token);
vector<string> simpleSplit(string str, string token);
static bool is_decimal(string str);
string merge(vector<string> x);
string toLower(string str);
string toUpper(string str);
bool contain(string str , char token);
vector<string> betweenQuotes(string str);

/// Reads lines from standard input until EOF or the first empty line, then
/// runs the lexical analysis on every collected line.
int main(){
    cout << "Enter string" << endl;

    vector<string> in;
    string x;
    while (getline(cin, x)) {
        if (x.empty()) {
            break;
        }
        in.push_back(x);
    }

    for (const string& line : in) {
        lexEme(line);
    }

    cout << "done" << endl;
    return 0;
}

/// Prints one lexeme with its length and the token code from getTokenID().
static void printLexeme(const string& lexeme){
    cout << "lexeme: |" + lexeme + "| length:" + to_string(lexeme.size()) + " token: "
         << getTokenID(lexeme) << endl;
}

/// Performs the lexical analysis of one input line and prints every lexeme.
///
/// Parentheses are padded with spaces and newlines are turned into spaces
/// before the line is broken up.
/// - A line without any space is treated as a single lexeme. It is printed
///   only when isalphanum() accepts it; a digit-led lexeme is first passed
///   through split() and its first two pieces are printed.
/// - A line containing a double quote is split on the quotes; even-indexed
///   pieces are split again on spaces, odd-indexed pieces are kept whole.
/// - Any other line is split on single spaces with simpleSplit().
///
/// @param str the raw input line
void lexEme(string str){
    // Pre-parsing: isolate ( and ) so they become tokens of their own.
    str = ReplaceAll(str, "(", " ( ");
    str = ReplaceAll(str, ")", " ) ");
    str = ReplaceAll(str, "\n", " ");
    if (str.empty()) {
        return;  // nothing to analyse; also keeps str[0] below in range
    }

    const bool hasStrings = str.find('"') != string::npos;
    const bool base = str.find(' ') == string::npos;

    if (base) {
        if (!isalphanum(str)) {
            // The original left this case unimplemented: nothing is printed.
            return;
        }
        if (isdigit(static_cast<unsigned char>(str[0]))) {
            // Digit-led lexeme such as 123abc: print the first two pieces that
            // split() produces (bounded, so a short result cannot be overrun).
            const vector<string> parts = split(str, " ");
            for (size_t i = 0; i < parts.size() && i < 2; ++i) {
                printLexeme(parts[i]);
            }
            return;
        }
        printLexeme(str);
        return;
    }

    // We store our broken up string in here.
    vector<string> tokens;
    if (hasStrings) {
        const vector<string> pieces = split(str, "\"");
        for (size_t i = 0; i < pieces.size(); ++i) {
            if (i % 2 == 0) {
                const vector<string> words = split(pieces[i], " ");
                tokens.insert(tokens.end(), words.begin(), words.end());
            } else {
                // NOTE(review): '?' is rewritten to '"' here although the code
                // that inserted '?' markers is disabled - confirm this is wanted.
                tokens.push_back(ReplaceAll(pieces[i], "?", "\""));
            }
        }
    } else {
        tokens = simpleSplit(str, " ");
    }

    // Here we iterate and print out our results.
    for (const string& token : tokens) {
        if (token != " ") {
            printLexeme(token);
        }
    }
}

/// Returns true when `token` occurs anywhere in `str`, false otherwise.
bool contain(string str , char token){
    return str.find(token) != string::npos;
}

/// Splits `str` on every occurrence of `token`, keeping empty pieces.
///
/// A delimiter at the very end yields a trailing empty piece; an empty `str`
/// yields no pieces. An empty `token` cannot split anything, so the whole
/// string is returned as a single piece (the previous code never terminated).
vector<string> simpleSplit(string str, string token) {
    vector<string> pieces;
    if (str.empty()) {
        return pieces;
    }
    if (token.empty()) {
        pieces.push_back(str);
        return pieces;
    }

    size_t start = 0;
    while (true) {
        const size_t hit = str.find(token, start);
        if (hit == string::npos) {
            pieces.push_back(str.substr(start));
            break;
        }
        pieces.push_back(str.substr(start, hit - start));
        start = hit + token.size();
        if (start == str.size()) {
            pieces.push_back("");  // delimiter was the last thing in the string
            break;
        }
    }
    return pieces;
}

/// Splits `str` on `token` and post-processes the pieces.
///
/// Empty pieces and pieces equal to a single space are skipped. For every
/// remaining piece:
/// - if isalphanum() accepts it and it starts with a digit (e.g. 123abc), its
///   non-letters and its letters are emitted as two separate entries;
/// - otherwise, if isalphanum() accepts it, every operator character found in
///   it is emitted as its own entry;
/// and in all cases the piece itself is emitted afterwards. Empty entries are
/// removed from the final result.
///
/// Compared with the previous code, entries are no longer dropped just because
/// they are 32 characters long (a length was compared against ' '), and two
/// statements that could never have an effect were removed.
vector<string> split(string str, string token){
    const vector<string> result = simpleSplit(str, token);

    vector<string> finalResults;
    for (const string& piece : result) {
        if (piece.empty() || piece == " ") {
            continue;
        }

        if (isalphanum(piece) && isdigit(static_cast<unsigned char>(piece[0]))) {
            // Weird cases like 123abc
            string letters;
            string rest;
            for (char c : piece) {
                if (isalpha(static_cast<unsigned char>(c))) {
                    letters += c;
                } else {
                    rest += c;
                }
            }
            finalResults.push_back(rest);
            finalResults.push_back(letters);
        } else if (isalphanum(piece)) {
            for (char c : piece) {
                if (!isalnum(static_cast<unsigned char>(c)) && isOperator(c)) {
                    finalResults.push_back(string(1, c));
                }
            }
        }
        finalResults.push_back(piece);
    }

    vector<string> reclean;
    for (const string& entry : finalResults) {
        if (!entry.empty() && entry != " ") {
            reclean.push_back(entry);
        }
    }
    return reclean;
}
//This function handles encoding tokens.
// Declarations of the helpers used below before their definitions, so this
// group of functions is self-contained.
std::string toLower(std::string str);
std::string toUpper(std::string str);
bool is_number(std::string s);
static bool is_decimal(std::string str);
bool isOperator(char str);

/// Returns the numeric token code (as text) for the lexeme `str`.
///
/// Fixed lexemes are looked up first: keywords 1001-1011, data types
/// 1100-1102, punctuation 2000-2007, operators 3000-3012 and the abstraction
/// names 4000-6000. Anything else is classified in this order:
///   "..." quoted text -> 4003, all digits -> 4001, decimal/float -> 4002,
///   no lowercase letters: no letters at all -> 6000, otherwise -> 5001,
///   everything else -> 4000 (identifier).
///
/// Fixes over the previous code: the closing quote is now read from the last
/// character (it was read one past the end, so 4003 was never returned), and
/// the abstraction names now line up with their six codes.
std::string getTokenID(std::string str){
    struct Entry {
        const char* lexeme;
        const char* code;
    };
    static const Entry kFixedLexemes[] = {
        // keywords
        {"if", "1001"}, {"else", "1002"}, {"for", "1003"}, {"while", "1004"},
        {"print", "1005"}, {"return", "1006"}, {"continue", "1007"}, {"break", "1008"},
        {"debug", "1009"}, {"read", "1010"}, {"let", "1011"},
        // data types
        {"int", "1100"}, {"float", "1101"}, {"string", "1102"},
        // punctuation
        {";", "2000"}, {"(", "2001"}, {")", "2002"}, {"[", "2003"},
        {"]", "2004"}, {"{", "2005"}, {"}", "2006"}, {",", "2007"},
        // operators
        {"+", "3000"}, {"-", "3001"}, {"*", "3002"}, {"/", "3003"}, {":=", "3004"},
        {"==", "3005"}, {"<", "3006"}, {">", "3007"}, {"<>", "3008"}, {"and", "3009"},
        {"or", "3010"}, {"not", "3011"}, {"length", "3012"},
        // abstractions; "string literal" was missing, which shifted the two
        // entries after it onto the wrong codes
        {"identifier", "4000"}, {"integer literal", "4001"},
        {"floating-point literal", "4002"}, {"string literal", "4003"},
        {"End of file", "5000"}, {"Unknown lexeme", "6000"},
    };

    for (const Entry& entry : kFixedLexemes) {
        if (str == entry.lexeme) {
            return entry.code;
        }
    }

    // Special conditions for strings, decimals and integers are handled below.
    if (str.size() >= 2 && str.front() == '"' && str.back() == '"') {
        return "4003";
    }
    if (is_number(str)) {
        return "4001";
    }
    if (is_decimal(str)) {
        return "4002";
    }
    if (str == toUpper(str)) {
        // No lowercase letters; if lowering changes nothing either, the lexeme
        // contains no letters at all.
        if (str == toLower(str) && str[0] != EOF) {
            return "6000";
        }
        return "5001";
    }
    return "4000";
}

/// Returns a copy of `str` with every character converted to lowercase.
std::string toLower(std::string str) {
    // unsigned char: the <cctype> functions are undefined for negative values
    std::transform(str.begin(), str.end(), str.begin(),
                   [](unsigned char c) { return static_cast<char>(::tolower(c)); });
    return str;
}

/// Returns true when the FIRST character of `str` is a letter or a digit.
///
/// Only the first character is inspected, as before; the callers depend on
/// that. An empty string now yields false instead of running off the end of
/// the function without returning.
bool isalphanum(std::string str){
    if (str.empty()) {
        return false;
    }
    return isalnum(static_cast<unsigned char>(str[0])) != 0;
}

/// Returns a copy of `str` with every character converted to uppercase.
std::string toUpper(std::string str) {
    std::transform(str.begin(), str.end(), str.begin(),
                   [](unsigned char c) { return static_cast<char>(::toupper(c)); });
    return str;
}

/// Checks to see if `s` is an integer: non-empty and made only of digits.
bool is_number(std::string s) {
    if (s.empty()) {
        return false;
    }
    for (char c : s) {
        if (!isdigit(static_cast<unsigned char>(c))) {
            return false;
        }
    }
    return true;
}

/// Combines all strings in `x`, in order, and returns them as one string.
/// (The previous code built the result but never returned it.)
std::string merge(std::vector<std::string> x){
    std::string ans;
    for (const std::string& part : x) {
        ans += part;
    }
    return ans;
}

/// Checks to see if `str` is a decimal or float.
///
/// Accepted form: optional leading '+' or '-', digits with at most one '.',
/// and an optional trailing 'f' that is allowed only after a '.' was seen.
/// At least one digit is now required, so ".", "+." and ".f" are rejected.
static bool is_decimal(std::string str){
    std::size_t pos = 0;
    if (!str.empty() && (str[0] == '-' || str[0] == '+')) {
        pos = 1;
    }

    bool seenPoint = false;
    bool seenDigit = false;
    for (; pos < str.size(); ++pos) {
        const char c = str[pos];
        if (c == '.') {
            if (seenPoint) {
                return false;  // second decimal point
            }
            seenPoint = true;
        } else if (isdigit(static_cast<unsigned char>(c))) {
            seenDigit = true;
        } else if (c == 'f' && pos + 1 == str.size() && seenPoint) {
            // float suffix in last position: accepted
        } else {
            return false;
        }
    }
    return seenDigit;
}

/// Returns `str` with every occurrence of `from` replaced by `to`.
///
/// The scan resumes after the inserted text, so a `to` that contains `from`
/// is not replaced again. An empty `from` returns `str` unchanged (the
/// previous code never terminated in that case).
std::string ReplaceAll(std::string str, std::string from, std::string to) {
    if (from.empty()) {
        return str;
    }
    std::size_t start_pos = 0;
    while ((start_pos = str.find(from, start_pos)) != std::string::npos) {
        str.replace(start_pos, from.length(), to);
        start_pos += to.length();
    }
    return str;
}

/// Returns true when `str` is one of the operator characters + - / * % ^ > <.
///
/// The previous loop was bounded by the length of the first table entry (1),
/// so only '+' was ever tested, and no value was returned on a miss.
bool isOperator(char str){
    static const char kOperators[] = {'+', '-', '/', '*', '%', '^', '>', '<'};
    for (char op : kOperators) {
        if (str == op) {
            return true;
        }
    }
    return false;
}

/// Collects the operators that occur in `str`, in order of appearance.
///
/// An operator character directly followed by another operator character
/// other than '<' or '>' is returned as one two-character operator; every
/// other operator character is returned on its own.
/// NOTE(review): the previous code never advanced its index and returned
/// nothing, so this behaviour is reconstructed from its conditions - confirm
/// against the intended grammar.
std::vector<std::string> operatorExtractor(std::string str){
    std::vector<std::string> ans;
    std::size_t index = 0;
    while (index < str.size()) {
        if (!isOperator(str[index])) {
            ++index;
            continue;
        }
        std::string op(1, str[index]);
        ++index;
        if (index < str.size()) {
            const char next = str[index];
            if (isOperator(next) && next != '<' && next != '>') {
                op += next;
                ++index;
            }
        }
        ans.push_back(op);
    }
    return ans;
}
// Article text that followed this listing on the same source line: 行,方法是建立索引:

df = df[df['column_to_check'].notnull()]

例如,要摆脱以下数据框中第3列中的 NaN 值:

NaN

答案 1 :(得分:0)

pd.isnull() 和 pd.notnull() 是检查单个 null 值的标准方法,适用于像您上面的代码那样逐行迭代 DataFrame 并按列索引的情况。然后,您可以根据该表达式的结果对该值进行任何操作。

示例:

import pandas as pd

import numpy as np

a = np.nan

pd.isnull(a)
Out[4]: True

pd.notnull(a)
Out[5]: False

如果要处理 DataFrame 中的所有(或某些)NaN 值:缺失数据的处理是表格数据处理中的一个大主题,有许多方法可以做到。我推荐 this book 中的第7章。以下是该章的目录:

enter image description here

第一部分与您的问题最相关。

答案 2 :(得分:0)

如果您只想排除缺失的值,则可以使用pd.DataFrame.dropna()

下面是一个基于@sacul描述的示例:

>>> import pandas as pd

>>> df

     0    1    2    3    4
0  0.0  1.0  NaN  1.0  1.0
1  1.0  NaN  1.0  1.0  1.0
2  NaN  NaN  NaN  NaN  NaN
3  NaN  1.0  1.0  NaN  NaN
4  1.0  NaN  NaN  1.0  1.0

>>> df.dropna(axis=0, subset=['3'])

     0    1    2    3    4
0  0.0  1.0  NaN  1.0  1.0
1  1.0  NaN  1.0  1.0  1.0
4  1.0  NaN  NaN  1.0  1.0
  • axis=0表示排除了包含NaN的行。
  • subset=['3']表示仅考虑列“ 3”。

有关详细信息,请参见上面的链接。