比较两个电子表格并使用谷歌应用脚​​本

时间:2017-06-06 13:53:07

标签: google-apps-script google-sheets

好吧,我试图做标题中描述的内容。两个电子表格只有一张是我比较的表格。一个电子表格是另一个的更新,所以我试图只获取新内容。 (如果它是fc(dos命令)之类的函数,这很容易......)

在做了一些搜索之后,我有了下面的脚本,它应该适用于大多数情况,每个工作表使用数组。

function test() {
  var Folder = DriveApp.getFoldersByName('theFolder').next();
  var FolderId =Folder.getId();
  //call old_spreadsheet
  var searchFor ="fullText contains 'sheet_old' and '" + FolderId + "' in parents";  
  var files = DriveApp.searchFiles(searchFor); 
  var old_file = files.next();   
  var old_spreadsheet = SpreadsheetApp.openById(old_file.getId());
  var old_sheet = old_spreadsheet.getSheets()[0];
  var old_sheetname = old_sheet.getName();
  var old_array = old_sheet.getDataRange().getValues();
  Logger.log(old_file.getName() + ' : ' + old_sheetname + ' : ' + old_array.length);
  //call spreadsheet
  var searchFor ="fullText contains 'sheet' and '" + FolderId + "' in parents";  
  var files = DriveApp.searchFiles(searchFor); 
  var file = files.next();   
  var spreadsheet = SpreadsheetApp.openById(file.getId());
  var sheet = spreadsheet.getSheets()[0];
  var sheetname = sheet.getName();
  var array = sheet.getDataRange().getValues();
  Logger.log(file.getName() + ' : ' + sheetname + ' : ' + array.length);  
  var newarray = getNewData(array,old_array);
  Logger.log('there are ' + newarray.length + 'different rows');
}

function getNewData(array1,array2){    
  var diff =array2;   
  for (var i = 0; i<array1.length; i++){
    var duplicate = false;
    for (var j = 0;j<diff.length;j++){
      if (array1[i].join() == diff[j].join()){
        Logger.log('duplicated line found on rows ' + i + ':' + j);
        diff.splice(j,1);
        var duplicate= true;
        break;
      }    
    }
    if (duplicate==false) {
      Logger.log('not duplicated line found on row ' + i);
      diff.push(array1[i]);            
    }
  }
  return diff;
}

问题是文件太大,差不多有30000行,因此脚本超出了5分钟的执行限制。

有没有办法改善这一点,例如,消除内部for循环? 或者有部分方法可以做到这一点?比如前5000行,依此类推。

此致

编辑:在稍微分析电子表格后,我发现每行都有一个ID,所以现在我只能将搜索集中在每个电子表格的一列中。所以这是我的新实现:

function test(){
var Folder = DriveApp.getFoldersByName('theFolder').next();
  var FolderId =Folder.getId();
  //call old_spreadsheet
  var searchFor ="fullText contains 'sheet_old' and '" + FolderId + "' in parents";  
  var files = DriveApp.searchFiles(searchFor); 
  var old_file = files.next();   
  var old_spreadsheet = SpreadsheetApp.openById(old_file.getId());
  var old_sheet = old_spreadsheet.getSheets()[0];
  var old_sheetname = old_sheet.getName();
  var old_array = old_sheet.getDataRange().getValues();
  Logger.log(old_file.getName() + ' : ' + old_sheetname + ' : ' + old_array.length);
  //call spreadsheet
  var searchFor ="fullText contains 'sheet' and '" + FolderId + "' in parents";  
  var files = DriveApp.searchFiles(searchFor); 
  var file = files.next();   
  var spreadsheet = SpreadsheetApp.openById(file.getId());
  var sheet = spreadsheet.getSheets()[0];
  var sheetname = sheet.getName();
  var array = sheet.getDataRange().getValues();
  Logger.log(file.getName() + ' : ' + sheetname + ' : ' + array.length); 
  //The COlumn has an indicator, so i search for that. I don't control the formatting of the files, so i search in both spreadsheet for the indicator
  var searchString = 'NAME';
  for (var i = 0; i < old_array.length; i++) {    
    for (var j = 0; j < old_array[i].length; j++) {    
      if (old_array[i][j] == searchString) {
        var Row_old = i+1;
        var Column_old = j;
        break;
      }      
    }
    if (Row_old != undefined){
      break;
    }
  }
  for (var i = 0; i < array.length; i++) {    
    for (var j = 0; j < array[i].length; j++) {    
      if (array[i][j] == searchString) {
        var Row = i+1;
        var Column = j;
        break;
      }      
    }
    if (Row != undefined){
      break;
    }
  }

  Logger.log(Row_old+':::'+Column_old+'\n'+Row+':::'+Column);  

  var diff_index =[];
  var row_ind = 0;  
  for (var i=Row;i<array.length;i++){        
    Logger.log(i);
    var existe = ArrayLib.indexOf(old_array, Column_old, array[i][Column]);
    if (existe==-1){      
      Logger.log(row_ind+'!!!');
      diff_index[row_ind]=i;
      row_ind++;          
    }
  }
  Logger.log(diff_index);
}

这仍然没有时间......我现在将尝试合并你的评论。

4 个答案:

答案 0 :(得分:1)

你的脚本有一些主要的瓶颈,可以大大减慢它的速度:

  • 每次启动两个循环都会使其运行时爆炸
  • 每次找到重复时拼接都需要移动数组
  • 字符串在每次迭代时连接一个数组

我们可以通过以下方式规避这些问题:

  • 对第二个范围进行排序
  • 我确定通过迭代二进制搜索每一列来做一些聪明的事情,但我们每次都必须求助,以便我们对第一列进行二进制搜索,然后做一个线性搜索。

我们将使用ArrayLib进行排序(我希望它是一种快速排序算法)。

让我们从一个函数开始,找到第一列与值匹配的第一行(当前行的第一列):

function firstRowMatchingCol1(target, lookupRange) {
  var min = 0;
  var max = lookupRange.length - 1;
  var guess;
  var guessVal;

  while(min <= max) {
    guess = (min + max) / 2 | 0;
    guessVal = lookupRange[guess][0];

    if (guessVal < target) {
      min = guess + 1;
    } else if (guessVal > target) {
      max = guess - 1; 
    } else {
      while (guess > 0 && lookupRange[guess - 1][0] === target) {
        guess -= 1; 
      }
      return guess;
    }
  }
  return -1;
}

现在我们可以线性地遍历每一行并检查列是否匹配,直到第一列不再匹配为止。

function matchExists(row, lookupRange) {
  var index = firstRowMatchingCol1(row[0], lookupRange); 
  if (index === -1) {return false;}

  while (index < lookupRange.length && lookupRange[index][0] === row[0]) {
    for (var col = 1; col < row.length; col++) {
      if (row[col] !== lookupRange[index][col]) {break;}
      if (col === row.length - 1) {return true;} // This only works if the ranges are at least two columns wide but if they are one column wide you can just check if index > -1
    }
    index += 1;
  }
  return false;  
}

最后我们可以得到这样的重复:

function getNonDuplicates(r1, r2) {
  r2 = ArrayLib.sort(r2, 0, true);  
  return r1.filter(function(row) {return !matchExists(row, r2);});
}

像mTorres&#39;这是未经测试的代码

答案 1 :(得分:0)

我提出的解决方案是&#34; hack&#34;大约是时间限制。但是如果你想要一个更干净的解决方案,你可以,如果可能的话,通过以某种方式订购数组来重新组织并使代码更高效。

你没有在array1和array2中指定数据,如果行有某种ID字段你可以通过这个ID排序并检查array1上的第i行和array2上的第i行而不是比较array1中的每一行array2中的每一行(30000行效率极低)。

如果您的数据没有用于对行进行排序的ID字段,那么您可以根据我提出的解决方案做些什么:为array1上的每个比较行添加一个轨道。当运行达到时间限制时,你再次运行该函数,但是从最后一个比较行开始(你知道哪个是因为你将跟踪比较的行),当第二个运行超时你重复时,等等。

每当你进行比较时,你会问它是否是第一次运行(或者使用布尔值 - 我更愿意问用户,这样你就不会忘记改变布尔值),如果它和&#39 #39;首次运行时,删除跟踪 如果它不是第一次运行,那么您将从最后一个跟踪行的下一个开始,所以基本上会继续您的脚本结束。我一直在使用这种技术,效果很好。

在代码中(未经测试,因此在使用真实数据运行之前检查一下):

/**
 * Only checks if it's the first run and calls the real work function
 */
function test() {
  var firstRun = "yes" === Browser.msgBox("Question", "Is this the first run?", Browser.Buttons.YES_NO);
  doTest(firstRun);
}

/**
 * Gets the data of the 2 spreadsheets and also the starting
 * row
 */
function doTest(firstRun) {
  var Folder = DriveApp.getFoldersByName('theFolder').next();
  var FolderId = Folder.getId();

  //call old_spreadsheet
  var searchFor ="fullText contains 'sheet_old' and '" + FolderId + "' in parents";  
  var files = DriveApp.searchFiles(searchFor); 
  var old_file = files.next();   
  var old_spreadsheet = SpreadsheetApp.openById(old_file.getId());
  var old_sheet = old_spreadsheet.getSheets()[0];
  var old_sheetname = old_sheet.getName();
  var old_array = old_sheet.getDataRange().getValues();

  /**
   * Here is the code to create the tracking hability
   */
  var strartFromRow = 0; // 0 because row 1 is array 0 index when you getValues();
  var trackSheet = old_spreadsheet.getSheetByName("Tracking");
  if (trackSheet === null) {
    trackSheet = old_spreadsheet.insertSheet("Tracking");
  }

  if (firstRun) {
    trackSheet.getRange("A:A").clearContent();   // make sure there no row is tracked yet
  }
  else {
    // we have to continue from the previous row, keep in mind you're making the comparison 
    // with array which is 0 based, but sheet is 1 based, but you want the next one so getLasRow()
    // should be the first item to compare on your array
    strartFromRow = trackSheet.getLastRow(); 
  }

  Logger.log(old_file.getName() + ' : ' + old_sheetname + ' : ' + old_array.length);

  //call spreadsheet
  var searchFor ="fullText contains 'sheet' and '" + FolderId + "' in parents";  
  var files = DriveApp.searchFiles(searchFor); 
  var file = files.next();   
  var spreadsheet = SpreadsheetApp.openById(file.getId());
  var sheet = spreadsheet.getSheets()[0];
  var sheetname = sheet.getName();
  var array = sheet.getDataRange().getValues();
  Logger.log(file.getName() + ' : ' + sheetname + ' : ' + array.length);  

  // when you call the DIFF function, pass the tracking sheet and the start Row
  var newarray = getNewData(array,old_array, trackSheet, startFromRow);

  Logger.log('there are ' + newarray.length + 'different rows');
}

/**
 * Creates a diff array using array1 and array2
 * It marks each element on array1 once it has checked if it's in array2
 */
function getNewData(array1, array2, trackingSheet, startFromRow){
  var logRow = trackingSheet.getLastRow();
  var diff = array2;   
  for (var i = startFromRow; i < array1.length; i++){
    var duplicate = false;
    for (var j = 0; j < diff.length;j++){
      if (array1[i].join() == diff[j].join()){
        Logger.log('duplicated line found on rows ' + i + ':' + j);
        diff.splice(j,1);
        duplicate = true;
        break;
      }    
    }
    if (duplicate === false) {
      Logger.log('not duplicated line found on row ' + i);
      diff.push(array1[i]);            
    }
    trackingSheet.getRange(logRow++, 1).setValue("Checked!");  // Mark i row as checked
  }
  return diff;
}

答案 2 :(得分:0)

最后,我决定选择缓存服务选项,这是代码,我正在测试它,看看我是否继续使用它。

function getNewData() {
  //deleting triggers
  var triggers = ScriptApp.getProjectTriggers();    
  for (var i = 0; i < triggers.length; i++) {
    if (triggers[i].getHandlerFunction()=='getNewData'){
      ScriptApp.deleteTrigger(triggers[i]);
    }
  }  
  //max running time = 5.5 min
  var MAX_RUNNING_TIME = 330000;
  var startTime= (new Date()).getTime();
  //get cache
  var cache = CacheService.getUserCache();
  var downloaded =JSON.parse(cache.get('downloaded'));  
  var compared =JSON.parse(cache.get('compared'));
  //start
  if (downloaded==1 && compared!=1){
    //folder
    var Folder = DriveApp.getFoldersByName('theFolder').next();
    var FolderId = licitacionesFolder.getId();
    //call old_spreadsheet
    var searchFor ="fullText contains 'sheet_old' and '" + FolderId + "' in parents";  
    var files = DriveApp.searchFiles(searchFor); 
    var old_file = files.next();  
    var old_spreadsheet = SpreadsheetApp.openById(old_file.getId());
    var old_sheet = old_spreadsheet.getSheets()[0];
    var old_array = old_sheet.getDataRange().getValues();
    //call spreadsheet
    var searchFor ="fullText contains 'sheet' and '" + FolderId + "' in parents";  
    var files = DriveApp.searchFiles(searchFor); 
    var file = files.next();  
    var spreadsheet = SpreadsheetApp.openById(old_file.getId());
    var sheet = spreadsheet.getSheets()[0];
    var array = sheet.getDataRange().getValues();
    Logger.log(array.length+'::'+old_array.length); 
    // Column
    var searchString = 'NAME';
    var RC = getColumn(array,searchString);    
    var Row = RC.Row;
    var Column = RC.Column;
    var RC = getColumn(old_array,searchString);    
    var Row_old = RC.Row;
    var Column_old = RC.Column;    
    Logger.log(Row_old+':::'+Column_old+'\n'+Row+':::'+Column);      
    //compare 
    var diff_index =JSON.parse(cache.get('diff_index'));
    var row_ind =JSON.parse(cache.get('row_ind'));  
    var Roww =JSON.parse(cache.get('Row'));  
    if (diff_index==null){var diff_index = [];}
    if (row_ind==null){var row_ind = 0;}
    if (Roww==null){var Roww = Row;}    
    Logger.log(row_ind+'\n'+Roww);    
    for (var i=Roww;i<array.length;i++){  
      var currTime = (new Date()).getTime();
      if(currTime - startTime >= MAX_RUNNING_TIME){
        Logger.log((currTime - startTime)/(1000*60));
        Logger.log(i+'::'+row_ind);
        cache.putAll({'diff_index': JSON.stringify(diff_index),'row_ind': JSON.stringify(row_ind),'Row': JSON.stringify(i-1)},21600);   
        ScriptApp.newTrigger('getNewData').timeBased().after(2 * 60 * 1000).create();
        return;
      } else {
        Logger.log(i);
        var existe = ArrayLib.indexOf(old_array, Column_old, array[i][Column]);
        if (existe==-1){      
          Logger.log(row_ind+'!!!');
          diff_index[row_ind]=i;
          row_ind++;          
        }
      }
    }    
    cache.putAll({'diff_index': JSON.stringify(diff_index),'Row': JSON.stringify(Row),'compared': JSON.stringify(1)},21600);
  } else {
    Logger.log('file not downloaded yet or already compared');
  }
}

function getColumn(array,searchString){
  for (var i = 0; i < array.length; i++) {    
    for (var j = 0; j < array[i].length; j++) {    
      if (array[i][j] == searchString) {
        var Row = i+1;
        var Column = j;
        break;
      }      
    }
    if (Row != undefined){
      break;
    }
  }
  return {Row: Row, Column: Column};
}

答案 3 :(得分:0)

这是一个可以绕过时间限制的替代解决方案。创建新的专用电子表格以及自定义侧边栏。侧边栏将要求您创建一些HTML,最终将在客户端的iframe中嵌入和呈现。您可以通过脚本标记将纯JavaScript嵌入到HTML中。

这种方法的优点在于这些脚本不会在服务器端运行,而是在客户端上独立于Google Apps脚本的服务器端环境运行,并且不受6分钟限制。此外,他们还可以在您的Google脚本中调用功能。因此,一种方法是让客户端脚本调用Google Script函数来检索必需的数据,在客户端脚本中执行所有繁重的处理,然后将结果发送回服务器端脚本以更新片材。

以下是设置自定义侧边栏的链接,以帮助您入门: https://developers.google.com/apps-script/guides/dialogs#custom_sidebars