Drupal 6: Using Tika Java library to index WebFM file attachments as Apache Solr documents

In this quick snippet, I’ll show some code that uses the Tika Java library to extract the content of WebFM file attachments and add it to the index as Apache Solr documents. This code is designed to work with the Apache Solr Search Integration Drupal module and to piggyback on the Apache Solr Attachments module. Out of the box, the Apache Solr Attachments module can index CCK file fields and node file attachments, but the WebFM module uses its own custom file tables, so its files are not indexed. This code assumes you have already integrated the Tika library with your Solr installation. Please review the Tika Getting Started documentation for more information.

<?php

/**
 * Implements hook_update_index().
 *
 * Fetches the next batch of nodes queued under the 'index_webfm' namespace and
 * passes them to apachesolr_index_nodes(), naming
 * _index_webfm_add_documents() as the function that builds the documents.
 */
function index_webfm_update_index() {
  // Batch size per run. Defaults to 10; can be overridden by setting the
  // 'index_webfm_cron_limit' variable. Non-positive values fall back to 10.
  $cron_limit = (int) variable_get('index_webfm_cron_limit', 10);
  if ($cron_limit < 1) {
    $cron_limit = 10;
  }

  $rows = apachesolr_get_nodes_to_index('index_webfm', $cron_limit);
  apachesolr_index_nodes($rows, 'index_webfm', '_index_webfm_add_documents');
}

/**
 * Adds the text content of a node's WebFM file attachments as Solr documents.
 *
 * Passed by name to apachesolr_index_nodes() from index_webfm_update_index().
 * One Apache_Solr_Document is appended to $documents for every attached file
 * from which a non-empty string of text could be extracted; files that yield
 * no text are skipped.
 *
 * @param $documents
 *   Array of Solr documents, passed by reference. New file documents are
 *   appended to it.
 * @param $nid
 *   ID of the node whose WebFM attachments should be indexed.
 * @param $namespace
 *   Indexing namespace. Anything other than 'index_webfm' is ignored.
 */
function _index_webfm_add_documents(&$documents, $nid, $namespace = 'index_webfm') {
  if ($namespace != 'index_webfm') {
    return;
  }

  // Load the node that owns the attachments.
  $node = node_load($nid);
  if (!is_object($node)) {
    return;
  }

  // Fetch the node's WebFM attachments in weight order, aliasing the WebFM
  // columns to filepath / filemime / timestamp.
  $sql = "
    select a.fid, f.fpath as `filepath`, f.fmime as `filemime`, f.fcreatedate as `timestamp`
    from {webfm_attach} a
    join {webfm_file} f on f.fid = a.fid
    where a.nid = %d
    order by a.weight
  ";
  $resource = db_query($sql, $node->nid);
  $webfm_files = array();
  while ($row = db_fetch_object($resource)) {
    $webfm_files[] = $row;
  }

  // Nothing to do when the node has no WebFM files.
  if (!$webfm_files) {
    return;
  }

  // The extraction method does not change between files, so resolve it once:
  // either Tika directly, or Tika via Solr.
  $use_tika = (variable_get('apachesolr_attachment_extract_using', 'tika') == 'tika');
  $extractor = $use_tika
    ? 'apachesolr_attachments_extract_using_tika'
    : 'apachesolr_attachments_extract_using_solr';
  if (!function_exists($extractor)) {
    // Without an extractor no file can produce text, so no documents are added.
    return;
  }

  foreach ($webfm_files as $file) {
    // Reset per file so one file's text can never leak into the next.
    $text = NULL;

    if ($use_tika) {
      $text = apachesolr_attachments_extract_using_tika($file->filepath);
    }
    else {
      // Only the first element (the extracted text) of the returned pair is
      // used; guard against a non-array return before reading it.
      $extracted = apachesolr_attachments_extract_using_solr($file->filepath);
      if (is_array($extracted) && isset($extracted[0])) {
        $text = $extracted[0];
      }
    }

    // Skip files that produced no usable text.
    if (!$text || !is_string($text)) {
      continue;
    }

    $filename = basename($file->filepath);
    $created = apachesolr_date_iso($file->timestamp);

    // Build the Solr document for this file. The id combines the file ID and
    // the node ID.
    $document = new Apache_Solr_Document();
    $document->id = apachesolr_document_id($file->fid .'-'. $node->nid, 'file');
    $document->url = file_create_url($file->filepath);
    $document->path = $file->filepath;
    $document->hash = apachesolr_site_hash();
    $document->entity = 'file';
    $document->site = url(NULL, array('absolute' => TRUE));
    $document->nid = $node->nid;
    $document->title = $filename;
    $document->created = $created;
    // WebFM data selected here has no separate modification time, so the
    // creation time is reused.
    $document->changed = $created;
    // Publishing flags and authorship are copied from the parent node.
    $document->status = $node->status;
    $document->sticky = $node->sticky;
    $document->promote = $node->promote;
    $document->uid = $node->uid;
    $document->name = $node->name;
    // The file name is prepended to the extracted text in the body.
    $document->body = apachesolr_clean_text($filename . ' ' . $text);
    $document->ss_filemime = $file->filemime;
    $document->ss_file_node_title = apachesolr_clean_text($node->title);
    $document->ss_file_node_url = url('node/' . $node->nid, array('absolute' => TRUE));

    $documents[] = $document;
  }
}
?>

After implementing this code, when nodes were set to be indexed by Solr, their webfm file attachments were separately processed, the content was extracted from the file attachments, and added as new Solr documents.

Updated: