package hw.bst;

import java.io.File;
import java.io.FileNotFoundException;
import java.util.ArrayList;
import java.util.Locale;
import java.util.Scanner;
import java.util.regex.Pattern;

/**
 * Simple text indexer that builds a BST dictionary mapping words to their positions.
 *
 * @author Claude Sonnet 4.5
 * @version 11/2025
 */
public class TextIndexer {

  /** Matches any character that is not an ASCII letter; compiled once and reused for every token. */
  private static final Pattern NON_ALPHABETIC = Pattern.compile("[^a-zA-Z]");

  /**
   * Read a text file and build an index of word positions.
   *
   * <p>The file is split into whitespace-delimited tokens. Each token has every
   * character other than {@code a-z}/{@code A-Z} removed (so {@code "don't"} becomes
   * {@code "dont"}) and is then lower-cased. Tokens that are empty after this
   * clean-up are skipped and do not consume a position.
   *
   * <p>Positions are zero-based and count only the words that were actually indexed:
   * the first indexed word has position 0, the next one position 1, and so on.
   *
   * @param filename The path to the text file
   * @return a dictionary mapping each normalized word to the list of positions at which
   *         it occurs, in increasing order; empty if the file contains no alphabetic words
   * @throws FileNotFoundException if the file doesn't exist
   */
  public static BSTDictionary<String, ArrayList<Integer>> indexFile(String filename) throws FileNotFoundException {
    BSTDictionary<String, ArrayList<Integer>> index = new BSTDictionary<>();
    int position = 0;

    // try-with-resources guarantees the file handle is released even if the
    // dictionary operations below throw part-way through the file.
    try (Scanner scanner = new Scanner(new File(filename))) {
      while (scanner.hasNext()) {
        // Simple tokenization: remove non-alphabetic chars and convert to lowercase.
        // Locale.ROOT keeps the mapping locale-independent (e.g. "I" -> "i" even when
        // the default locale is Turkish, where it would otherwise become dotless "ı").
        String word = NON_ALPHABETIC.matcher(scanner.next())
            .replaceAll("")
            .toLowerCase(Locale.ROOT);

        if (word.isEmpty()) {
          // Token was pure punctuation/digits: not a word, so it gets no position.
          continue;
        }

        // A null result is treated as "word not seen yet"
        // (presumably BSTDictionary.find returns null for a missing key — confirm).
        ArrayList<Integer> positions = index.find(word);
        if (positions == null) {
          positions = new ArrayList<>();
          index.insert(word, positions);
        }
        positions.add(position);
        position++;
      }
    }
    return index;
  }

}
