添加Pdf读取mcp
This commit is contained in:
18
pdf-reader-mcp/.dockerignore
Normal file
18
pdf-reader-mcp/.dockerignore
Normal file
@@ -0,0 +1,18 @@
|
||||
# Git files
|
||||
.git
|
||||
.gitignore
|
||||
|
||||
# Node modules
|
||||
node_modules
|
||||
|
||||
# Build artifacts (we only need the build output in the final stage)
|
||||
build
|
||||
|
||||
# Docker files
|
||||
Dockerfile
|
||||
.dockerignore
|
||||
|
||||
# Documentation / Other
|
||||
README.md
|
||||
memory-bank
|
||||
.vscode
|
||||
1
pdf-reader-mcp/.eslintcache
Normal file
1
pdf-reader-mcp/.eslintcache
Normal file
File diff suppressed because one or more lines are too long
19
pdf-reader-mcp/.gitattributes
vendored
Normal file
19
pdf-reader-mcp/.gitattributes
vendored
Normal file
@@ -0,0 +1,19 @@
|
||||
# Auto detect text files and perform LF normalization
|
||||
* text=auto eol=lf
|
||||
|
||||
# Explicitly declare text files we want to always normalize to LF
|
||||
*.cjs text eol=lf
|
||||
*.js text eol=lf
|
||||
*.ts text eol=lf
|
||||
*.json text eol=lf
|
||||
*.md text eol=lf
|
||||
*.yaml text eol=lf
|
||||
*.yml text eol=lf
|
||||
|
||||
# Lockfiles should always use LF
|
||||
package-lock.json text eol=lf
|
||||
pnpm-lock.yaml text eol=lf
|
||||
|
||||
# Ensure specific files are treated as binary (if needed)
|
||||
# *.png binary
|
||||
# *.jpg binary
|
||||
14
pdf-reader-mcp/.gitignore
vendored
Normal file
14
pdf-reader-mcp/.gitignore
vendored
Normal file
@@ -0,0 +1,14 @@
|
||||
node_modules/
|
||||
build/
|
||||
build/
|
||||
*.log
|
||||
coverage/
|
||||
.env*
|
||||
|
||||
# VitePress
|
||||
docs/.vitepress/dist
|
||||
docs/.vitepress/cache
|
||||
|
||||
# Test Reports
|
||||
test-report.junit.xml
|
||||
/dist
|
||||
15
pdf-reader-mcp/.prettierrc.cjs
Normal file
15
pdf-reader-mcp/.prettierrc.cjs
Normal file
@@ -0,0 +1,15 @@
|
||||
// .prettierrc.cjs
|
||||
module.exports = {
|
||||
printWidth: 100, // Specify the line length that the printer will wrap on.
|
||||
tabWidth: 2, // Specify the number of spaces per indentation-level.
|
||||
useTabs: false, // Indent lines with tabs instead of spaces.
|
||||
semi: true, // Print semicolons at the ends of statements.
|
||||
singleQuote: true, // Use single quotes instead of double quotes.
|
||||
quoteProps: 'as-needed', // Change when properties in objects are quoted.
|
||||
jsxSingleQuote: false, // Use single quotes instead of double quotes in JSX.
|
||||
trailingComma: 'es5', // Print trailing commas wherever possible in multi-line comma-separated syntactic structures. (A single-line array, for example, never gets trailing commas.)
|
||||
bracketSpacing: true, // Print spaces between brackets in object literals.
|
||||
bracketSameLine: false, // Put the > of a multi-line HTML (HTML, JSX, Vue, Angular) element at the end of the last line instead of being alone on the next line (does not apply to self closing elements).
|
||||
arrowParens: 'always', // Include parentheses around a sole arrow function parameter.
|
||||
endOfLine: 'lf', // Ensure consistent line endings
|
||||
};
|
||||
3
pdf-reader-mcp/.roo/mcp.json
Normal file
3
pdf-reader-mcp/.roo/mcp.json
Normal file
@@ -0,0 +1,3 @@
|
||||
{
|
||||
"mcpServers": {}
|
||||
}
|
||||
167
pdf-reader-mcp/CHANGELOG.md
Normal file
167
pdf-reader-mcp/CHANGELOG.md
Normal file
@@ -0,0 +1,167 @@
|
||||
# Changelog
|
||||
|
||||
All notable changes to this project will be documented in this file. See [standard-version](https://github.com/conventional-changelog/standard-version) for commit guidelines.
|
||||
|
||||
### [0.3.24](https://github.com/sylphlab/pdf-reader-mcp/compare/v0.3.23...v0.3.24) (2025-04-07)
|
||||
|
||||
### Bug Fixes
|
||||
|
||||
- enable rootDir and adjust include for correct build structure ([a9985a7](https://github.com/sylphlab/pdf-reader-mcp/commit/a9985a7eed16ed0a189dd1bda7a66feb13aee889))
|
||||
|
||||
### [0.3.23](https://github.com/sylphlab/pdf-reader-mcp/compare/v0.3.22...v0.3.23) (2025-04-07)
|
||||
|
||||
### Bug Fixes
|
||||
|
||||
- correct executable paths due to missing rootDir ([ed5c150](https://github.com/sylphlab/pdf-reader-mcp/commit/ed5c15012b849211422fbb22fb15d8a2c9415b0b))
|
||||
|
||||
### [0.3.22](https://github.com/sylphlab/pdf-reader-mcp/compare/v0.3.21...v0.3.22) (2025-04-07)
|
||||
|
||||
### [0.3.21](https://github.com/sylphlab/pdf-reader-mcp/compare/v0.3.20...v0.3.21) (2025-04-07)
|
||||
|
||||
### [0.3.20](https://github.com/sylphlab/pdf-reader-mcp/compare/v0.3.19...v0.3.20) (2025-04-07)
|
||||
|
||||
### [0.3.19](https://github.com/sylphlab/pdf-reader-mcp/compare/v0.3.18...v0.3.19) (2025-04-07)
|
||||
|
||||
### [0.3.18](https://github.com/sylphlab/pdf-reader-mcp/compare/v0.3.17...v0.3.18) (2025-04-07)
|
||||
|
||||
### Bug Fixes
|
||||
|
||||
- **publish:** remove dist from gitignore and fix clean script ([305e259](https://github.com/sylphlab/pdf-reader-mcp/commit/305e259d6492fbc1732607ee8f8344f6e07aa073))
|
||||
|
||||
### [0.3.17](https://github.com/sylphlab/pdf-reader-mcp/compare/v0.3.16...v0.3.17) (2025-04-07)
|
||||
|
||||
### Bug Fixes
|
||||
|
||||
- **config:** align package.json paths with build output (dist/) ([ab1100d](https://github.com/sylphlab/pdf-reader-mcp/commit/ab1100d771e277705ef99cb745f89687c74a7e13))
|
||||
|
||||
### [0.3.16](https://github.com/sylphlab/pdf-reader-mcp/compare/v0.3.15...v0.3.16) (2025-04-07)
|
||||
|
||||
### [0.3.15](https://github.com/sylphlab/pdf-reader-mcp/compare/v0.3.14...v0.3.15) (2025-04-07)
|
||||
|
||||
### Bug Fixes
|
||||
|
||||
- Run lint-staged in pre-commit hook ([e96680c](https://github.com/sylphlab/pdf-reader-mcp/commit/e96680c771eb99ba303fdf7ad51da880261e11c1))
|
||||
|
||||
### [0.3.14](https://github.com/sylphlab/pdf-reader-mcp/compare/v0.3.13...v0.3.14) (2025-04-07)
|
||||
|
||||
### [0.3.13](https://github.com/sylphlab/pdf-reader-mcp/compare/v0.3.12...v0.3.13) (2025-04-07)
|
||||
|
||||
### Bug Fixes
|
||||
|
||||
- **docker:** Install pnpm globally in builder stage ([651d7ae](https://github.com/sylphlab/pdf-reader-mcp/commit/651d7ae06660b97af91c348bc8cc786613232c06))
|
||||
|
||||
### [0.3.11](https://github.com/sylphlab/pdf-reader-mcp/compare/v0.3.10...v0.3.11) (2025-04-07)
|
||||
|
||||
### [0.3.10](https://github.com/sylphlab/pdf-reader-mcp/compare/v1.0.0...v0.3.10) (2025-04-07)
|
||||
|
||||
### Bug Fixes
|
||||
|
||||
- address remaining eslint warnings ([a91d313](https://github.com/sylphlab/pdf-reader-mcp/commit/a91d313bec2b843724e62ea6a556d99d5389d6cc))
|
||||
- resolve eslint errors in tests and scripts ([ffc1bdd](https://github.com/sylphlab/pdf-reader-mcp/commit/ffc1bdd18b972f58e90e12ed2394d2968c5639d9))
|
||||
|
||||
## [1.0.0] - 2025-04-07
|
||||
|
||||
### Added
|
||||
|
||||
- **Project Alignment:** Aligned project structure, configuration (TypeScript, ESLint, Prettier, Vitest), CI/CD (`.github/workflows/ci.yml`), Git Hooks (Husky, lint-staged, commitlint), and dependency management (Dependabot) with Sylph Lab Playbook guidelines.
|
||||
- **Testing:** Achieved ~95% test coverage using Vitest.
|
||||
- **Benchmarking:** Implemented initial performance benchmarks using Vitest `bench`.
|
||||
- **Documentation:**
|
||||
- Set up documentation website using VitePress.
|
||||
- Created initial content for Guide, Design, Performance, Comparison sections.
|
||||
- Updated `README.md` to follow standard structure.
|
||||
- Added `CONTRIBUTING.md`.
|
||||
- Updated Performance page with initial benchmark results.
|
||||
- Added community links and call-to-action in VitePress config footer.
|
||||
- **Package Manager:** Switched from npm to pnpm.
|
||||
|
||||
### Changed
|
||||
|
||||
- **Dependencies:** Updated various dependencies to align with guidelines and ensure compatibility.
|
||||
- **Configuration:** Refined `tsconfig.json`, `eslint.config.js`, `vitest.config.ts`, `package.json` based on guidelines.
|
||||
- **Project Identity:** Updated scope to `@sylphlab`.
|
||||
|
||||
### Fixed
|
||||
|
||||
- Resolved various configuration issues identified during guideline alignment.
|
||||
- Corrected Markdown parsing errors in initial documentation.
|
||||
- Addressed peer dependency warnings where possible.
|
||||
- **Note:** TypeDoc API generation is currently blocked due to unresolved initialization errors with TypeDoc v0.28.1.
|
||||
|
||||
### Removed
|
||||
|
||||
- Sponsorship related files and badges (`.github/FUNDING.yml`).
|
||||
|
||||
## [0.3.9] - 2025-04-05
|
||||
|
||||
### Fixed
|
||||
|
||||
- Removed artifact download/extract steps from `publish-docker` job in workflow, as Docker build needs the full source context provided by checkout.
|
||||
|
||||
## [0.3.8] - 2025-04-05
|
||||
|
||||
### Fixed
|
||||
|
||||
- Removed duplicate `context: .` entry in `docker/build-push-action` step in `.github/workflows/publish.yml`.
|
||||
|
||||
## [0.3.7] - 2025-04-05
|
||||
|
||||
### Fixed
|
||||
|
||||
- Removed explicit `COPY tsconfig.json ./` from Dockerfile (rely on `COPY . .`).
|
||||
- Explicitly set `context: .` in docker build-push action.
|
||||
|
||||
## [0.3.6] - 2025-04-05
|
||||
|
||||
### Fixed
|
||||
|
||||
- Explicitly added `COPY tsconfig.json ./` before `COPY . .` in Dockerfile to ensure it exists before build step.
|
||||
|
||||
## [0.3.5] - 2025-04-05
|
||||
|
||||
### Fixed
|
||||
|
||||
- Added `RUN ls -la` before build step in Dockerfile to debug `tsconfig.json` not found error.
|
||||
|
||||
## [0.3.4] - 2025-04-05
|
||||
|
||||
### Fixed
|
||||
|
||||
- Explicitly specify `tsconfig.json` path in Dockerfile build step (`RUN ./node_modules/.bin/tsc -p tsconfig.json`) to debug build failure.
|
||||
|
||||
## [0.3.3] - 2025-04-05
|
||||
|
||||
### Fixed
|
||||
|
||||
- Changed Dockerfile build step from `RUN npm run build` to `RUN ./node_modules/.bin/tsc` to debug build failure.
|
||||
|
||||
## [0.3.2] - 2025-04-05
|
||||
|
||||
### Fixed
|
||||
|
||||
- Simplified `build` script in `package.json` to only run `tsc` (removed `chmod`) to debug Docker build failure.
|
||||
|
||||
## [0.3.1] - 2025-04-05
|
||||
|
||||
### Fixed
|
||||
|
||||
- Attempted various fixes for GitHub Actions workflow artifact upload issue (`Error: Provided artifact name input during validation is empty`). Final attempt uses fixed artifact filename in upload/download steps.
|
||||
|
||||
## [0.3.0] - 2025-04-05
|
||||
|
||||
### Added
|
||||
|
||||
- `CHANGELOG.md` file based on Keep a Changelog format.
|
||||
- `LICENSE` file (MIT License).
|
||||
- Improved GitHub Actions workflow (`.github/workflows/publish.yml`):
|
||||
- Triggers on push to `main` branch and version tags (`v*.*.*`).
|
||||
- Conditionally archives build artifacts only on tag pushes.
|
||||
- Conditionally runs `publish-npm` and `publish-docker` jobs only on tag pushes.
|
||||
- Added `create-release` job to automatically create GitHub Releases from tags, using `CHANGELOG.md` for the body.
|
||||
- Added version headers to Memory Bank files (`activeContext.md`, `progress.md`).
|
||||
|
||||
### Changed
|
||||
|
||||
- Bumped version from 0.2.2 to 0.3.0.
|
||||
|
||||
<!-- Note: Removed [0.4.0-dev] entry as changes are now part of 1.0.0 -->
|
||||
53
pdf-reader-mcp/CONTRIBUTING.md
Normal file
53
pdf-reader-mcp/CONTRIBUTING.md
Normal file
@@ -0,0 +1,53 @@
|
||||
# Contributing to PDF Reader MCP Server
|
||||
|
||||
Thank you for considering contributing! We welcome contributions from the community.
|
||||
|
||||
## How to Contribute
|
||||
|
||||
1. **Reporting Issues:** If you find a bug or have a feature request, please open an issue on GitHub.
|
||||
|
||||
- Provide a clear description of the issue.
|
||||
- Include steps to reproduce (for bugs).
|
||||
- Explain the motivation for the feature request.
|
||||
|
||||
2. **Submitting Pull Requests:**
|
||||
- Fork the repository.
|
||||
- Create a new branch for your feature or bugfix (e.g., `feature/new-pdf-feature` or `bugfix/parsing-error`).
|
||||
- Make your changes, adhering to the project's coding style and guidelines (ESLint, Prettier).
|
||||
- Add tests for your changes and ensure all tests pass (`npm test`).
|
||||
- Ensure your commit messages follow the Conventional Commits standard.
|
||||
- Push your branch to your fork.
|
||||
- Open a Pull Request against the `main` branch of the `sylphlab/pdf-reader-mcp` repository.
|
||||
- Provide a clear description of your changes in the PR.
|
||||
|
||||
## Development Setup
|
||||
|
||||
1. Clone the repository: `git clone https://github.com/sylphlab/pdf-reader-mcp.git`
|
||||
2. Navigate into the directory: `cd pdf-reader-mcp`
|
||||
3. Install dependencies: `pnpm install` (the repository is locked with `pnpm-lock.yaml`)
|
||||
4. Build the project: `npm run build`
|
||||
5. Run tests: `npm test`
|
||||
6. Use `npm run watch` during development for automatic recompilation.
|
||||
7. Use `npm run validate` before committing to check formatting, linting, and tests.
|
||||
|
||||
## Code Style
|
||||
|
||||
- We use Prettier for code formatting and ESLint (with strict TypeScript rules) for linting.
|
||||
- Please run `npm run format` and `npm run lint:fix` before committing your changes.
|
||||
- Git hooks are set up using Husky and lint-staged to automatically check staged files.
|
||||
|
||||
## Commit Messages
|
||||
|
||||
We follow the [Conventional Commits](https://www.conventionalcommits.org/) specification. Commit messages are linted using `commitlint` via a Git hook.
|
||||
|
||||
Example:
|
||||
|
||||
```
|
||||
feat: add support for encrypted PDFs
|
||||
|
||||
Implemented handling for password-protected PDF files using an optional password parameter.
|
||||
```
|
||||
|
||||
## License
|
||||
|
||||
By contributing, you agree that your contributions will be licensed under the MIT License that covers the project.
|
||||
45
pdf-reader-mcp/Dockerfile
Normal file
45
pdf-reader-mcp/Dockerfile
Normal file
@@ -0,0 +1,45 @@
|
||||
# Stage 1: Build the application
|
||||
FROM node:lts-alpine AS builder
|
||||
WORKDIR /app
|
||||
|
||||
# Copy package files
|
||||
# Using pnpm-lock.yaml ensures reproducible installs
|
||||
COPY package.json pnpm-lock.yaml ./
|
||||
|
||||
# Install pnpm, then ALL dependencies (including dev dependencies needed for the build)
|
||||
RUN npm install -g pnpm
|
||||
|
||||
RUN pnpm install --frozen-lockfile
|
||||
|
||||
# Copy the rest of the application source code
|
||||
# This includes tsconfig.json and the src directory
|
||||
COPY . .
|
||||
|
||||
# Build the TypeScript project
|
||||
RUN ls -la
|
||||
RUN ./node_modules/.bin/tsc -p tsconfig.json
|
||||
# NOTE: tsc is invoked directly above, so no chmod +x is applied; the final image runs the output via `node`
|
||||
|
||||
# Remove development dependencies after build
|
||||
RUN pnpm prune --prod --ignore-scripts
|
||||
|
||||
# Stage 2: Create the final lightweight image
|
||||
FROM node:lts-alpine
|
||||
WORKDIR /app
|
||||
|
||||
# Create a non-root user and group for security
|
||||
# Running as non-root is a good practice
|
||||
RUN addgroup -S appgroup && adduser -S appuser -G appgroup
|
||||
|
||||
# Copy built artifacts and production dependencies from the builder stage
|
||||
COPY --from=builder --chown=appuser:appgroup /app/node_modules ./node_modules
|
||||
COPY --from=builder --chown=appuser:appgroup /app/dist ./dist
|
||||
# Copy package.json for metadata, might be useful for inspection
|
||||
COPY --from=builder --chown=appuser:appgroup /app/package.json ./
|
||||
|
||||
# Switch to the non-root user
|
||||
USER appuser
|
||||
|
||||
# Command to run the server using the built output
|
||||
# This will start the MCP server listening on stdio
|
||||
CMD ["node", "dist/index.js"]
|
||||
21
pdf-reader-mcp/LICENSE
Normal file
21
pdf-reader-mcp/LICENSE
Normal file
@@ -0,0 +1,21 @@
|
||||
MIT License
|
||||
|
||||
Copyright (c) 2025 SylphLab
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in all
|
||||
copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
SOFTWARE.
|
||||
56
pdf-reader-mcp/PLAN.md
Normal file
56
pdf-reader-mcp/PLAN.md
Normal file
@@ -0,0 +1,56 @@
|
||||
# Plan: PDF Reader MCP Tool Development
|
||||
|
||||
1. **Project Setup:**
|
||||
|
||||
- Clone `https://github.com/shtse8/filesystem-mcp` to
|
||||
`c:/Users/shtse/pdf-reader`. (Already done implicitly by user starting in
|
||||
this empty dir, but good to note).
|
||||
- Initialize Git and push to `https://github.com/shtse8/pdf-reader-mcp.git`.
|
||||
(User has done this).
|
||||
- Create Memory Bank directory and core files:
|
||||
- `memory-bank/projectbrief.md`
|
||||
- `memory-bank/productContext.md`
|
||||
- `memory-bank/activeContext.md`
|
||||
- `memory-bank/systemPatterns.md`
|
||||
- `memory-bank/techContext.md`
|
||||
- `memory-bank/progress.md`
|
||||
|
||||
2. **Technology Selection & Dependency:**
|
||||
|
||||
- Research and choose a suitable Node.js PDF processing library (e.g.,
|
||||
`pdf-parse` or `pdfjs-dist`).
|
||||
- Add the chosen library to `package.json` dependencies.
|
||||
|
||||
3. **Feature Implementation:**
|
||||
|
||||
- Define MCP tool schemas and implement logic:
|
||||
- `read_pdf_all_text`: Extract all text. Input: `{ "path": "string" }`
|
||||
- `read_pdf_page_text`: Extract text from specific pages. Input:
|
||||
`{ "path": "string", "pages": "number[] | string" }`
|
||||
- `get_pdf_metadata`: Read metadata. Input: `{ "path": "string" }`
|
||||
- `get_pdf_page_count`: Get total page count. Input: `{ "path": "string" }`
|
||||
- Implement core functionality using the chosen PDF library.
|
||||
- Integrate new tools into the existing MCP server framework.
|
||||
|
||||
```mermaid
|
||||
graph TD
|
||||
subgraph "PDF Tool Implementation"
|
||||
A[Define read_pdf_all_text] --> B{Use PDF Library};
|
||||
C[Define read_pdf_page_text] --> B;
|
||||
D[Define get_pdf_metadata] --> B;
|
||||
E[Define get_pdf_page_count] --> B;
|
||||
B --> F[Implement Logic];
|
||||
F --> G[Integrate into MCP Server];
|
||||
end
|
||||
```
|
||||
|
||||
4. **Documentation & Refinement:**
|
||||
|
||||
- Update `README.md` with new PDF tool descriptions and usage examples.
|
||||
- Update Memory Bank files (`techContext.md`, `systemPatterns.md`,
|
||||
`progress.md`).
|
||||
|
||||
5. **Handover:**
|
||||
- Confirm plan with the user. (Done).
|
||||
- Save plan to `PLAN.md`. (This step).
|
||||
- Switch to "Code" mode for implementation.
|
||||
218
pdf-reader-mcp/README.md
Normal file
218
pdf-reader-mcp/README.md
Normal file
@@ -0,0 +1,218 @@
|
||||
[MseeP.ai Security Assessment](https://mseep.ai/app/sylphxltd-pdf-reader-mcp)
|
||||
|
||||
# PDF Reader MCP Server (@sylphlab/pdf-reader-mcp)
|
||||
|
||||
<!-- Status Badges Area -->
|
||||
|
||||
[CI/CD Pipeline](https://github.com/sylphlab/pdf-reader-mcp/actions/workflows/ci.yml)
|
||||
[Code Coverage (Codecov)](https://codecov.io/gh/sylphlab/pdf-reader-mcp)
|
||||
[npm version](https://badge.fury.io/js/%40sylphlab%2Fpdf-reader-mcp)
|
||||
[Docker Hub](https://hub.docker.com/r/sylphlab/pdf-reader-mcp)
|
||||
[License: MIT](https://opensource.org/licenses/MIT)
|
||||
|
||||
<!-- End Status Badges Area -->
|
||||
|
||||
Empower your AI agents (like Cline) with the ability to securely read and extract information (text, metadata, page count) from PDF files within your project context using a single, flexible tool.
|
||||
|
||||
<a href="https://glama.ai/mcp/servers/@sylphlab/pdf-reader-mcp">
|
||||
<img width="380" height="200" src="https://glama.ai/mcp/servers/@sylphlab/pdf-reader-mcp/badge" alt="PDF Reader Server MCP server" />
|
||||
</a>
|
||||
|
||||
## Installation
|
||||
|
||||
### Using npm (Recommended)
|
||||
|
||||
Install as a dependency in your MCP host environment or project:
|
||||
|
||||
```bash
|
||||
pnpm add @sylphlab/pdf-reader-mcp # Or npm install / yarn add
|
||||
```
|
||||
|
||||
Configure your MCP host (e.g., `mcp_settings.json`) to use `npx`:
|
||||
|
||||
```json
|
||||
{
|
||||
"mcpServers": {
|
||||
"pdf-reader-mcp": {
|
||||
"command": "npx",
|
||||
"args": ["@sylphlab/pdf-reader-mcp"],
|
||||
"name": "PDF Reader (npx)"
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
_(Ensure the host sets the correct `cwd` for the target project)_
|
||||
|
||||
### Using Docker
|
||||
|
||||
Pull the image:
|
||||
|
||||
```bash
|
||||
docker pull sylphlab/pdf-reader-mcp:latest
|
||||
```
|
||||
|
||||
Configure your MCP host to run the container, mounting your project directory to `/app`:
|
||||
|
||||
```jsonc
|
||||
{
|
||||
"mcpServers": {
|
||||
"pdf-reader-mcp": {
|
||||
"command": "docker",
|
||||
"args": [
|
||||
"run",
|
||||
"-i",
|
||||
"--rm",
|
||||
"-v",
|
||||
"/path/to/your/project:/app", // Or use "$PWD:/app", "%CD%:/app", etc.
|
||||
"sylphlab/pdf-reader-mcp:latest"
|
||||
],
|
||||
"name": "PDF Reader (Docker)"
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
### Local Build (For Development)
|
||||
|
||||
1. Clone: `git clone https://github.com/sylphlab/pdf-reader-mcp.git`
|
||||
2. Install: `cd pdf-reader-mcp && pnpm install`
|
||||
3. Build: `pnpm run build`
|
||||
4. Configure MCP Host:
|
||||
```json
|
||||
{
|
||||
"mcpServers": {
|
||||
"pdf-reader-mcp": {
|
||||
"command": "node",
|
||||
"args": ["/path/to/cloned/repo/pdf-reader-mcp/dist/index.js"],
|
||||
"name": "PDF Reader (Local Build)"
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
_(Ensure the host sets the correct `cwd` for the target project)_
|
||||
|
||||
## Quick Start
|
||||
|
||||
Assuming the server is running and configured in your MCP host:
|
||||
|
||||
**MCP Request (Get metadata and page 2 text from a local PDF):**
|
||||
|
||||
```jsonc
|
||||
{
|
||||
"tool_name": "read_pdf",
|
||||
"arguments": {
|
||||
"sources": [
|
||||
{
|
||||
"path": "./documents/my_report.pdf",
|
||||
"pages": [2]
|
||||
}
|
||||
],
|
||||
"include_metadata": true,
|
||||
"include_page_count": false, // Default is true, explicitly false here
|
||||
"include_full_text": false // Ignored because 'pages' is specified
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
**Expected Response Snippet:**
|
||||
|
||||
```json
|
||||
{
|
||||
"results": [
|
||||
{
|
||||
"source": "./documents/my_report.pdf",
|
||||
"success": true,
|
||||
"data": {
|
||||
"page_texts": [
|
||||
{ "page": 2, "text": "Text content from page 2..." }
|
||||
],
|
||||
"info": { ... },
|
||||
"metadata": { ... }
|
||||
// num_pages not included as requested
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
```
|
||||
|
||||
## Why Choose This Project?
|
||||
|
||||
- **🛡️ Secure:** Confines file access strictly to the project root directory.
|
||||
- **🌐 Flexible:** Handles both local relative paths and public URLs.
|
||||
- **🧩 Consolidated:** A single `read_pdf` tool serves multiple extraction needs (full text, specific pages, metadata, page count).
|
||||
- **⚙️ Structured Output:** Returns data in a predictable JSON format, easy for agents to parse.
|
||||
- **🚀 Easy Integration:** Designed for seamless use within MCP environments via `npx` or Docker.
|
||||
- **✅ Robust:** Uses `pdfjs-dist` for reliable parsing and Zod for input validation.
|
||||
|
||||
## Performance Advantages
|
||||
|
||||
Initial benchmarks using Vitest on a sample PDF show efficient handling of various operations:
|
||||
|
||||
| Scenario | Operations per Second (hz) | Relative Speed |
|
||||
| :------------------------------- | :------------------------- | :------------- |
|
||||
| Handle Non-Existent File | ~12,933 | Fastest |
|
||||
| Get Full Text | ~5,575 | |
|
||||
| Get Specific Page (Page 1) | ~5,329 | |
|
||||
| Get Specific Pages (Pages 1 & 2) | ~5,242 | |
|
||||
| Get Metadata & Page Count | ~4,912 | Slowest |
|
||||
|
||||
_(Higher hz indicates better performance. Results may vary based on PDF complexity and environment.)_
|
||||
|
||||
See the [Performance Documentation](./docs/performance/index.md) for more details and future plans.
|
||||
|
||||
## Features
|
||||
|
||||
- Read full text content from PDF files.
|
||||
- Read text content from specific pages or page ranges.
|
||||
- Read PDF metadata (author, title, creation date, etc.).
|
||||
- Get the total page count of a PDF.
|
||||
- Process multiple PDF sources (local paths or URLs) in a single request.
|
||||
- Securely operates within the defined project root.
|
||||
- Provides structured JSON output via MCP.
|
||||
- Available via npm and Docker Hub.
|
||||
|
||||
## Design Philosophy
|
||||
|
||||
The server prioritizes security through context confinement, efficiency via structured data transfer, and simplicity for easy integration into AI agent workflows. It aims for minimal dependencies, relying on the robust `pdfjs-dist` library.
|
||||
|
||||
See the full [Design Philosophy](./docs/design/index.md) documentation.
|
||||
|
||||
## Comparison with Other Solutions
|
||||
|
||||
Compared to direct file access (often infeasible) or generic filesystem tools, this server offers PDF-specific parsing capabilities. Unlike external CLI tools (e.g., `pdftotext`), it provides a secure, integrated MCP interface with structured output, enhancing reliability and ease of use for AI agents.
|
||||
|
||||
See the full [Comparison](./docs/comparison/index.md) documentation.
|
||||
|
||||
## Future Plans (Roadmap)
|
||||
|
||||
- **Documentation:**
|
||||
- Finalize all documentation sections (Guide, API, Design, Comparison).
|
||||
- Resolve TypeDoc issue and generate API documentation.
|
||||
- Add more examples and advanced usage patterns.
|
||||
- Implement PWA support and mobile optimization for the docs site.
|
||||
- Add share buttons and growth metrics to the docs site.
|
||||
- **Benchmarking:**
|
||||
- Conduct comprehensive benchmarks with diverse PDF files (size, complexity).
|
||||
- Measure memory usage.
|
||||
- Compare URL vs. local file performance.
|
||||
- **Core Functionality:**
|
||||
- Explore potential optimizations for very large PDF files.
|
||||
- Investigate options for extracting images or annotations (longer term).
|
||||
- **Testing:**
|
||||
- Increase test coverage towards 100% where practical.
|
||||
- Add runtime tests once feasible.
|
||||
|
||||
## Documentation
|
||||
|
||||
For detailed usage, API reference, and guides, please visit the **[Full Documentation Website](https://sylphlab.github.io/pdf-reader-mcp/)** (Link to be updated upon deployment).
|
||||
|
||||
## Community & Support
|
||||
|
||||
- **Found a bug or have a feature request?** Please open an issue on [GitHub Issues](https://github.com/sylphlab/pdf-reader-mcp/issues).
|
||||
- **Want to contribute?** We welcome contributions! Please see [CONTRIBUTING.md](./CONTRIBUTING.md).
|
||||
- **Star & Watch:** If you find this project useful, please consider starring ⭐ and watching 👀 the repository on [GitHub](https://github.com/sylphlab/pdf-reader-mcp) to show your support and stay updated!
|
||||
|
||||
## License
|
||||
|
||||
This project is licensed under the [MIT License](./LICENSE).
|
||||
1
pdf-reader-mcp/commitlint.config.cjs
Normal file
1
pdf-reader-mcp/commitlint.config.cjs
Normal file
@@ -0,0 +1 @@
|
||||
module.exports = { extends: ['@commitlint/config-conventional'] };
|
||||
81
pdf-reader-mcp/docs/.vitepress/config.mts
Normal file
81
pdf-reader-mcp/docs/.vitepress/config.mts
Normal file
@@ -0,0 +1,81 @@
|
||||
// docs/.vitepress/config.mts
|
||||
import { defineConfig } from 'vitepress';
|
||||
|
||||
export default defineConfig({
|
||||
lang: 'en-US',
|
||||
title: 'PDF Reader MCP Server',
|
||||
description: 'MCP Server for reading PDF files securely within a project.',
|
||||
lastUpdated: true,
|
||||
|
||||
themeConfig: {
|
||||
logo: '/logo.svg', // Assuming logo is in docs/public
|
||||
nav: [
|
||||
{ text: 'Home', link: '/' },
|
||||
{ text: 'Guide', link: '/guide/' },
|
||||
{ text: 'API Reference', link: '/api/' },
|
||||
{ text: 'Design', link: '/design/' },
|
||||
{ text: 'Performance', link: '/performance/' },
|
||||
{ text: 'Comparison', link: '/comparison/' },
|
||||
],
|
||||
|
||||
sidebar: {
|
||||
'/guide/': [
|
||||
{
|
||||
text: 'Introduction',
|
||||
items: [
|
||||
{ text: 'What is PDF Reader MCP?', link: '/guide/' },
|
||||
{ text: 'Installation', link: '/guide/installation' },
|
||||
{ text: 'Getting Started', link: '/guide/getting-started' },
|
||||
],
|
||||
},
|
||||
// Add more guide sections later
|
||||
],
|
||||
'/api/': [
|
||||
{
|
||||
text: 'API Reference',
|
||||
items: [{ text: 'Tool: read_pdf', link: '/api/read_pdf' }],
|
||||
},
|
||||
],
|
||||
// Add sidebars for other sections
|
||||
'/design/': [
|
||||
{
|
||||
text: 'Design',
|
||||
items: [{ text: 'Philosophy', link: '/design/' }],
|
||||
},
|
||||
],
|
||||
'/performance/': [
|
||||
{
|
||||
text: 'Performance',
|
||||
items: [{ text: 'Benchmarks', link: '/performance/' }],
|
||||
},
|
||||
],
|
||||
'/comparison/': [
|
||||
{
|
||||
text: 'Comparison',
|
||||
items: [{ text: 'Other Solutions', link: '/comparison/' }],
|
||||
},
|
||||
],
|
||||
},
|
||||
|
||||
socialLinks: [
|
||||
{ icon: 'github', link: 'https://github.com/sylphlab/pdf-reader-mcp' },
|
||||
{ icon: 'issues', link: 'https://github.com/sylphlab/pdf-reader-mcp/issues' }, // Add link to issues
|
||||
],
|
||||
|
||||
footer: {
|
||||
message: 'Released under the MIT License. Found this useful? Give us a star ⭐ on GitHub!', // Add call-to-action
|
||||
copyright: `Copyright © ${new Date().getFullYear()} Sylph Lab`,
|
||||
},
|
||||
|
||||
// Enable edit links
|
||||
editLink: {
|
||||
pattern: 'https://github.com/sylphlab/pdf-reader-mcp/edit/main/docs/:path',
|
||||
text: 'Edit this page on GitHub',
|
||||
},
|
||||
},
|
||||
|
||||
// Enable markdown features
|
||||
markdown: {
|
||||
lineNumbers: true,
|
||||
},
|
||||
});
|
||||
5
pdf-reader-mcp/docs/api/README.md
Normal file
5
pdf-reader-mcp/docs/api/README.md
Normal file
@@ -0,0 +1,5 @@
|
||||
**@sylphlab/pdf-reader-mcp**
|
||||
|
||||
---
|
||||
|
||||
# @sylphlab/pdf-reader-mcp
|
||||
84
pdf-reader-mcp/docs/changelog.md
Normal file
84
pdf-reader-mcp/docs/changelog.md
Normal file
@@ -0,0 +1,84 @@
|
||||
# Changelog
|
||||
|
||||
All notable changes to this project will be documented in this file.
|
||||
|
||||
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
|
||||
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
||||
|
||||
## [Unreleased]
|
||||
|
||||
### Added
|
||||
|
||||
- Nothing yet.
|
||||
|
||||
## [0.3.9] - 2025-04-05
|
||||
|
||||
### Fixed
|
||||
|
||||
- Removed artifact download/extract steps from `publish-docker` job in workflow, as Docker build needs the full source context provided by checkout.
|
||||
|
||||
## [0.3.8] - 2025-04-05
|
||||
|
||||
### Fixed
|
||||
|
||||
- Removed duplicate `context: .` entry in `docker/build-push-action` step in `.github/workflows/publish.yml`.
|
||||
|
||||
## [0.3.7] - 2025-04-05
|
||||
|
||||
### Fixed
|
||||
|
||||
- Removed explicit `COPY tsconfig.json ./` from Dockerfile (rely on `COPY . .`).
|
||||
- Explicitly set `context: .` in docker build-push action.
|
||||
|
||||
## [0.3.6] - 2025-04-05
|
||||
|
||||
### Fixed
|
||||
|
||||
- Explicitly added `COPY tsconfig.json ./` before `COPY . .` in Dockerfile to ensure it exists before build step.
|
||||
|
||||
## [0.3.5] - 2025-04-05
|
||||
|
||||
### Fixed
|
||||
|
||||
- Added `RUN ls -la` before build step in Dockerfile to debug `tsconfig.json` not found error.
|
||||
|
||||
## [0.3.4] - 2025-04-05
|
||||
|
||||
### Fixed
|
||||
|
||||
- Explicitly specify `tsconfig.json` path in Dockerfile build step (`RUN ./node_modules/.bin/tsc -p tsconfig.json`) to debug build failure.
|
||||
|
||||
## [0.3.3] - 2025-04-05
|
||||
|
||||
### Fixed
|
||||
|
||||
- Changed Dockerfile build step from `RUN npm run build` to `RUN ./node_modules/.bin/tsc` to debug build failure.
|
||||
|
||||
## [0.3.2] - 2025-04-05
|
||||
|
||||
### Fixed
|
||||
|
||||
- Simplified `build` script in `package.json` to only run `tsc` (removed `chmod`) to debug Docker build failure.
|
||||
|
||||
## [0.3.1] - 2025-04-05
|
||||
|
||||
### Fixed
|
||||
|
||||
- Attempted various fixes for GitHub Actions workflow artifact upload issue (`Error: Provided artifact name input during validation is empty`). Final attempt uses fixed artifact filename in upload/download steps.
|
||||
|
||||
## [0.3.0] - 2025-04-05
|
||||
|
||||
### Added
|
||||
|
||||
- `CHANGELOG.md` file based on Keep a Changelog format.
|
||||
- `LICENSE` file (MIT License).
|
||||
- Improved GitHub Actions workflow (`.github/workflows/publish.yml`):
|
||||
- Triggers on push to `main` branch and version tags (`v*.*.*`).
|
||||
- Conditionally archives build artifacts only on tag pushes.
|
||||
- Conditionally runs `publish-npm` and `publish-docker` jobs only on tag pushes.
|
||||
- Added `create-release` job to automatically create GitHub Releases from tags, using `CHANGELOG.md` for the body.
|
||||
- Added version headers to Memory Bank files (`activeContext.md`, `progress.md`).
|
||||
|
||||
### Changed
|
||||
|
||||
- Bumped version from 0.2.2 to 0.3.0.
|
||||
34
pdf-reader-mcp/docs/comparison/index.md
Normal file
34
pdf-reader-mcp/docs/comparison/index.md
Normal file
@@ -0,0 +1,34 @@
|
||||
# Comparison with Other Solutions
|
||||
|
||||
When an AI agent needs to access information within PDF files, several approaches exist. Here's how the PDF Reader MCP Server compares:
|
||||
|
||||
1. **Direct File Access by Agent:**
|
||||
|
||||
- **Feasibility:** Often impossible. PDFs are binary; LLMs typically process text. Sending raw binary data is usually not supported or useful.
|
||||
- **Security:** Extremely risky if the agent has broad filesystem access.
|
||||
- **Efficiency:** Impractical due to file size and format.
|
||||
- **PDF Reader MCP Advantage:** Provides a secure, structured way to get _textual_ data from the binary PDF.
|
||||
|
||||
2. **Generic Filesystem MCP Server (like `@shtse8/filesystem-mcp`):**
|
||||
|
||||
- **Functionality:** Can read file _content_, but for PDFs, this would be the raw binary data, which is not directly useful to an LLM.
|
||||
- **Security:** Offers similar path confinement benefits if implemented correctly.
|
||||
- **Efficiency:** Inefficient for PDFs as it doesn't parse the content.
|
||||
- **PDF Reader MCP Advantage:** Specializes in _parsing_ PDFs to extract meaningful text and metadata.
|
||||
|
||||
3. **External CLI Tools (e.g., `pdftotext`, `pdfinfo`):**
|
||||
|
||||
- **Functionality:** Can extract text and metadata.
|
||||
- **Security:** Requires the agent host to execute arbitrary commands, potentially increasing security risks. Output might need further parsing.
|
||||
- **Efficiency:** Involves process creation overhead for each command. Communication might be less streamlined than MCP.
|
||||
- **Integration:** Requires the agent to know how to construct and interpret CLI commands and output, which can be brittle.
|
||||
- **PDF Reader MCP Advantage:** Offers a dedicated, secure MCP interface with structured JSON input/output, better integration, and potentially lower overhead for frequent operations.
|
||||
|
||||
4. **Cloud-Based PDF APIs:**
|
||||
- **Functionality:** Often provide rich features (OCR, conversion, etc.).
|
||||
- **Security:** Requires sending potentially sensitive local files to a third-party service.
|
||||
- **Efficiency:** Involves network latency and potential costs.
|
||||
- **Integration:** Requires API keys and handling HTTP requests/responses.
|
||||
- **PDF Reader MCP Advantage:** Operates entirely locally (for local files), enhancing security and privacy. No external network dependency for local operations.
|
||||
|
||||
**In summary, the PDF Reader MCP Server provides a balanced solution specifically tailored for AI agents needing secure, efficient, and structured access to PDF content within a local project context.**
|
||||
37
pdf-reader-mcp/docs/contributing.md
Normal file
37
pdf-reader-mcp/docs/contributing.md
Normal file
@@ -0,0 +1,37 @@
|
||||
# Contributing to PDF Reader MCP Server
|
||||
|
||||
Thank you for your interest in contributing!
|
||||
|
||||
## How to Contribute
|
||||
|
||||
We welcome contributions in various forms:
|
||||
|
||||
- **Reporting Bugs:** If you find a bug, please open an issue on GitHub detailing the problem, steps to reproduce, and your environment.
|
||||
- **Suggesting Enhancements:** Have an idea for a new feature or improvement? Open an issue to discuss it.
|
||||
- **Pull Requests:** If you'd like to contribute code:
|
||||
1. Fork the repository.
|
||||
2. Create a new branch for your feature or bug fix (`git checkout -b feature/your-feature-name` or `bugfix/issue-number`).
|
||||
3. Make your changes, ensuring they adhere to the project's coding style and principles (see `docs/principles.md`).
|
||||
4. Add tests for any new functionality and ensure all tests pass (`npm test`).
|
||||
5. Ensure code coverage remains high (`npm run test:cov`).
|
||||
6. Make sure your code lints correctly (`npm run lint`).
|
||||
7. Commit your changes using the [Conventional Commits](https://www.conventionalcommits.org/) standard (e.g., `feat: Add support for encrypted PDFs`, `fix: Correct page range parsing`).
|
||||
8. Push your branch to your fork (`git push origin feature/your-feature-name`).
|
||||
9. Open a Pull Request against the `main` branch of the original repository.
|
||||
|
||||
## Development Setup
|
||||
|
||||
1. Clone your fork.
|
||||
2. Install dependencies: `npm install`
|
||||
3. Build the project: `npm run build`
|
||||
4. Run in watch mode during development: `npm run watch`
|
||||
5. Run tests: `npm test` or `npm run test:watch`
|
||||
|
||||
## Code Style
|
||||
|
||||
Please ensure your code adheres to the formatting and linting rules defined in the project:
|
||||
|
||||
- Run `npm run format` to format your code with Prettier.
|
||||
- Run `npm run lint` to check for ESLint issues.
|
||||
|
||||
Thank you for contributing!
|
||||
26
pdf-reader-mcp/docs/design/index.md
Normal file
26
pdf-reader-mcp/docs/design/index.md
Normal file
@@ -0,0 +1,26 @@
|
||||
# Design Philosophy
|
||||
|
||||
The PDF Reader MCP Server is built upon several core principles:
|
||||
|
||||
1. **Security First:**
|
||||
|
||||
- **Context Confinement:** The absolute primary goal. All local file access _must_ be restricted to the directory (and its subdirectories) where the server process is launched. This prevents the AI agent from accessing unintended files on the user's system.
|
||||
- **Path Validation:** Rigorous validation of all incoming paths using a dedicated `resolvePath` function ensures they are relative and resolve within the designated project root.
|
||||
- **No Arbitrary Execution:** The server only performs PDF reading operations, not arbitrary file system modifications or command execution.
|
||||
|
||||
2. **Efficiency & Resourcefulness:**
|
||||
|
||||
- **Structured Data:** Instead of sending potentially huge raw PDF content (which is often impractical for LLMs), the server extracts specific, structured information (text, metadata, page count).
|
||||
- **Targeted Extraction:** Allows requesting text from specific pages, minimizing the amount of data transferred and processed.
|
||||
- **Asynchronous Operations:** Uses Node.js async I/O to avoid blocking the event loop during file access and PDF parsing.
|
||||
|
||||
3. **Simplicity & Ease of Integration:**
|
||||
|
||||
- **Single Tool Focus:** Consolidates functionality into a single `read_pdf` tool with clear parameters, making it easier for AI agents to learn and use.
|
||||
- **Standard MCP:** Leverages the `@modelcontextprotocol/sdk` for standard communication and error handling.
|
||||
- **Clear Schemas:** Uses Zod for defining and validating input, providing clear contracts for tool usage.
|
||||
- **Multiple Invocation Methods:** Supports easy use via `npx` or Docker for straightforward deployment in various MCP host environments.
|
||||
|
||||
4. **Minimalism & Reliability:**
|
||||
- **Minimal Dependencies:** Relies primarily on the robust and widely-used `pdfjs-dist` library for core PDF parsing, minimizing external failure points.
|
||||
- **Clear Error Reporting:** Provides specific error messages when processing fails for a source, allowing the agent to understand the issue.
|
||||
83
pdf-reader-mcp/docs/guide/getting-started.md
Normal file
83
pdf-reader-mcp/docs/guide/getting-started.md
Normal file
@@ -0,0 +1,83 @@
|
||||
# Getting Started
|
||||
|
||||
This guide assumes you have an MCP client or host environment capable of launching and communicating with the PDF Reader MCP Server.
|
||||
|
||||
## 1. Launch the Server
|
||||
|
||||
Ensure the server is launched with its **working directory set to the root of the project** containing the PDF files you want to access.
|
||||
|
||||
- **If installed via npm/pnpm:** Your MCP host might manage this automatically via `npx @sylphlab/pdf-reader-mcp`.
|
||||
- **If running standalone:** `cd /path/to/your/project && node /path/to/pdf-reader-mcp/build/index.js`
|
||||
- **If using Docker:** `docker run -i --rm -v "/path/to/your/project:/app" sylphlab/pdf-reader-mcp:latest`
|
||||
|
||||
## 2. Using the `read_pdf` Tool
|
||||
|
||||
The server provides a single primary tool: `read_pdf`.
|
||||
|
||||
**Tool Input Schema:**
|
||||
|
||||
The `read_pdf` tool accepts an object with the following properties:
|
||||
|
||||
- `sources` (`Array<Object>`, required): An array of PDF sources to process. Each source object must contain either a `path` or a `url`.
|
||||
- `path` (string, optional): Relative path to the local PDF file within the project root.
|
||||
- `url` (string, optional): URL of the PDF file.
|
||||
- `pages` (`Array<number> | string`, optional): Extract text only from specific pages (1-based) or ranges (e.g., `'1-3, 5'`). If provided, `include_full_text` is ignored for this source.
|
||||
- `include_full_text` (boolean, optional, default: `false`): Include the full text content of each PDF (only if `pages` is not specified for that source).
|
||||
- `include_metadata` (boolean, optional, default: `true`): Include metadata and info objects for each PDF.
|
||||
- `include_page_count` (boolean, optional, default: `true`): Include the total number of pages for each PDF.
|
||||
|
||||
_(See the [API Reference](../api/) (once generated) for the full JSON schema.)_
|
||||
|
||||
**Example MCP Request (Get metadata and page count for one PDF):**
|
||||
|
||||
```json
|
||||
{
|
||||
"tool_name": "read_pdf",
|
||||
"arguments": {
|
||||
"sources": [{ "path": "./documents/report.pdf" }],
|
||||
"include_metadata": true,
|
||||
"include_page_count": true,
|
||||
"include_full_text": false
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
**Example MCP Request (Get text from page 2 of one PDF, full text of another):**
|
||||
|
||||
```jsonc
|
||||
{
|
||||
"tool_name": "read_pdf",
|
||||
"arguments": {
|
||||
"sources": [
|
||||
{
|
||||
"path": "./invoices/inv-001.pdf",
|
||||
"pages": [2] // Get only page 2 text
|
||||
},
|
||||
{
|
||||
"url": "https://example.com/whitepaper.pdf"
|
||||
// No 'pages', so 'include_full_text' applies
|
||||
}
|
||||
],
|
||||
"include_metadata": false,
|
||||
"include_page_count": false,
|
||||
"include_full_text": true // Applies only to the URL source
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
## 3. Understanding the Response
|
||||
|
||||
The response will be an array named `results`, with each element corresponding to a source object in the request array. Each result object contains:
|
||||
|
||||
- `source` (string): The original path or URL provided in the request.
|
||||
- `success` (boolean): Indicates if processing this source was successful.
|
||||
- `data` (Object, optional): Present if `success` is `true`. Contains the requested data:
|
||||
- `num_pages` (number, optional): Total page count (if `include_page_count` was true).
|
||||
- `info` (Object, optional): PDF information dictionary (if `include_metadata` was true).
|
||||
- `metadata` (Object, optional): PDF metadata (if `include_metadata` was true).
|
||||
- `page_texts` (`Array<Object>`, optional): Array of objects, each with `page` (number) and `text` (string), for pages where text was extracted (if `pages` was specified or `include_full_text` was true without `pages`).
|
||||
- `error` (Object, optional): Present if `success` is `false`. Contains:
|
||||
- `code` (string): An error code (e.g., `FileNotFound`, `InvalidRequest`, `PdfParsingError`, `DownloadError`, `UnknownError`).
|
||||
- `message` (string): A description of the error.
|
||||
|
||||
_(See the [API Reference](../api/) (once generated) for detailed response structure and error codes.)_
|
||||
22
pdf-reader-mcp/docs/guide/index.md
Normal file
22
pdf-reader-mcp/docs/guide/index.md
Normal file
@@ -0,0 +1,22 @@
|
||||
# Introduction
|
||||
|
||||
Welcome to the PDF Reader MCP Server documentation!
|
||||
|
||||
This server provides a secure and efficient way for AI agents (like Cline) using the Model Context Protocol (MCP) to interact with PDF files located within a user's project directory.
|
||||
|
||||
## What Problem Does It Solve?
|
||||
|
||||
AI agents often need information from PDFs (reports, invoices, manuals). Directly feeding PDF content is impractical due to format and size. This server offers specific tools to extract:
|
||||
|
||||
- Full text content
|
||||
- Text from specific pages
|
||||
- Metadata (author, title, etc.)
|
||||
- Total page count
|
||||
|
||||
All interactions happen securely within the defined project boundaries.
|
||||
|
||||
## Core Principles
|
||||
|
||||
- **Security:** Confined file access.
|
||||
- **Efficiency:** Structured data retrieval, avoiding large raw content transfer.
|
||||
- **Simplicity:** Easy integration into MCP-enabled agent workflows.
|
||||
58
pdf-reader-mcp/docs/guide/installation.md
Normal file
58
pdf-reader-mcp/docs/guide/installation.md
Normal file
@@ -0,0 +1,58 @@
|
||||
# Installation
|
||||
|
||||
## Prerequisites
|
||||
|
||||
- Node.js (>= 18.0.0 recommended)
|
||||
- npm (comes with Node.js)
|
||||
|
||||
## Using npm (Recommended)
|
||||
|
||||
To use the server in your project or MCP host environment, install it as a dependency:
|
||||
|
||||
```bash
|
||||
npm install @sylphlab/pdf-reader-mcp
|
||||
```
|
||||
|
||||
## Running Standalone (for testing/development)
|
||||
|
||||
1. **Clone the repository:**
|
||||
|
||||
```bash
|
||||
git clone https://github.com/sylphlab/pdf-reader-mcp.git
|
||||
cd pdf-reader-mcp
|
||||
```
|
||||
|
||||
2. **Install dependencies:**
|
||||
|
||||
```bash
|
||||
npm install
|
||||
```
|
||||
|
||||
3. **Build the project:**
|
||||
|
||||
```bash
|
||||
npm run build
|
||||
```
|
||||
|
||||
4. **Run the server:**
|
||||
The server communicates via stdio. You'll typically run it from an MCP host.
|
||||
```bash
|
||||
node build/index.js
|
||||
```
|
||||
**Important:** Ensure you run this command from the root directory of the project containing the PDFs you want the server to access.
|
||||
|
||||
## Using Docker
|
||||
|
||||
A Docker image is available on Docker Hub.
|
||||
|
||||
```bash
|
||||
docker pull sylphlab/pdf-reader-mcp:latest
|
||||
```
|
||||
|
||||
To run the container, you need to mount the project directory containing your PDFs into the container's working directory (`/app`):
|
||||
|
||||
```bash
|
||||
docker run -i --rm -v "/path/to/your/project:/app" sylphlab/pdf-reader-mcp:latest
|
||||
```
|
||||
|
||||
Replace `/path/to/your/project` with the actual absolute path to your project folder.
|
||||
26
pdf-reader-mcp/docs/index.md
Normal file
26
pdf-reader-mcp/docs/index.md
Normal file
@@ -0,0 +1,26 @@
|
||||
---
|
||||
layout: home
|
||||
|
||||
hero:
|
||||
name: 'PDF Reader MCP Server'
|
||||
text: 'Securely Read PDFs via MCP.'
|
||||
tagline: An MCP server enabling AI agents to read text, metadata, and page counts from PDF files within a project's context.
|
||||
image:
|
||||
src: /logo.svg
|
||||
alt: PDF Reader MCP Logo
|
||||
actions:
|
||||
- theme: brand
|
||||
text: Get Started
|
||||
link: /guide/getting-started
|
||||
- theme: alt
|
||||
text: View on GitHub
|
||||
link: https://github.com/sylphlab/pdf-reader-mcp
|
||||
|
||||
features:
|
||||
- title: Secure Context
|
||||
details: All operations are strictly confined to the project directory where the server is launched.
|
||||
- title: Structured Data
|
||||
details: Returns parsed text, metadata, and page counts in a structured format via MCP.
|
||||
- title: Efficient & Focused
|
||||
details: Uses pdfjs-dist for reliable parsing. Designed for integration with AI agent workflows.
|
||||
---
|
||||
21
pdf-reader-mcp/docs/license.md
Normal file
21
pdf-reader-mcp/docs/license.md
Normal file
@@ -0,0 +1,21 @@
|
||||
MIT License
|
||||
|
||||
Copyright (c) 2025 Sylph Lab
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in all
|
||||
copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
SOFTWARE.
|
||||
41
pdf-reader-mcp/docs/performance.md
Normal file
41
pdf-reader-mcp/docs/performance.md
Normal file
@@ -0,0 +1,41 @@
|
||||
# Performance
|
||||
|
||||
Performance is a key consideration for the PDF Reader MCP Server, as slow responses can negatively impact the interaction flow of AI agents.
|
||||
|
||||
## Core Library: `pdfjs-dist`
|
||||
|
||||
The server relies on Mozilla's [pdf.js](https://mozilla.github.io/pdf.js/) (specifically the `pdfjs-dist` distribution) for the heavy lifting of PDF parsing. This library is widely used and generally considered performant for standard PDF documents. However, performance can vary depending on:
|
||||
|
||||
- **PDF Complexity:** Documents with many pages, complex graphics, large embedded fonts, or non-standard structures may take longer to parse.
|
||||
- **Requested Data:** Extracting full text from a very large document will naturally take longer than just retrieving metadata or the page count. Requesting text from only a few specific pages is usually more efficient than extracting the entire text.
|
||||
- **Server Resources:** The performance will also depend on the CPU and memory resources available to the Node.js process running the server.
|
||||
|
||||
## Asynchronous Operations
|
||||
|
||||
All potentially long-running operations, including file reading (for local PDFs), network requests (for URL PDFs), and PDF parsing itself, are handled asynchronously using `async/await`. This prevents the server from blocking the Node.js event loop and allows it to handle other requests or tasks concurrently (though typically an MCP server handles one request at a time from its host).
|
||||
|
||||
## Benchmarking (Planned)
|
||||
|
||||
_(Section to be added)_
|
||||
|
||||
Formal benchmarking is planned to quantify the performance characteristics of the `read_pdf` tool under various conditions.
|
||||
|
||||
**Goals:**
|
||||
|
||||
- Measure the time taken to extract metadata, page count, specific pages, and full text for PDFs of varying sizes and complexities.
|
||||
- Compare the performance of processing local files vs. URLs (network latency will be a factor for URLs).
|
||||
- Identify potential bottlenecks within the handler logic or the `pdfjs-dist` library usage.
|
||||
- Establish baseline performance metrics to track potential regressions in the future.
|
||||
|
||||
**Tools:**
|
||||
|
||||
- We plan to use [Vitest's built-in benchmarking](https://vitest.dev/guide/features.html#benchmarking) (`bench` function) or a dedicated library like [`tinybench`](https://github.com/tinylibs/tinybench).
|
||||
|
||||
Benchmark results will be published in this section once available.
|
||||
|
||||
## Current Optimization Considerations
|
||||
|
||||
- **Lazy Loading:** The `pdfjs-dist` library loads pages on demand when `pdfDocument.getPage()` is called. This means that if only metadata or page count is requested, the entire document's page content doesn't necessarily need to be parsed immediately.
|
||||
- **Selective Extraction:** The ability to request specific pages (`pages` parameter) allows agents to avoid the cost of extracting text from the entire document if only a small portion is needed.
|
||||
|
||||
_(This section will be updated with concrete data and findings as benchmarking is performed.)_
|
||||
45
pdf-reader-mcp/docs/performance/index.md
Normal file
45
pdf-reader-mcp/docs/performance/index.md
Normal file
@@ -0,0 +1,45 @@
|
||||
# Performance
|
||||
|
||||
Performance is an important consideration for the PDF Reader MCP Server, especially when dealing with large or complex PDF documents. This page outlines the benchmarking approach and presents results from initial tests.
|
||||
|
||||
## Benchmarking Setup
|
||||
|
||||
Benchmarks are conducted using the [Vitest](https://vitest.dev/) testing framework's built-in `bench` functionality. The tests measure the number of operations per second (hz) for different scenarios using the `read_pdf` handler.
|
||||
|
||||
- **Environment:** Node.js (latest LTS) on Windows 11
|
||||
- **Test File:** A sample PDF located at `test/fixtures/sample.pdf`. The exact characteristics of this file (size, page count, complexity) will influence the results.
|
||||
- **Methodology:** Each scenario is run for a fixed duration (1000ms) to determine the average operations per second. The benchmark code can be found in `test/benchmark/readPdf.bench.ts`.
|
||||
|
||||
## Initial Benchmark Results
|
||||
|
||||
The following results were obtained on 2025-04-07 using the setup described above:
|
||||
|
||||
| Scenario | Operations per Second (hz) | Relative Speed |
|
||||
| :------------------------------- | :------------------------- | :------------- |
|
||||
| Handle Non-Existent File | ~12,933 | Fastest |
|
||||
| Get Full Text | ~5,575 | |
|
||||
| Get Specific Page (Page 1) | ~5,329 | |
|
||||
| Get Specific Pages (Pages 1 & 2) | ~5,242 | |
|
||||
| Get Metadata & Page Count | ~4,912 | Slowest |
|
||||
|
||||
_(Higher hz indicates better performance)_
|
||||
|
||||
**Interpretation:**
|
||||
|
||||
- Handling errors for non-existent files is the fastest operation as it involves minimal I/O and no PDF parsing.
|
||||
- Extracting the full text was slightly faster than extracting specific pages or just metadata/page count in this particular test run. This might be influenced by the specific structure of `sample.pdf` and potential caching mechanisms within the `pdfjs-dist` library.
|
||||
- Extracting only metadata and page count was slightly slower than full text extraction for this file.
|
||||
|
||||
**Note:** These results are specific to the `sample.pdf` file and the testing environment used. Performance with different PDFs (varying sizes, complexities, versions, or structures) may differ significantly.
|
||||
|
||||
## Future Benchmarking Goals
|
||||
|
||||
Further benchmarks are planned to measure:
|
||||
|
||||
- **Parsing Time:** Time taken to load and parse PDFs of varying sizes (e.g., 1 page, 10 pages, 100 pages, 1000 pages).
|
||||
- **Text Extraction Speed:** More detailed analysis across different page ranges and document structures.
|
||||
- **Memory Usage:** Peak memory consumption during processing of different PDF sizes.
|
||||
- **URL vs. Local File:** Performance difference between processing local files and downloading/processing from URLs.
|
||||
- **Comparison:** Comparison with other PDF processing methods or libraries, if applicable.
|
||||
|
||||
Results will be updated here as more comprehensive testing is completed.
|
||||
45
pdf-reader-mcp/docs/principles.md
Normal file
45
pdf-reader-mcp/docs/principles.md
Normal file
@@ -0,0 +1,45 @@
|
||||
# Development Principles
|
||||
|
||||
This project adheres to the following core principles, derived from the TypeScript Project Development Guidelines it was built against:
|
||||
|
||||
## 1. Impact-Driven
|
||||
|
||||
The primary goal is to solve the real problem of AI agents needing access to PDF content securely and efficiently. Features are added to serve this core purpose.
|
||||
|
||||
## 2. Simplicity & Minimalism
|
||||
|
||||
We aim for the most direct approach:
|
||||
|
||||
- A single, consolidated `read_pdf` tool instead of multiple specific tools.
|
||||
- Leveraging the robust `pdfjs-dist` library for core parsing.
|
||||
- Avoiding unnecessary abstractions.
|
||||
|
||||
## 3. Functional Programming Style (Influences)
|
||||
|
||||
While not strictly functional, the code emphasizes:
|
||||
|
||||
- Pure helper functions where possible (like path resolution checks).
|
||||
- Minimizing side effects within core logic (parsing doesn't alter files).
|
||||
- Using standard asynchronous patterns (`async/await`) effectively.
|
||||
|
||||
## 4. Minimal Dependencies
|
||||
|
||||
- Core functionality relies on `@modelcontextprotocol/sdk` and `pdfjs-dist`.
|
||||
- Development dependencies are standard tools (TypeScript, ESLint, Prettier, Vitest).
|
||||
- Dependencies like `glob`, `zod`, `zod-to-json-schema` provide essential validation and utility.
|
||||
- Unused dependencies inherited from the template (`diff`, `detect-indent`) have been removed.
|
||||
|
||||
## 5. Code Quality & Consistency
|
||||
|
||||
- **Strict TypeScript:** Using the strictest compiler options (`strict: true`, etc.).
|
||||
- **Rigorous Linting:** Employing ESLint with recommended and strict type-checked rules.
|
||||
- **Consistent Formatting:** Enforced by Prettier.
|
||||
- **Comprehensive Testing:** Aiming for high test coverage (currently ~95%) using Vitest, with a 100% threshold configured.
|
||||
|
||||
## 6. Security Focus
|
||||
|
||||
- Path traversal prevention is critical. All file paths are resolved relative to the project root and validated.
|
||||
|
||||
## 7. No Sponsorship
|
||||
|
||||
This project does not accept financial contributions, and all related information has been removed.
|
||||
5
pdf-reader-mcp/docs/public/logo.svg
Normal file
5
pdf-reader-mcp/docs/public/logo.svg
Normal file
@@ -0,0 +1,5 @@
|
||||
<!-- Placeholder Logo -->
|
||||
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 100 100">
|
||||
<rect width="100" height="100" fill="#cccccc"/>
|
||||
<text x="50%" y="50%" dominant-baseline="middle" text-anchor="middle" font-size="12" fill="#333333">LOGO</text>
|
||||
</svg>
|
||||
|
After Width: | Height: | Size: 259 B |
60
pdf-reader-mcp/docs/testing.md
Normal file
60
pdf-reader-mcp/docs/testing.md
Normal file
@@ -0,0 +1,60 @@
|
||||
# Testing Strategy
|
||||
|
||||
Robust testing is essential for ensuring the reliability, correctness, and security of the PDF Reader MCP Server. We employ a multi-faceted testing approach using Vitest.
|
||||
|
||||
## Framework: Vitest
|
||||
|
||||
We use [Vitest](https://vitest.dev/) as our primary testing framework. Its key advantages include:
|
||||
|
||||
- **Speed:** Fast execution powered by Vite.
|
||||
- **Modern Features:** Supports ES Modules, TypeScript out-of-the-box.
|
||||
- **Compatibility:** Familiar API similar to Jest.
|
||||
- **Integrated Coverage:** Built-in support for code coverage analysis using `v8` or `istanbul`.
|
||||
|
||||
## Goals & Approach
|
||||
|
||||
Our testing strategy focuses on:
|
||||
|
||||
1. **High Code Coverage:**
|
||||
|
||||
- **Target:** 100% statement, branch, function, and line coverage.
|
||||
- **Configuration:** Enforced via `thresholds` in `vitest.config.ts`.
|
||||
- **Current Status:** ~95%. The remaining uncovered lines are primarily in error handling paths that are difficult to trigger due to Zod's upfront validation or represent extreme edge cases. This level is currently accepted.
|
||||
- **Tool:** Coverage reports generated using `@vitest/coverage-v8`.
|
||||
|
||||
2. **Correctness & Functionality:**
|
||||
|
||||
- **Unit Tests:** (Currently minimal, focus is on integration) Could test utility functions like `pathUtils` in isolation.
|
||||
- **Integration Tests:** The primary focus is testing the `read_pdf` handler (`test/handlers/readPdf.test.ts`) with mocked dependencies (`pdfjs-dist`, `fs`). These tests verify:
|
||||
- Correct parsing of various input arguments (paths, URLs, page selections, flags).
|
||||
- Successful extraction of full text, specific page text, metadata, and page counts.
|
||||
- Handling of multiple sources (local and URL) within a single request.
|
||||
- Correct formatting of the JSON response.
|
||||
- Graceful error handling for invalid inputs (caught by Zod or handler logic).
|
||||
- Correct error reporting for file-not-found errors.
|
||||
- Correct error reporting for PDF loading/parsing failures (mocked).
|
||||
- Proper handling of warnings (e.g., requested pages out of bounds).
|
||||
- **Security:** Path resolution logic (`resolvePath`) is tested separately (`test/pathUtils.test.ts`) to ensure it prevents path traversal and correctly handles relative paths within the project root.
|
||||
|
||||
3. **Reliability & Consistency:**
|
||||
- Tests are designed to be independent and repeatable.
|
||||
- Mocking is used extensively to isolate the handler logic from external factors.
|
||||
|
||||
## Running Tests
|
||||
|
||||
Use the following npm scripts:
|
||||
|
||||
- **`npm test`**: Run all tests once.
|
||||
- **`npm run test:watch`**: Run tests in an interactive watch mode, re-running on file changes.
|
||||
- **`npm run test:cov`**: Run all tests and generate a detailed coverage report in the `./coverage/` directory (view `index.html` in that directory for an interactive report). This command will fail if coverage thresholds are not met.
|
||||
|
||||
## Test File Structure
|
||||
|
||||
- Tests reside in the `test/` directory, mirroring the `src/` structure.
|
||||
- Handler tests are in `test/handlers/`.
|
||||
- Utility tests are in `test/utils/`.
|
||||
|
||||
## Future Improvements
|
||||
|
||||
- Consider adding end-to-end tests using a test MCP client/host.
|
||||
- Explore property-based testing for more robust input validation checks.
|
||||
93
pdf-reader-mcp/eslint.config.js
Normal file
93
pdf-reader-mcp/eslint.config.js
Normal file
@@ -0,0 +1,93 @@
|
||||
import eslint from '@eslint/js';
|
||||
import tseslint from 'typescript-eslint';
|
||||
import eslintConfigPrettier from 'eslint-config-prettier'; // Import prettier config
|
||||
|
||||
export default tseslint.config(
|
||||
eslint.configs.recommended,
|
||||
...tseslint.configs.recommended, // Basic recommended rules - Apply broadly
|
||||
{
|
||||
// Global ignores
|
||||
ignores: [
|
||||
'node_modules/',
|
||||
'build/',
|
||||
'dist/', // Add dist
|
||||
'coverage/', // Add coverage
|
||||
'docs/.vitepress/cache/', // Ignore vitepress cache
|
||||
'docs/.vitepress/dist/', // Ignore vitepress build output
|
||||
'eslint.config.js',
|
||||
],
|
||||
},
|
||||
// Configuration specific to TypeScript files, including type-aware rules
|
||||
...tseslint.config({
|
||||
files: ['**/*.ts'],
|
||||
extends: [
|
||||
...tseslint.configs.strictTypeChecked, // Apply strictest type-aware rules ONLY to TS files
|
||||
...tseslint.configs.stylisticTypeChecked, // Apply stylistic rules requiring TS config
|
||||
],
|
||||
languageOptions: {
|
||||
parserOptions: {
|
||||
project: './tsconfig.eslint.json', // Point to specific tsconfig for ESLint
|
||||
tsconfigRootDir: import.meta.dirname,
|
||||
},
|
||||
},
|
||||
rules: {
|
||||
// General JS/TS Rules (applied within TS context)
|
||||
'no-console': ['warn', { allow: ['warn', 'error', 'info'] }],
|
||||
'prefer-const': 'error',
|
||||
eqeqeq: ['error', 'always'],
|
||||
'no-unused-vars': 'off', // Use TS version
|
||||
complexity: ['error', { max: 10 }],
|
||||
'max-lines': ['warn', { max: 300, skipBlankLines: true, skipComments: true }],
|
||||
'max-lines-per-function': ['warn', { max: 50, skipBlankLines: true, skipComments: true }],
|
||||
'max-depth': ['warn', 3],
|
||||
'max-params': ['warn', 4],
|
||||
|
||||
// TypeScript Specific Rules (override/add)
|
||||
'@typescript-eslint/no-unused-vars': [
|
||||
'error',
|
||||
{ argsIgnorePattern: '^_', varsIgnorePattern: '^_' },
|
||||
],
|
||||
'@typescript-eslint/no-explicit-any': 'error',
|
||||
'@typescript-eslint/explicit-function-return-type': 'error',
|
||||
'@typescript-eslint/no-non-null-assertion': 'error',
|
||||
'@typescript-eslint/no-use-before-define': 'error',
|
||||
'@typescript-eslint/no-floating-promises': 'error',
|
||||
'@typescript-eslint/consistent-type-imports': 'error',
|
||||
'@typescript-eslint/no-misused-promises': 'error',
|
||||
'@typescript-eslint/prefer-readonly': 'warn',
|
||||
},
|
||||
}),
|
||||
{
|
||||
// Configuration for specific files to relax rules
|
||||
files: [
|
||||
'src/handlers/readPdf.ts',
|
||||
'test/**/*.ts', // Includes .test.ts and .bench.ts
|
||||
],
|
||||
rules: {
|
||||
complexity: 'off',
|
||||
'max-lines': 'off',
|
||||
'max-lines-per-function': 'off',
|
||||
'max-depth': 'off', // Also disable max-depth for these complex files/tests
|
||||
'@typescript-eslint/no-unsafe-call': 'warn', // Downgrade unsafe-call to warning for tests if needed
|
||||
'@typescript-eslint/no-unsafe-assignment': 'warn', // Downgrade related rule
|
||||
'@typescript-eslint/no-unsafe-member-access': 'warn', // Downgrade related rule
|
||||
},
|
||||
},
|
||||
{
|
||||
// Configuration for JavaScript files (CommonJS like config files)
|
||||
files: ['**/*.js', '**/*.cjs'], // Include .cjs files
|
||||
languageOptions: {
|
||||
globals: {
|
||||
module: 'readonly', // Define CommonJS globals
|
||||
require: 'readonly',
|
||||
process: 'readonly',
|
||||
__dirname: 'readonly',
|
||||
},
|
||||
},
|
||||
rules: {
|
||||
// Add JS/CJS specific rules if needed
|
||||
'@typescript-eslint/no-var-requires': 'off', // Allow require in CJS if needed
|
||||
},
|
||||
},
|
||||
eslintConfigPrettier // Add prettier config last to override other formatting rules
|
||||
);
|
||||
104
pdf-reader-mcp/memory-bank/activeContext.md
Normal file
104
pdf-reader-mcp/memory-bank/activeContext.md
Normal file
@@ -0,0 +1,104 @@
|
||||
<!-- Version: 1.36 | Last Updated: 2025-04-07 | Updated By: Sylph -->
|
||||
|
||||
# Active Context: PDF Reader MCP Server (Guidelines Alignment)
|
||||
|
||||
## 1. Current Focus
|
||||
|
||||
Project alignment and documentation according to Sylph Lab Playbook guidelines are complete. CI workflow fixed (formatting, publish step, Dockerfile, parallelization, pre-commit hook), Test Analytics integrated, and Git history corrected multiple times. Dockerfile updated to use LTS Node. Version bumped to `0.3.16` and pushed successfully.
|
||||
|
||||
## 2. Recent Changes (Chronological Summary)
|
||||
|
||||
- Cloned `filesystem-mcp` as a base.
|
||||
- Updated `package.json` (name, version, description).
|
||||
- Implemented initial PDF tools using `pdf-parse`.
|
||||
- Removed unused filesystem handlers.
|
||||
- Added URL support to `pdf-parse` based tools.
|
||||
- Consolidated tools into a single `read_pdf` handler.
|
||||
- **Switched PDF Library:** Uninstalled `pdf-parse`, installed `pdfjs-dist`.
|
||||
- Rewrote the `read_pdf` handler (`src/handlers/readPdf.ts`) to use `pdfjs-dist`.
|
||||
- Updated `README.md` and Memory Bank files to reflect the switch to `pdfjs-dist` and the consolidated tool.
|
||||
- **Added Multiple Source Support & Per-Source Pages:** Modified `read_pdf` handler and schema to accept an array of `sources`. Moved the optional `pages` parameter into each source object.
|
||||
- Created `CHANGELOG.md` and `LICENSE`.
|
||||
- Updated `.github/workflows/publish.yml` initially.
|
||||
- **Guidelines Alignment (Initial):**
|
||||
- Removed sponsorship information (`.github/FUNDING.yml`, `README.md` badges).
|
||||
- Updated `package.json` scripts (`lint`, `format`, `validate`, added `test:watch`, etc.) and removed unused dependencies.
|
||||
- Verified `tsconfig.json`, `eslint.config.js`, `.prettierrc.cjs`, `vitest.config.ts` alignment.
|
||||
- Updated `.gitignore`.
|
||||
- Refactored GitHub Actions workflow to `.github/workflows/ci.yml`.
|
||||
- Added tests (~95% coverage).
|
||||
- Updated Project Identity (`sylphlab` scope).
|
||||
- **Guidelines Alignment (Configuration Deep Dive):**
|
||||
- Updated `package.json` with missing metadata, dev dependencies (`husky`, `lint-staged`, `commitlint`, `typedoc`, `standard-version`), scripts (`start`, `typecheck`, `prepare`, `benchmark`, `release`, `clean`, `docs:api`, `prepublishOnly`), and `files` array.
|
||||
- Updated `tsconfig.json` with missing compiler options and refined `exclude` array.
|
||||
- Updated `eslint.config.js` to enable `stylisticTypeChecked`, enforce stricter rules (`no-unused-vars`, `no-explicit-any` to `error`), and add missing recommended rules.
|
||||
- Created `.github/dependabot.yml` for automated dependency updates.
|
||||
- Updated `.github/workflows/ci.yml` to use fixed Action versions and add Coveralls integration.
|
||||
- Set up Git Hooks using Husky (`pre-commit` with `lint-staged`, `commit-msg` with `commitlint`) and created `commitlint.config.cjs`.
|
||||
- **Benchmarking & Documentation:**
|
||||
- Created initial benchmark file, fixed TS errors, and successfully ran benchmarks (`pnpm run benchmark`) after user provided `test/fixtures/sample.pdf`.
|
||||
- Updated `docs/performance/index.md` with benchmark setup and initial results.
|
||||
- **API Doc Generation:**
|
||||
- Initially encountered persistent TypeDoc v0.28.1 initialization error with Node.js script.
|
||||
- **Resolved:** Changed `docs:api` script in `package.json` to directly call TypeDoc CLI (`typedoc --entryPoints ...`). Successfully generated API docs.
|
||||
- **Documentation Finalization:**
|
||||
- Reviewed and updated `README.md`, `docs/guide/getting-started.md`, and VitePress config (`docs/.vitepress/config.mts`) based on guidelines.
|
||||
- **Code Commit:** Committed and pushed all recent changes.
|
||||
- **CI Fixes & Enhancements:**
|
||||
- Fixed Prettier formatting issues identified by CI.
|
||||
- Fixed ESLint errors/warnings (`no-undef`, `no-unused-vars`, `no-unsafe-call`, `require-await`, unused eslint-disable) identified by CI.
|
||||
- Deleted unused `scripts/generate-api-docs.mjs` file.
|
||||
- **Fixed `pnpm publish` error:** Added `--no-git-checks` flag to the publish command in `.github/workflows/ci.yml` to resolve `ERR_PNPM_GIT_UNCLEAN` error during tag-triggered publish jobs.
|
||||
- **Integrated Codecov Test Analytics:** Updated `package.json` to generate JUnit XML test reports and added `codecov/test-results-action@v1` to `.github/workflows/ci.yml` to upload them.
|
||||
- Added `test-report.junit.xml` to `.gitignore`.
|
||||
- **Switched Coverage Tool:** Updated `.github/workflows/ci.yml` to replace Coveralls with Codecov based on user feedback. Added Codecov badge to `README.md`.
|
||||
- **Version Bump & CI Saga (0.3.11 -> 0.3.16):**
|
||||
- **Initial Goal (0.3.11):** Fix CI publish error (`--no-git-checks`), integrate Test Analytics, add `.gitignore` entry.
|
||||
- **Problem 1:** Incorrect Git history manipulation led to pushing an incomplete `v0.3.11`.
|
||||
- **Problem 2:** Force push/re-push of corrected `v0.3.11` / `v0.3.12` / `v0.3.13` / `v0.3.14` tags didn't trigger workflow or failed on CI checks.
|
||||
- **Problem 3:** CI failed on `check-format` due to unformatted `ci.yml` / `CHANGELOG.md` (not caught by pre-commit hook initially).
|
||||
- **Problem 4:** Further Git history confusion led to incorrect version bumps (`0.3.13`, `0.3.14`, `0.3.15`) and tag creation issues due to unstaged changes and leftover local tags.
|
||||
- **Problem 5:** Docker build failed due to incorrect lockfile and missing `pnpm` install in `Dockerfile`.
|
||||
- **Problem 6:** Workflow parallelization changes were not committed before attempting a release.
|
||||
- **Problem 7:** `publish-npm` job failed due to missing dependencies for `prepublishOnly` script.
|
||||
- **Problem 8:** `pre-commit` hook was running `pnpm test` instead of `pnpm lint-staged`.
|
||||
- **Problem 9:** Docker build failed again due to `husky` command not found during `pnpm prune`.
|
||||
- **Problem 10:** Dockerfile was using hardcoded `node:20-alpine` instead of `node:lts-alpine`.
|
||||
- **Final Resolution:** Reset history multiple times, applied fixes sequentially (formatting `fe7eda1`, Dockerfile pnpm install `c202fd4`, parallelization `a569b62`, pre-commit/npm-publish fix `e96680c`, Dockerfile prune fix `02f3f91`, Dockerfile LTS `50f9bdd`), ensured clean working directory, ran `standard-version` successfully to create `v0.3.16` commit and tag, pushed `main` and tag `v0.3.16`.
|
||||
- **Fixed `package.json` Paths:** Corrected `bin`, `files`, and `start` script paths from `build/` to `dist/` to align with `tsconfig.json` output directory and resolve executable error.
|
||||
- **Committed & Pushed Fix:** Committed (`ab1100d`) and pushed the `package.json` path fix to `main`.
|
||||
- **Version Bump & Push:** Bumped version to `0.3.17` using `standard-version` (commit `bb9d2e5`) and pushed the commit and tag `v0.3.17` to `main`.
|
||||
|
||||
## 3. Next Steps
|
||||
|
||||
- **Build Completed:** Project successfully built (`pnpm run build`).
|
||||
- **GitHub Actions Status:**
|
||||
- Pushed commit `c150022` (CI run `14298157760` **passed** format/lint/test checks, but **failed** at Codecov upload due to missing `CODECOV_TOKEN`).
|
||||
- Pushed tag `v0.3.10` (Triggered publish/release workflow - status needed verification).
|
||||
- **Pushed tag `v0.3.16`**. Publish/release workflow triggered. Status needs verification.
|
||||
- **Runtime Testing (Blocked):** Requires user interaction with `@modelcontextprotocol/inspector` or a live agent. Skipping for now.
|
||||
- **Documentation Finalization (Mostly Complete):**
|
||||
- API docs generated.
|
||||
- Main pages reviewed/updated.
|
||||
- Codecov badge added (requires manual token update in `README.md`).
|
||||
- **Remaining:** Add complex features (PWA, share buttons, roadmap page) if requested.
|
||||
- **Release Preparation:**
|
||||
- `CHANGELOG.md` updated for `0.3.10`.
|
||||
- **Project is ready for final review. Requires Codecov token configuration and verification of the `v0.3.16` publish/release workflow.**
|
||||
|
||||
## 4. Active Decisions & Considerations
|
||||
|
||||
- **Switched to pnpm:** Changed package manager from npm to pnpm.
|
||||
- **Using `pdfjs-dist` as the core PDF library.**
|
||||
- Adopted the handler definition pattern from `filesystem-mcp`.
|
||||
- Consolidated tools into a single `read_pdf` handler.
|
||||
- Aligned project configuration with Guidelines.
|
||||
- **Accepted ~95% test coverage**.
|
||||
- **No Sponsorship:** Project will not include sponsorship links or files.
|
||||
- **Using TypeDoc CLI for API Doc Generation:** Bypassed script initialization issues.
|
||||
- **Switched to Codecov:** Replaced Coveralls with Codecov for coverage reporting. Test Analytics integration added.
|
||||
- **Codecov Token Required:** CI is currently blocked on Codecov upload (coverage and test results) due to missing `CODECOV_TOKEN` secret in GitHub repository settings. This needs to be added by the user.
|
||||
- **Version bumped to `0.3.17`**.
|
||||
- **Publish Workflow:** Parallelized. Modified to bypass Git checks during `pnpm publish`. Docker build fixed (pnpm install, prune ignore scripts, LTS node). Dependencies installed before publish. Verification pending on the `v0.3.17` workflow run.
|
||||
- **CI Workflow:** Added Codecov Test Analytics upload step. Formatting fixed. Parallelized publish steps.
|
||||
- **Pre-commit Hook:** Fixed to run `lint-staged`.
|
||||
40
pdf-reader-mcp/memory-bank/productContext.md
Normal file
40
pdf-reader-mcp/memory-bank/productContext.md
Normal file
@@ -0,0 +1,40 @@
|
||||
# Product Context: PDF Reader MCP Server
|
||||
|
||||
## 1. Problem Solved
|
||||
|
||||
AI agents often need to access information contained within PDF documents as
|
||||
part of user tasks (e.g., summarizing reports, extracting data from invoices,
|
||||
referencing documentation). Directly providing PDF file content to the agent is
|
||||
inefficient (large token count) and often impossible due to binary format.
|
||||
Executing external CLI tools for each PDF interaction can be slow, insecure, and
|
||||
lack structured output.
|
||||
|
||||
This MCP server provides a secure, efficient, and structured way for agents to
|
||||
interact with PDF files within the user's project context.
|
||||
|
||||
## 2. How It Should Work
|
||||
|
||||
- The server runs as a background process, managed by the agent's host
|
||||
environment.
|
||||
- The host environment ensures the server is launched with its working directory
|
||||
set to the user's current project root.
|
||||
- The agent uses MCP calls to invoke specific PDF reading tools provided by the
|
||||
server.
|
||||
- The agent provides the relative path to the target PDF file within the project
|
||||
root.
|
||||
- The server uses the `pdfjs-dist` library to process the PDF.
|
||||
- The server returns structured data (text, metadata, page count) back to the
|
||||
agent via MCP.
|
||||
- All file access is strictly limited to the project root directory.
|
||||
|
||||
## 3. User Experience Goals
|
||||
|
||||
- **Seamless Integration:** The agent should be able to use the PDF tools
|
||||
naturally as part of its workflow without complex setup for the end-user.
|
||||
- **Reliability:** Tools should reliably parse standard PDF files and return
|
||||
accurate information or clear error messages.
|
||||
- **Security:** Users should trust that the server only accesses files within
|
||||
the intended project scope.
|
||||
- **Efficiency:** Reading PDF data should be reasonably fast and avoid excessive
|
||||
token usage compared to sending raw file content (which isn't feasible
|
||||
anyway).
|
||||
61
pdf-reader-mcp/memory-bank/progress.md
Normal file
61
pdf-reader-mcp/memory-bank/progress.md
Normal file
@@ -0,0 +1,61 @@
|
||||
<!-- Version: 1.37 | Last Updated: 2025-04-07 | Updated By: Sylph -->
|
||||
|
||||
# Progress: PDF Reader MCP Server (Guidelines Applied)
|
||||
|
||||
## 1. What Works
|
||||
|
||||
- **Project Setup:** Cloned from `filesystem-mcp`, dependencies installed (using pnpm).
|
||||
- **Core Tool Handler (Consolidated, using `pdfjs-dist`, multi-source, per-source pages):**
|
||||
- `read_pdf`: Implemented and integrated.
|
||||
- **MCP Server Structure:** Basic server setup working.
|
||||
- **Changelog:** `CHANGELOG.md` created and updated for `1.0.0`.
|
||||
- **License:** `LICENSE` file created (MIT).
|
||||
- **GitHub Actions:** `.github/workflows/ci.yml` refactored for CI/CD according to guidelines. Fixed `pnpm publish` step (`--no-git-checks`), added Test Analytics upload, fixed formatting, fixed Docker build step (`Dockerfile` - pnpm install, prune, LTS node), parallelized publish jobs, fixed pre-commit hook. Git history corrected multiple times.
|
||||
- **Testing Framework (Vitest):**
|
||||
- Integrated, configured. All tests passing. Coverage at ~95% (accepted).
|
||||
- **Linter (ESLint):**
|
||||
- Integrated, configured. Codebase passes all checks.
|
||||
- **Formatter (Prettier):**
|
||||
- Integrated, configured. Codebase formatted.
|
||||
- **TypeScript Configuration:** `tsconfig.json` updated with strictest settings.
|
||||
- **Package Configuration:** `package.json` updated.
|
||||
- **Git Ignore:** `.gitignore` updated (added JUnit report).
|
||||
- **Sponsorship:** Removed.
|
||||
- **Project Identity:** Updated scope to `@sylphlab`.
|
||||
- **Git Hooks:** Configured using Husky, lint-staged, and commitlint.
|
||||
- **Dependency Updates:** Configured using Dependabot.
|
||||
- **Compilation:** Completed successfully (`pnpm run build`).
|
||||
- **Benchmarking:**
|
||||
- Created and ran initial benchmarks.
|
||||
- **Documentation (Mostly Complete):**
|
||||
- VitePress site setup.
|
||||
- `README.md`, Guide, Design, Performance, Comparison sections reviewed/updated.
|
||||
- `CONTRIBUTING.md` created.
|
||||
- Performance section updated with benchmark results.
|
||||
- **API documentation generated successfully using TypeDoc CLI.**
|
||||
- VitePress config updated with minor additions.
|
||||
- **Version Control:** All recent changes committed (incl. formatting `fe7eda1`, Dockerfile pnpm install `c202fd4`, parallelization `a569b62`, pre-commit/npm-publish fix `e96680c`, Dockerfile prune fix `02f3f91`, Dockerfile LTS `50f9bdd`, `package.json` path fix `ab1100d`, release commit for `v0.3.17` `bb9d2e5`). Tag `v0.3.17` created and pushed.
|
||||
- **Package Executable Path:** Fixed incorrect paths (`build/` -> `dist/`) in `package.json` (`bin`, `files`, `start` script).
|
||||
|
||||
## 2. What's Left to Build/Verify
|
||||
|
||||
- **Runtime Testing (Blocked):** Requires user interaction.
|
||||
- **Publishing Workflow Test:** Triggered by pushing tag `v0.3.17`. Needs verification.
|
||||
- **Documentation (Optional Enhancements):**
|
||||
- Add complex features (PWA, share buttons, roadmap page) if requested.
|
||||
- **Release Preparation:**
|
||||
- Final review before tagging `1.0.0`.
|
||||
- Consider using `standard-version` or similar for final release tagging/publishing.
|
||||
|
||||
## 3. Current Status
|
||||
|
||||
Project configuration and core functionality are aligned with guidelines. Documentation is largely complete, including generated API docs. Codebase passes all checks and tests (~95% coverage). **Version bumped to `0.3.17` and tag pushed. Project is ready for final review and workflow verification.**
|
||||
|
||||
## 4. Known Issues/Risks
|
||||
|
||||
- **100% Coverage Goal:** Currently at **~95%**. This level is deemed acceptable.
|
||||
- **`pdfjs-dist` Complexity:** API complexity, text extraction accuracy depends on PDF, potential Node.js compatibility nuances.
|
||||
- **Error Handling:** Basic handling implemented; specific PDF parsing errors might need refinement.
|
||||
- **Performance:** Initial benchmarks run on a single sample file. Performance on diverse PDFs needs further investigation if issues arise.
|
||||
- **Per-Source Pages:** Logic handles per-source `pages`; testing combinations is important (covered partially by benchmarks).
|
||||
- **TypeDoc Script Issue:** Node.js script for TypeDoc failed, but CLI workaround is effective.
|
||||
35
pdf-reader-mcp/memory-bank/projectbrief.md
Normal file
35
pdf-reader-mcp/memory-bank/projectbrief.md
Normal file
@@ -0,0 +1,35 @@
|
||||
# Project Brief: PDF Reader MCP Server
|
||||
|
||||
## 1. Project Goal
|
||||
|
||||
To create a Model Context Protocol (MCP) server that allows AI agents (like
|
||||
Cline) to securely read and extract information (text, metadata, page count)
|
||||
from PDF files located within a specified project directory.
|
||||
|
||||
## 2. Core Requirements
|
||||
|
||||
- Implement an MCP server using Node.js and TypeScript.
|
||||
- Base the server on the existing `@shtse8/filesystem-mcp` structure.
|
||||
- Provide MCP tools for:
|
||||
- Reading all text content from a PDF.
|
||||
- Reading text content from specific pages of a PDF.
|
||||
- Reading metadata from a PDF.
|
||||
- Getting the total page count of a PDF.
|
||||
- Ensure all operations are confined to the project root directory determined at
|
||||
server launch.
|
||||
- Use relative paths for all file operations.
|
||||
- Utilize the `pdfjs-dist` library for PDF processing (originally `pdf-parse`, since replaced).
|
||||
- Maintain clear documentation (README, Memory Bank).
|
||||
- Package the server for distribution via npm and Docker Hub.
|
||||
|
||||
## 3. Scope
|
||||
|
||||
- **In Scope:** Implementing the core PDF reading tools, packaging, basic
|
||||
documentation.
|
||||
- **Out of Scope (Initially):** Advanced PDF features (image extraction,
|
||||
annotation reading, form filling), complex error recovery beyond basic file
|
||||
access/parsing errors, UI for the server.
|
||||
|
||||
## 4. Target User
|
||||
|
||||
AI agents interacting with user projects that contain PDF documents.
|
||||
94
pdf-reader-mcp/memory-bank/systemPatterns.md
Normal file
94
pdf-reader-mcp/memory-bank/systemPatterns.md
Normal file
@@ -0,0 +1,94 @@
|
||||
# System Patterns: PDF Reader MCP Server
|
||||
|
||||
## 1. Architecture Overview
|
||||
|
||||
The PDF Reader MCP server is a standalone Node.js application based on the
|
||||
original Filesystem MCP. It's designed to run as a child process, communicating
|
||||
with its parent (the AI agent host) via standard input/output (stdio) using the
|
||||
Model Context Protocol (MCP) to provide PDF reading capabilities.
|
||||
|
||||
```mermaid
|
||||
graph LR
|
||||
A[Agent Host Environment] -- MCP over Stdio --> B(PDF Reader MCP Server);
|
||||
B -- Node.js fs/path/pdfjs-dist --> C["User Filesystem (Project Root)"];
|
||||
C -- Results/Data --> B;
|
||||
B -- MCP over Stdio --> A;
|
||||
```
|
||||
|
||||
## 2. Key Technical Decisions & Patterns
|
||||
|
||||
- **MCP SDK Usage:** Leverages the `@modelcontextprotocol/sdk` for handling MCP
|
||||
communication (request parsing, response formatting, error handling). This
|
||||
standardizes interaction and reduces boilerplate code.
|
||||
- **Stdio Transport:** Uses `StdioServerTransport` from the SDK for
|
||||
communication, suitable for running as a managed child process.
|
||||
- **Asynchronous Operations:** All filesystem interactions and request handling
|
||||
are implemented using `async/await` and Node.js's promise-based `fs` module
|
||||
(`fs.promises`) for non-blocking I/O.
|
||||
- **Strict Path Resolution:** A dedicated `resolvePath` function is used for
|
||||
_every_ path received from the agent.
|
||||
- It normalizes the path.
|
||||
- It resolves the path relative to the server process's current working
|
||||
directory (`process.cwd()`), which is treated as the `PROJECT_ROOT`.
|
||||
**Crucially, this requires the process launching the server (e.g., the agent
|
||||
host) to set the correct `cwd` for the target project.**
|
||||
- It explicitly checks if the resolved absolute path still starts with the
|
||||
`PROJECT_ROOT` absolute path to prevent path traversal vulnerabilities
|
||||
(e.g., `../../sensitive-file`).
|
||||
- It rejects absolute paths provided by the agent.
|
||||
- **Zod for Schemas & Validation:** Uses `zod` library to define input schemas
|
||||
for tools and perform robust validation within each handler. JSON schemas for
|
||||
MCP listing are generated from Zod schemas.
|
||||
- **Tool Definition Aggregation:** Tool definitions (name, description, Zod
|
||||
schema, handler function) are defined in their respective handler files and
|
||||
aggregated in `src/handlers/index.ts` for registration in `src/index.ts`.
|
||||
- **`edit_file` Logic (inherited from `filesystem-mcp`; not used by the PDF tools):**
|
||||
- Processes multiple changes per file, applying them sequentially from
|
||||
bottom-to-top to minimize line number conflicts.
|
||||
- Handles insertion, text replacement, and deletion.
|
||||
- Implements basic indentation detection (`detect-indent`) and preservation
|
||||
for insertions/replacements.
|
||||
- Uses `diff` library to generate unified diff output.
|
||||
- **Error Handling:**
|
||||
- Uses `try...catch` blocks within each tool handler.
|
||||
- Catches specific Node.js filesystem errors (like `ENOENT`, `EPERM`,
|
||||
`EACCES`) and maps them to appropriate MCP error codes (`InvalidRequest`).
|
||||
- Uses custom `McpError` objects for standardized error reporting back to the
|
||||
agent.
|
||||
- Logs unexpected errors to the server's console (`stderr`) for debugging.
|
||||
- **Glob for Listing/Searching:** Uses the `glob` library for flexible and
|
||||
powerful file listing and searching based on glob patterns, including
|
||||
recursive operations and stat retrieval. Careful handling of `glob`'s
|
||||
different output types based on options (`string[]`, `Path[]`, `Path[]` with
|
||||
`stats`) is implemented.
|
||||
- **TypeScript:** Provides static typing for better code maintainability, early
|
||||
error detection, and improved developer experience. Uses ES module syntax
|
||||
(`import`/`export`).
|
||||
- **PDF Parsing:** Uses Mozilla's `pdfjs-dist` library to load PDF documents and
|
||||
extract text content, metadata, and page information. The `read_pdf` handler
|
||||
uses its API.
|
||||
|
||||
## 3. Component Relationships
|
||||
|
||||
- **`index.ts`:** Main entry point. Sets up the MCP server instance, defines
|
||||
tool schemas, registers request handlers, and starts the server connection.
|
||||
- **`Server` (from SDK):** Core MCP server class handling protocol logic.
|
||||
- **`StdioServerTransport` (from SDK):** Handles reading/writing MCP messages
|
||||
via stdio.
|
||||
- **Tool Handler Function (`handleReadPdfFunc`):** Contains the logic for the
|
||||
consolidated `read_pdf` tool, including Zod argument validation, path
|
||||
resolution, PDF loading/parsing via `pdfjs-dist`, and result formatting based
|
||||
on input parameters.
|
||||
- **`resolvePath` Helper:** Centralized security function for path validation.
|
||||
- **`formatStats` Helper:** Utility to create a consistent stats object
|
||||
structure.
|
||||
- **Node.js Modules (`fs`, `path`):** Used for actual filesystem operations and
|
||||
path manipulation.
|
||||
- **`glob` Library:** Used for pattern-based file searching and listing.
|
||||
- **`zod` Library:** Used for defining and validating tool input schemas.
|
||||
- **`diff` Library:** (Inherited, but not used by PDF tools) Used by
|
||||
`edit_file`.
|
||||
- **`detect-indent` Library:** (Inherited, but not used by PDF tools) Used by
|
||||
`edit_file`.
|
||||
- **`pdfjs-dist` Library:** Used by the `read_pdf` handler to load and process
|
||||
PDF documents.
|
||||
67
pdf-reader-mcp/memory-bank/techContext.md
Normal file
67
pdf-reader-mcp/memory-bank/techContext.md
Normal file
@@ -0,0 +1,67 @@
|
||||
<!-- Version: 1.10 | Last Updated: 2025-04-06 | Updated By: Sylph -->
|
||||
|
||||
# Tech Context: PDF Reader MCP Server
|
||||
|
||||
## 1. Core Technologies
|
||||
|
||||
- **Runtime:** Node.js (>= 22.0.0, as required by `engines` in `package.json`)
|
||||
- **Language:** TypeScript (Compiled to JavaScript for execution)
|
||||
- **Package Manager:** pnpm (Switched from npm to align with guidelines)
|
||||
- **Linter:** ESLint (with TypeScript support, including **strict type-aware rules**)
|
||||
- **Formatter:** Prettier
|
||||
- **Testing:** Vitest (with **~95% coverage achieved**)
|
||||
- **Git Hooks:** Husky, lint-staged, commitlint
|
||||
- **Dependency Update:** Dependabot
|
||||
|
||||
## 2. Key Libraries/Dependencies
|
||||
|
||||
- **`@modelcontextprotocol/sdk`:** The official SDK for implementing MCP servers and clients.
|
||||
- **`glob`:** Library for matching files using glob patterns.
|
||||
- **`pdfjs-dist`:** Mozilla's PDF rendering and parsing library.
|
||||
- **`zod`:** Library for schema declaration and validation.
|
||||
- **`zod-to-json-schema`:** Utility to convert Zod schemas to JSON schemas.
|
||||
|
||||
- **Dev Dependencies (Key):**
|
||||
- **`typescript`:** TypeScript compiler (`tsc`).
|
||||
- **`@types/node`:** TypeScript type definitions for Node.js.
|
||||
- **`@types/glob`:** TypeScript type definitions for `glob`.
|
||||
- **`vitest`:** Test runner framework.
|
||||
- **`@vitest/coverage-v8`:** Coverage provider for Vitest.
|
||||
- **`eslint`:** Core ESLint library.
|
||||
- **`typescript-eslint`:** Tools for ESLint + TypeScript integration.
|
||||
- **`prettier`:** Code formatter.
|
||||
- **`eslint-config-prettier`:** Turns off ESLint rules that conflict with Prettier.
|
||||
- **`husky`:** Git hooks manager.
|
||||
- **`lint-staged`:** Run linters on staged files.
|
||||
- **`@commitlint/cli` & `@commitlint/config-conventional`:** Commit message linting.
|
||||
- **`standard-version`:** Release automation tool.
|
||||
- **`typedoc` & `typedoc-plugin-markdown`:** API documentation generation.
|
||||
- **`vitepress` & `vue`:** Documentation website framework.
|
||||
|
||||
## 3. Development Setup
|
||||
|
||||
- **Source Code:** Located in the `src` directory.
|
||||
- **Testing Code:** Located in the `test` directory.
|
||||
- **Main File:** `src/index.ts`.
|
||||
- **Configuration:**
|
||||
- `tsconfig.json`: TypeScript compiler options (**strictest settings enabled**, includes recommended options like `declaration` and `sourceMap`).
|
||||
- `vitest.config.ts`: Vitest test runner configuration (**100% coverage thresholds set**, ~95% achieved).
|
||||
- `eslint.config.js`: ESLint flat configuration (integrates Prettier, enables **strict type-aware linting** and **additional guideline rules**).
|
||||
- `.prettierrc.cjs`: Prettier formatting rules.
|
||||
- `.gitignore`: Specifies intentionally untracked files (`node_modules/`, `build/`, `coverage/`, etc.).
|
||||
- `.github/workflows/ci.yml`: GitHub Actions workflow (validation, publishing, release, **fixed Action versions**, **Codecov** coverage and Test Analytics).
|
||||
- `.github/dependabot.yml`: Automated dependency update configuration.
|
||||
- `package.json`: Project metadata, dependencies, and npm scripts (includes `start`, `typecheck`, `prepare`, `benchmark`, `release`, `clean`, `docs:api`, `prepublishOnly`, etc.).
|
||||
- `commitlint.config.cjs`: Commitlint configuration.
|
||||
- `.husky/`: Directory containing Git hook scripts.
|
||||
- **Build Output:** Compiled JavaScript in the `dist` directory.
|
||||
- **Execution:** Run via `node dist/index.js` or `npm start`.
|
||||
|
||||
## 4. Technical Constraints & Considerations
|
||||
|
||||
- **Node.js Environment:** Relies on Node.js runtime (>=22.0.0) and built-in modules.
|
||||
- **Permissions:** Server process permissions affect filesystem operations.
|
||||
- **Cross-Platform Compatibility:** Filesystem behaviors might differ. Code uses Node.js `path` module to mitigate.
|
||||
- **Error Handling:** Relies on Node.js error codes and McpError.
|
||||
- **Security Model:** Relies on `resolvePath` for path validation within `PROJECT_ROOT`.
|
||||
- **Project Root Determination:** `PROJECT_ROOT` is the server's `process.cwd()`. The launching process must set this correctly.
|
||||
11303
pdf-reader-mcp/package-lock.json
generated
Normal file
11303
pdf-reader-mcp/package-lock.json
generated
Normal file
File diff suppressed because it is too large
Load Diff
110
pdf-reader-mcp/package.json
Normal file
110
pdf-reader-mcp/package.json
Normal file
@@ -0,0 +1,110 @@
|
||||
{
|
||||
"name": "@sylphlab/pdf-reader-mcp",
|
||||
"version": "0.3.24",
|
||||
"description": "An MCP server providing tools to read PDF files.",
|
||||
"type": "module",
|
||||
"bin": {
|
||||
"pdf-reader-mcp": "./dist/index.js"
|
||||
},
|
||||
"files": [
|
||||
"dist/",
|
||||
"README.md",
|
||||
"LICENSE"
|
||||
],
|
||||
"publishConfig": {
|
||||
"access": "public"
|
||||
},
|
||||
"engines": {
|
||||
"node": ">=22.0.0"
|
||||
},
|
||||
"repository": {
|
||||
"type": "git",
|
||||
"url": "git+https://github.com/sylphlab/pdf-reader-mcp.git"
|
||||
},
|
||||
"bugs": {
|
||||
"url": "https://github.com/sylphlab/pdf-reader-mcp/issues"
|
||||
},
|
||||
"homepage": "https://github.com/sylphlab/pdf-reader-mcp#readme",
|
||||
"author": "Sylph AI <contact@sylphlab.ai> (https://sylphlab.ai)",
|
||||
"license": "MIT",
|
||||
"keywords": [
|
||||
"mcp",
|
||||
"model-context-protocol",
|
||||
"pdf",
|
||||
"reader",
|
||||
"parser",
|
||||
"typescript",
|
||||
"node",
|
||||
"ai",
|
||||
"agent",
|
||||
"tool"
|
||||
],
|
||||
"scripts": {
|
||||
"build": "tsc",
|
||||
"watch": "tsc --watch",
|
||||
"inspector": "npx @modelcontextprotocol/inspector dist/index.js",
|
||||
"test": "vitest run",
|
||||
"test:watch": "vitest watch",
|
||||
"test:cov": "vitest run --coverage --reporter=junit --outputFile=test-report.junit.xml",
|
||||
"lint": "eslint . --ext .ts,.tsx,.js,.cjs --cache",
|
||||
"lint:fix": "eslint . --ext .ts,.tsx,.js,.cjs --fix --cache",
|
||||
"format": "prettier --write . --cache",
|
||||
"check-format": "prettier --check . --cache",
|
||||
"validate": "npm run check-format && npm run lint && npm run test",
|
||||
"docs:dev": "vitepress dev docs",
|
||||
"docs:build": "vitepress build docs",
|
||||
"docs:preview": "vitepress preview docs",
|
||||
"start": "node dist/index.js",
|
||||
"typecheck": "tsc --noEmit",
|
||||
"benchmark": "vitest bench",
|
||||
"clean": "rm -rf dist coverage",
|
||||
"docs:api": "typedoc --entryPoints src/index.ts --tsconfig tsconfig.json --plugin typedoc-plugin-markdown --out docs/api --readme none",
|
||||
"prepublishOnly": "pnpm run clean && pnpm run build",
|
||||
"release": "standard-version",
|
||||
"prepare": "husky"
|
||||
},
|
||||
"dependencies": {
|
||||
"@modelcontextprotocol/sdk": "1.8.0",
|
||||
"glob": "^11.0.1",
|
||||
"pdfjs-dist": "^5.1.91",
|
||||
"zod": "^3.24.2",
|
||||
"zod-to-json-schema": "^3.24.5"
|
||||
},
|
||||
"devDependencies": {
|
||||
"@commitlint/cli": "^19.8.0",
|
||||
"@commitlint/config-conventional": "^19.8.0",
|
||||
"@eslint/js": "^9.24.0",
|
||||
"@types/glob": "^8.1.0",
|
||||
"@types/node": "^24.0.7",
|
||||
"@typescript-eslint/eslint-plugin": "^8.29.0",
|
||||
"@typescript-eslint/parser": "^8.29.0",
|
||||
"@vitest/coverage-v8": "^3.1.1",
|
||||
"eslint": "^9.24.0",
|
||||
"eslint-config-prettier": "^10.1.1",
|
||||
"husky": "^9.1.7",
|
||||
"lint-staged": "^15.5.0",
|
||||
"prettier": "^3.5.3",
|
||||
"standard-version": "^9.5.0",
|
||||
"typedoc": "^0.28.2",
|
||||
"typedoc-plugin-markdown": "^4.6.1",
|
||||
"typescript": "^5.8.3",
|
||||
"typescript-eslint": "^8.29.0",
|
||||
"vitepress": "^1.6.3",
|
||||
"vitest": "^3.1.1",
|
||||
"vue": "^3.5.13"
|
||||
},
|
||||
"commitlint": {
|
||||
"extends": [
|
||||
"@commitlint/config-conventional"
|
||||
]
|
||||
},
|
||||
"lint-staged": {
|
||||
"*.{ts,tsx,js,cjs}": [
|
||||
"eslint --fix --cache",
|
||||
"prettier --write --cache"
|
||||
],
|
||||
"*.{json,md,yaml,yml}": [
|
||||
"prettier --write --cache"
|
||||
]
|
||||
}
|
||||
}
|
||||
6417
pdf-reader-mcp/pnpm-lock.yaml
generated
Normal file
6417
pdf-reader-mcp/pnpm-lock.yaml
generated
Normal file
File diff suppressed because it is too large
Load Diff
16
pdf-reader-mcp/src/handlers/index.ts
Normal file
16
pdf-reader-mcp/src/handlers/index.ts
Normal file
@@ -0,0 +1,16 @@
|
||||
// Import only the consolidated PDF tool definition
|
||||
import { readPdfToolDefinition } from './readPdf.js';
|
||||
|
||||
// Define the structure for a tool definition (used internally and for index.ts)
|
||||
// We need Zod here to define the schema type correctly
|
||||
import type { z } from 'zod';
|
||||
export interface ToolDefinition {
|
||||
name: string;
|
||||
description: string;
|
||||
schema: z.ZodType<unknown>; // Use Zod schema type with unknown
|
||||
// Define the specific return type expected by the SDK for tool handlers
|
||||
handler: (args: unknown) => Promise<{ content: { type: string; text: string }[] }>;
|
||||
}
|
||||
|
||||
// Aggregate only the consolidated PDF tool definition
|
||||
export const allToolDefinitions: ToolDefinition[] = [readPdfToolDefinition];
|
||||
441
pdf-reader-mcp/src/handlers/readPdf.ts
Normal file
441
pdf-reader-mcp/src/handlers/readPdf.ts
Normal file
@@ -0,0 +1,441 @@
|
||||
import { z } from 'zod';
|
||||
import * as pdfjsLib from 'pdfjs-dist/legacy/build/pdf.mjs';
|
||||
import fs from 'node:fs/promises';
|
||||
import { resolvePath } from '../utils/pathUtils.js';
|
||||
import { McpError, ErrorCode } from '@modelcontextprotocol/sdk/types.js';
|
||||
import type { ToolDefinition } from './index.js';
|
||||
|
||||
// Maximum number of pages expanded past `start` for a single range part. Keeps
// open-ended ("7-") or very large ranges from generating unbounded work.
const MAX_RANGE_SPAN = 10000;

// Parses a token that must consist solely of decimal digits (surrounding
// whitespace allowed). Returns NaN for anything else, e.g. "", "3 4", "1a".
const parsePageToken = (token: string): number => {
  const trimmed = token.trim();
  return /^\d+$/.test(trimmed) ? Number.parseInt(trimmed, 10) : Number.NaN;
};

/**
 * Parses a single comma-separated part of a page specification and adds the
 * resulting 1-based page numbers to `pages`.
 *
 * Accepted forms: "5" (single page), "1-3" (closed range), "7-" (open range).
 * A range is expanded to at most `start + MAX_RANGE_SPAN`; a warning is logged
 * whenever that cap cuts a range short.
 *
 * @param part One part of the page string, e.g. "1-3".
 * @param pages Set that collects the parsed page numbers (mutated in place).
 * @throws Error when the part is not a valid page number or range. Parts with
 *   more than one hyphen (e.g. "1--3", "1-2-3") are rejected instead of being
 *   silently reinterpreted.
 */
const parseRangePart = (part: string, pages: Set<number>): void => {
  const trimmedPart = part.trim();

  if (!trimmedPart.includes('-')) {
    const page = parsePageToken(trimmedPart);
    if (Number.isNaN(page) || page <= 0) {
      throw new Error(`Invalid page number: ${trimmedPart}`);
    }
    pages.add(page);
    return;
  }

  const segments = trimmedPart.split('-');
  if (segments.length !== 2) {
    // More than one hyphen: previously the extra segments were ignored, which
    // turned "1--3" into an open-ended range.
    throw new Error(`Invalid page range format: ${trimmedPart}`);
  }

  const [startStr = '', endStr = ''] = segments;
  const start = parsePageToken(startStr);
  const end = endStr.trim() === '' ? Infinity : parsePageToken(endStr);

  if (Number.isNaN(start) || Number.isNaN(end) || start <= 0 || start > end) {
    throw new Error(`Invalid page range values: ${trimmedPart}`);
  }

  const practicalEnd = Math.min(end, start + MAX_RANGE_SPAN);
  for (let page = start; page <= practicalEnd; page++) {
    pages.add(page);
  }

  if (end > practicalEnd) {
    const rangeKind = end === Infinity ? 'Open-ended range' : 'Range';
    console.warn(
      `[PDF Reader MCP] ${rangeKind} starting at ${String(start)} was truncated at page ${String(practicalEnd)} during parsing.`
    );
  }
};
|
||||
|
||||
/**
 * Parses a complete page specification string such as "1-3,5,7-".
 *
 * Each comma-separated part is handed to `parseRangePart`, which collects the
 * page numbers into a shared set (so duplicates collapse).
 *
 * @param ranges The raw page specification string.
 * @returns The distinct page numbers in ascending order.
 * @throws Error when no page was collected, or when any part is rejected by
 *   `parseRangePart`.
 */
const parsePageRanges = (ranges: string): number[] => {
  const collected = new Set<number>();

  ranges.split(',').forEach((segment) => {
    parseRangePart(segment, collected);
  });

  if (collected.size === 0) {
    throw new Error('Page range string resulted in zero valid pages.');
  }

  return [...collected].sort((left, right) => left - right);
};
|
||||
|
||||
// --- Zod Schemas ---
|
||||
// Characters permitted in a page string once all whitespace has been removed.
const PAGE_STRING_PATTERN = /^[0-9,-]+$/;

/**
 * Accepts either a non-empty array of positive integers, or a non-empty string
 * made of digits, commas, and hyphens (whitespace is tolerated and ignored by
 * the character check).
 */
const pageSpecifierSchema = z.union([
  z.array(z.number().int().positive()).min(1),
  z
    .string()
    .min(1)
    .refine(
      (value) => {
        const withoutWhitespace = value.replace(/\s/g, '');
        return PAGE_STRING_PATTERN.test(withoutWhitespace);
      },
      { message: 'Page string must contain only numbers, commas, and hyphens.' }
    ),
]);
|
||||
|
||||
// True when exactly one of `path` / `url` carries a (truthy) value.
const hasExactlyOneLocation = (candidate: {
  path?: string | undefined;
  url?: string | undefined;
}): boolean => Boolean(candidate.path) !== Boolean(candidate.url);

/**
 * Schema for one PDF source: a local `path` or a remote `url` (never both),
 * with an optional per-source page selection. Unknown keys are rejected.
 */
const PdfSourceSchema = z
  .object({
    path: z.string().min(1).optional().describe('Relative path to the local PDF file.'),
    url: z.string().url().optional().describe('URL of the PDF file.'),
    pages: pageSpecifierSchema
      .optional()
      .describe(
        "Extract text only from specific pages (1-based) or ranges for *this specific source*. If provided, 'include_full_text' for the entire request is ignored for this source."
      ),
  })
  .strict()
  .refine(hasExactlyOneLocation, {
    message: "Each source must have either 'path' or 'url', but not both.",
  });
|
||||
|
||||
// Builds an optional boolean request flag with a default value and a description.
const booleanFlag = (
  defaultValue: boolean,
  description: string
): z.ZodDefault<z.ZodOptional<z.ZodBoolean>> =>
  z.boolean().optional().default(defaultValue).describe(description);

/**
 * Schema for the `read_pdf` tool arguments: at least one source plus three
 * request-wide flags. Unknown keys are rejected.
 */
const ReadPdfArgsSchema = z
  .object({
    sources: z
      .array(PdfSourceSchema)
      .min(1)
      .describe('An array of PDF sources to process, each can optionally specify pages.'),
    include_full_text: booleanFlag(
      false,
      "Include the full text content of each PDF (only if 'pages' is not specified for that source)."
    ),
    include_metadata: booleanFlag(true, 'Include metadata and info objects for each PDF.'),
    include_page_count: booleanFlag(true, 'Include the total number of pages for each PDF.'),
  })
  .strict();

// Validated argument shape, derived from the schema above.
type ReadPdfArgs = z.infer<typeof ReadPdfArgsSchema>;
|
||||
|
||||
// --- Result Type Interfaces ---
|
||||
/** Shape given to the `info` object returned by pdf.js `getMetadata()`. */
interface PdfInfo {
  PDFFormatVersion?: string;
  IsLinearized?: boolean;
  IsAcroFormPresent?: boolean;
  IsXFAPresent?: boolean;
  // Any further keys are passed through untyped.
  [key: string]: unknown;
}

/** Key/value pairs obtained from the pdf.js metadata object's `getAll()`. */
type PdfMetadata = Record<string, unknown>;

/** Text extracted from one page. */
interface ExtractedPageText {
  /** 1-based page number. */
  page: number;
  /** Extracted text, or an error description when the page could not be read. */
  text: string;
}

/** Data collected for a successfully processed source; every field is optional. */
interface PdfResultData {
  info?: PdfInfo;
  metadata?: PdfMetadata;
  num_pages?: number;
  /** Set when full text was requested and no pages were specified for the source. */
  full_text?: string;
  /** Set when specific pages were requested for the source. */
  page_texts?: ExtractedPageText[];
  /** Non-fatal notes, e.g. requested pages beyond the end of the document. */
  warnings?: string[];
}

/** Outcome for one source: `data` on success, `error` on failure. */
interface PdfSourceResult {
  /** The source's path or URL as given by the caller. */
  source: string;
  success: boolean;
  data?: PdfResultData;
  error?: string;
}
|
||||
|
||||
// --- Helper Functions ---
|
||||
|
||||
/**
 * Resolves the page specification of a single source into a sorted list of
 * distinct page numbers.
 *
 * @param sourcePages Page string (e.g. "1-3,5"), array of page numbers, or
 *   undefined when the source did not specify pages.
 * @param sourceDescription Label of the source, used in the error message.
 * @returns The page numbers in ascending order, or undefined when
 *   `sourcePages` is falsy.
 * @throws McpError (InvalidParams) when the specification is invalid.
 */
const getTargetPages = (
  sourcePages: string | number[] | undefined,
  sourceDescription: string
): number[] | undefined => {
  if (!sourcePages) {
    return undefined;
  }

  // Validates an explicit page array, then de-duplicates and sorts it.
  const normalizePageArray = (values: number[]): number[] => {
    const everyPageValid = values.every((value) => Number.isInteger(value) && value > 0);
    if (!everyPageValid) {
      throw new Error('Page numbers in array must be positive integers.');
    }
    return Array.from(new Set(values)).sort((left, right) => left - right);
  };

  try {
    const pageList =
      typeof sourcePages === 'string'
        ? parsePageRanges(sourcePages)
        : normalizePageArray(sourcePages);

    if (pageList.length === 0) {
      throw new Error('Page specification resulted in an empty set of pages.');
    }
    return pageList;
  } catch (error: unknown) {
    // Any parsing/validation failure is reported as an invalid-params MCP error.
    const reason = error instanceof Error ? error.message : String(error);
    throw new McpError(
      ErrorCode.InvalidParams,
      `Invalid page specification for source ${sourceDescription}: ${reason}`
    );
  }
};
|
||||
|
||||
/**
 * Loads a PDF document from a local path or a URL.
 *
 * Local paths are validated through `resolvePath` and read into memory; URLs
 * are handed to pdf.js as-is.
 *
 * @param source Object carrying either `path` or `url`.
 * @param sourceDescription Label of the source, used in error and log messages.
 * @returns The loaded pdf.js document proxy.
 * @throws McpError (InvalidRequest) when the source cannot be prepared (path
 *   rejected, file missing/unreadable, neither `path` nor `url` given) or when
 *   pdf.js fails to load the document. The original error is attached as `cause`.
 */
const loadPdfDocument = async (
  source: { path?: string | undefined; url?: string | undefined }, // Explicitly allow undefined
  sourceDescription: string
): Promise<pdfjsLib.PDFDocumentProxy> => {
  let pdfDataSource: Uint8Array | { url: string };
  try {
    if (source.path) {
      const safePath = resolvePath(source.path); // resolvePath handles security checks
      const fileBuffer = await fs.readFile(safePath);
      // pdf.js expects binary data as a plain Uint8Array (recent releases reject
      // Node.js Buffer instances), so copy the file contents into one.
      pdfDataSource = new Uint8Array(fileBuffer);
    } else if (source.url) {
      pdfDataSource = { url: source.url };
    } else {
      // This case should be caught by Zod, but added for robustness
      throw new McpError(
        ErrorCode.InvalidParams,
        `Source ${sourceDescription} missing 'path' or 'url'.`
      );
    }
  } catch (err: unknown) {
    // Handle errors during path resolution or file reading
    let errorMessage: string;
    const message = err instanceof Error ? err.message : String(err);
    const errorCode = ErrorCode.InvalidRequest;

    if (
      typeof err === 'object' &&
      err !== null &&
      'code' in err &&
      err.code === 'ENOENT' &&
      source.path
    ) {
      // Specific handling for file not found
      errorMessage = `File not found at '${source.path}'.`;
    } else {
      // Generic error for other file prep issues or resolvePath errors
      errorMessage = `Failed to prepare PDF source ${sourceDescription}. Reason: ${message}`;
    }
    throw new McpError(errorCode, errorMessage, { cause: err instanceof Error ? err : undefined });
  }

  const loadingTask = pdfjsLib.getDocument(pdfDataSource);
  try {
    return await loadingTask.promise;
  } catch (err: unknown) {
    console.error(`[PDF Reader MCP] PDF.js loading error for ${sourceDescription}:`, err);
    const message = err instanceof Error ? err.message : String(err);
    throw new McpError(
      ErrorCode.InvalidRequest,
      // `||` (not `??`) so an empty message also falls back to the default text
      `Failed to load PDF document from ${sourceDescription}. Reason: ${message || 'Unknown loading error'}`,
      { cause: err instanceof Error ? err : undefined }
    );
  }
};
|
||||
|
||||
/**
 * Collects the page count and/or metadata of a loaded document.
 *
 * Metadata extraction is best-effort: a failure is logged as a warning and the
 * fields gathered so far are still returned.
 *
 * @param pdfDocument Loaded pdf.js document.
 * @param includeMetadata Whether to read `info` and `metadata`.
 * @param includePageCount Whether to report `num_pages`.
 * @returns An object holding only the requested (and available) fields.
 */
const extractMetadataAndPageCount = async (
  pdfDocument: pdfjsLib.PDFDocumentProxy,
  includeMetadata: boolean,
  includePageCount: boolean
): Promise<Pick<PdfResultData, 'info' | 'metadata' | 'num_pages'>> => {
  const collected: Pick<PdfResultData, 'info' | 'metadata' | 'num_pages'> = {};

  if (includePageCount) {
    collected.num_pages = pdfDocument.numPages;
  }
  if (!includeMetadata) {
    return collected;
  }

  try {
    const rawMetadata = await pdfDocument.getMetadata();

    const documentInfo = rawMetadata.info as PdfInfo | undefined;
    if (documentInfo !== undefined) {
      collected.info = documentInfo;
    }

    // Read the metadata defensively: the object may be absent or may not expose
    // getAll(), so go through an optional call instead of assuming its type.
    const metadataHolder = rawMetadata.metadata as unknown as
      | { getAll?: () => PdfMetadata | undefined }
      | null
      | undefined;
    const metadataEntries = metadataHolder?.getAll?.();
    if (metadataEntries !== undefined) {
      collected.metadata = metadataEntries;
    }
  } catch (metaError: unknown) {
    const reason = metaError instanceof Error ? metaError.message : String(metaError);
    console.warn(`[PDF Reader MCP] Error extracting metadata: ${reason}`);
  }

  return collected;
};
|
||||
|
||||
/**
 * Extracts the text of the given pages, one page at a time.
 *
 * A page that fails to load or parse does not abort the run: a warning is
 * logged and the entry's text is set to an error description instead.
 *
 * @param pdfDocument Loaded pdf.js document.
 * @param pagesToProcess 1-based page numbers to read.
 * @param sourceDescription Label of the source, used in warnings.
 * @returns One entry per requested page, ordered by page number.
 */
const extractPageTexts = async (
  pdfDocument: pdfjsLib.PDFDocumentProxy,
  pagesToProcess: number[],
  sourceDescription: string
): Promise<ExtractedPageText[]> => {
  // Reads a single page; on failure returns the error description as its text.
  const readPageText = async (pageNumber: number): Promise<string> => {
    try {
      const page = await pdfDocument.getPage(pageNumber);
      const content = await page.getTextContent();
      const fragments = content.items.map((item: unknown) => (item as { str: string }).str);
      return fragments.join('');
    } catch (pageError: unknown) {
      const reason = pageError instanceof Error ? pageError.message : String(pageError);
      console.warn(
        `[PDF Reader MCP] Error getting text content for page ${String(pageNumber)} in ${sourceDescription}: ${reason}`
      );
      return `Error processing page: ${reason}`;
    }
  };

  const collected: ExtractedPageText[] = [];
  for (const pageNumber of pagesToProcess) {
    collected.push({ page: pageNumber, text: await readPageText(pageNumber) });
  }

  // Callers normally pass sorted pages; sorting here guarantees the order anyway.
  return collected.sort((first, second) => first.page - second.page);
};
|
||||
|
||||
/**
 * Decides which pages to read for a source.
 *
 * - With `targetPages`: splits them into pages that exist in the document and
 *   pages beyond its end.
 * - Without `targetPages` but with `includeFullText`: every page 1..totalPages.
 * - Otherwise: nothing.
 *
 * @param targetPages Pages requested for this source, if any.
 * @param totalPages Number of pages in the document.
 * @param includeFullText Request-wide full-text flag.
 * @returns `pagesToProcess` (pages to read) and `invalidPages` (requested pages
 *   greater than `totalPages`).
 */
const determinePagesToProcess = (
  targetPages: number[] | undefined,
  totalPages: number,
  includeFullText: boolean
): { pagesToProcess: number[]; invalidPages: number[] } => {
  const pagesToProcess: number[] = [];
  const invalidPages: number[] = [];

  if (targetPages) {
    for (const requested of targetPages) {
      if (requested <= totalPages) {
        pagesToProcess.push(requested);
      } else if (requested > totalPages) {
        invalidPages.push(requested);
      }
    }
    return { pagesToProcess, invalidPages };
  }

  if (includeFullText) {
    for (let pageNumber = 1; pageNumber <= totalPages; pageNumber++) {
      pagesToProcess.push(pageNumber);
    }
  }

  return { pagesToProcess, invalidPages };
};
|
||||
|
||||
/**
 * Processes one PDF source end to end and never throws for per-source
 * failures: the outcome is reported in the returned object instead.
 *
 * Steps: resolve requested pages, load the document, collect metadata/page
 * count, decide which pages to read, extract their text.
 *
 * @param source Validated source (path or URL, optional pages).
 * @param globalIncludeFullText Request-wide full-text flag (ignored when the
 *   source specifies pages).
 * @param globalIncludeMetadata Request-wide metadata flag.
 * @param globalIncludePageCount Request-wide page-count flag.
 * @returns `{ source, success: true, data }` or `{ source, success: false, error }`.
 */
const processSingleSource = async (
  source: z.infer<typeof PdfSourceSchema>,
  globalIncludeFullText: boolean,
  globalIncludeMetadata: boolean,
  globalIncludePageCount: boolean
): Promise<PdfSourceResult> => {
  const sourceDescription: string = source.path ?? source.url ?? 'unknown source';

  try {
    // 1. Requested pages for this source (throws McpError on an invalid spec).
    const targetPages = getTargetPages(source.pages, sourceDescription);

    // 2. Load the document; only the location fields are passed on.
    const pdfDocument = await loadPdfDocument(
      { path: source.path, url: source.url },
      sourceDescription
    );
    const totalPages = pdfDocument.numPages;

    // 3. Metadata and page count form the start of the result data.
    const data: PdfResultData = {
      ...(await extractMetadataAndPageCount(
        pdfDocument,
        globalIncludeMetadata,
        globalIncludePageCount
      )),
    };

    // 4. Pages to read, plus requested pages that do not exist.
    const { pagesToProcess, invalidPages } = determinePagesToProcess(
      targetPages,
      totalPages,
      globalIncludeFullText
    );
    if (invalidPages.length > 0) {
      data.warnings = [
        `Requested page numbers ${invalidPages.join(', ')} exceed total pages (${String(totalPages)}).`,
      ];
    }

    // 5. Text extraction: per-page entries when pages were requested for this
    //    source, otherwise one joined full-text string.
    if (pagesToProcess.length > 0) {
      const pageTexts = await extractPageTexts(pdfDocument, pagesToProcess, sourceDescription);
      if (targetPages) {
        data.page_texts = pageTexts;
      } else {
        data.full_text = pageTexts.map(({ text }) => text).join('\n\n');
      }
    }

    return { source: sourceDescription, success: true, data };
  } catch (error: unknown) {
    const genericPrefix = `Failed to process PDF from ${sourceDescription}.`;
    let errorMessage: string;
    if (error instanceof McpError) {
      // McpError messages are already descriptive; use them unchanged.
      errorMessage = error.message;
    } else if (error instanceof Error) {
      errorMessage = `${genericPrefix} Reason: ${error.message}`;
    } else {
      errorMessage = `${genericPrefix} Unknown error: ${JSON.stringify(error)}`;
    }
    return { source: sourceDescription, success: false, error: errorMessage };
  }
};
|
||||
|
||||
// --- Main Handler Function ---
|
||||
// Validates raw tool arguments; any validation failure becomes an
// InvalidParams McpError.
const validateReadPdfArgs = (args: unknown): ReadPdfArgs => {
  try {
    return ReadPdfArgsSchema.parse(args);
  } catch (error: unknown) {
    if (error instanceof z.ZodError) {
      const details = error.errors
        .map((issue) => `${issue.path.join('.')} (${issue.message})`)
        .join(', ');
      throw new McpError(ErrorCode.InvalidParams, `Invalid arguments: ${details}`);
    }
    // Fallback for non-Zod errors raised during parsing
    const reason = error instanceof Error ? error.message : String(error);
    throw new McpError(ErrorCode.InvalidParams, `Argument validation failed: ${reason}`);
  }
};

/**
 * Handler for the `read_pdf` tool.
 *
 * Validates the arguments, processes all sources concurrently, and returns the
 * per-source results as pretty-printed JSON in a single text content item.
 *
 * @param args Raw, unvalidated tool arguments.
 * @returns MCP tool result whose text is `JSON.stringify({ results }, null, 2)`.
 * @throws McpError (InvalidParams) when the arguments fail validation.
 */
export const handleReadPdfFunc = async (
  args: unknown
): Promise<{ content: { type: string; text: string }[] }> => {
  const {
    sources,
    include_full_text: includeFullText,
    include_metadata: includeMetadata,
    include_page_count: includePageCount,
  } = validateReadPdfArgs(args);

  // Each source reports its own success/failure, so one bad source does not
  // reject the whole batch.
  const pending = sources.map((source) =>
    processSingleSource(source, includeFullText, includeMetadata, includePageCount)
  );
  const results = await Promise.all(pending);

  const text = JSON.stringify({ results }, null, 2);
  return { content: [{ type: 'text', text }] };
};
|
||||
|
||||
/**
 * Tool definition for `read_pdf`: bundles the tool's name, description,
 * argument schema, and handler in the shape described by `ToolDefinition`.
 */
export const readPdfToolDefinition: ToolDefinition = {
  name: 'read_pdf',
  description:
    'Reads content/metadata from one or more PDFs (local/URL). Each source can specify pages to extract.',
  schema: ReadPdfArgsSchema,
  handler: handleReadPdfFunc,
};
|
||||
78
pdf-reader-mcp/src/index.ts
Normal file
78
pdf-reader-mcp/src/index.ts
Normal file
@@ -0,0 +1,78 @@
|
||||
#!/usr/bin/env node
|
||||
|
||||
import { Server } from '@modelcontextprotocol/sdk/server/index.js';
|
||||
import { StdioServerTransport } from '@modelcontextprotocol/sdk/server/stdio.js';
|
||||
import type { z } from 'zod'; // Import Zod
|
||||
import { zodToJsonSchema } from 'zod-to-json-schema';
|
||||
import {
|
||||
CallToolRequestSchema,
|
||||
ListToolsRequestSchema,
|
||||
McpError,
|
||||
ErrorCode,
|
||||
} from '@modelcontextprotocol/sdk/types.js';
|
||||
// Import the aggregated tool definitions
|
||||
import { allToolDefinitions } from './handlers/index.js';
|
||||
// Removed incorrect import left over from partial diff
|
||||
|
||||
// --- Tool Names (Constants) ---
|
||||
// Removed tool name constants, names are now in the definitions
|
||||
|
||||
// --- Server Setup ---
|
||||
|
||||
// Server identity advertised to MCP clients. The previous values
// ('filesystem-mcp', filesystem description) were leftovers from another
// server; these match this package's package.json.
const server = new Server(
  {
    name: 'pdf-reader-mcp',
    version: '0.3.24', // Keep in sync with "version" in package.json
    description: 'MCP Server for reading text, metadata, and page counts from PDF files.',
  },
  {
    // Only the tools capability is offered.
    capabilities: { tools: {} },
  }
);
|
||||
|
||||
/**
 * Converts a tool's Zod schema into the JSON schema object advertised as the
 * tool's `inputSchema`.
 *
 * @param schema Zod schema describing the tool arguments.
 * @returns JSON schema generated with the 'openApi3' target.
 */
const generateInputSchema = (schema: z.ZodType<unknown>): object => {
  const jsonSchema = zodToJsonSchema(schema, { target: 'openApi3' });
  // Double cast: the generated type is not directly assignable to the plain
  // object the SDK expects.
  return jsonSchema as unknown as object;
};
|
||||
|
||||
// tools/list: advertise every registered tool with its generated JSON schema.
server.setRequestHandler(ListToolsRequestSchema, () => {
  const tools = allToolDefinitions.map(({ name, description, schema }) => ({
    name,
    description,
    inputSchema: generateInputSchema(schema),
  }));
  return { tools };
});
|
||||
|
||||
// tools/call: look the tool up by name and delegate to its handler, which
// performs its own argument validation.
server.setRequestHandler(CallToolRequestSchema, async (request) => {
  const { name: requestedTool, arguments: toolArguments } = request.params;

  const matchingTool = allToolDefinitions.find((candidate) => candidate.name === requestedTool);
  if (matchingTool === undefined) {
    throw new McpError(ErrorCode.MethodNotFound, `Unknown tool: ${requestedTool}`);
  }

  return matchingTool.handler(toolArguments);
});
|
||||
|
||||
// --- Server Start ---
|
||||
|
||||
/**
 * Connects the server to a stdio transport and logs a startup notice.
 * Diagnostics go to stderr via console.error.
 */
async function main(): Promise<void> {
  const transport = new StdioServerTransport();
  await server.connect(transport);
  console.error('[PDF Reader MCP] Server running on stdio');
}

// Any startup failure is fatal: log it and exit with a non-zero status.
main().catch((error: unknown) => {
  console.error('[PDF Reader MCP] Server error:', error);
  process.exit(1);
});
|
||||
33
pdf-reader-mcp/src/utils/pathUtils.ts
Normal file
33
pdf-reader-mcp/src/utils/pathUtils.ts
Normal file
@@ -0,0 +1,33 @@
|
||||
import path from 'path';
|
||||
// Removed unused import: import { fileURLToPath } from 'url';
|
||||
import { McpError, ErrorCode } from '@modelcontextprotocol/sdk/types.js';
|
||||
|
||||
// Use the server's current working directory as the project root.
// This relies on the process launching the server to set the CWD correctly.
export const PROJECT_ROOT = process.cwd();

// Log via console.error (stderr). console.info writes to stdout, which the
// stdio transport uses for protocol messages, so stray output there can
// corrupt the stream.
console.error(`[PDF Reader MCP - pathUtils] Project Root determined from CWD: ${PROJECT_ROOT}`);
|
||||
|
||||
/**
 * Resolves a user-provided relative path against the project root,
 * ensuring it stays within the project boundaries.
 *
 * @param userPath The relative path provided by the user.
 * @returns The resolved absolute path.
 * @throws McpError (InvalidParams) when `userPath` is not a string or is an
 *   absolute path.
 * @throws McpError (InvalidRequest) when the resolved path lies outside
 *   `PROJECT_ROOT`.
 */
export const resolvePath = (userPath: string): string => {
  if (typeof userPath !== 'string') {
    throw new McpError(ErrorCode.InvalidParams, 'Path must be a string.');
  }

  const normalizedUserPath = path.normalize(userPath);
  if (path.isAbsolute(normalizedUserPath)) {
    throw new McpError(ErrorCode.InvalidParams, 'Absolute paths are not allowed.');
  }

  // Resolve against the calculated PROJECT_ROOT
  const resolved = path.resolve(PROJECT_ROOT, normalizedUserPath);

  // Security check. A plain string-prefix test is not enough: with root
  // "/srv/app", "../app-secrets/x" resolves to "/srv/app-secrets/x", which
  // still starts with "/srv/app". Compare path segments instead.
  const relativeToRoot = path.relative(PROJECT_ROOT, resolved);
  const escapesRoot =
    relativeToRoot === '..' ||
    relativeToRoot.startsWith(`..${path.sep}`) ||
    path.isAbsolute(relativeToRoot); // e.g. a different drive on Windows
  if (escapesRoot) {
    throw new McpError(ErrorCode.InvalidRequest, 'Path traversal detected. Access denied.');
  }

  return resolved;
};
|
||||
135
pdf-reader-mcp/test/benchmark/readPdf.bench.ts
Normal file
135
pdf-reader-mcp/test/benchmark/readPdf.bench.ts
Normal file
@@ -0,0 +1,135 @@
|
||||
import { describe, bench, vi as _vi } from 'vitest'; // Prefix unused import
|
||||
import { handleReadPdfFunc } from '../../src/handlers/readPdf'; // Adjust path as needed
|
||||
import path from 'node:path';
|
||||
import fs from 'node:fs/promises';
|
||||
|
||||
// Vitest runs from the project root by default, so the CWD serves as the root here.
const PROJECT_ROOT = process.cwd();
const SAMPLE_PDF_PATH = 'test/fixtures/sample.pdf'; // Relative path to test PDF

// Check once, up front, whether the sample PDF is available so benchmarks that
// need it can skip themselves instead of failing.
const pdfExists = await (async (): Promise<boolean> => {
  try {
    await fs.access(path.resolve(PROJECT_ROOT, SAMPLE_PDF_PATH));
    return true;
  } catch (error: unknown) {
    const details = error instanceof Error ? error.message : String(error);
    console.warn(
      `Warning: Sample PDF not found at ${SAMPLE_PDF_PATH}. Benchmarks requiring it will be skipped. Details: ${details}`
    );
    return false;
  }
})();
|
||||
|
||||
describe('read_pdf Handler Benchmarks', () => {
  // One entry per benchmark: its name, the handler arguments, whether it needs
  // the sample PDF, and the wording used if the handler call rejects.
  interface BenchmarkCase {
    name: string;
    args: Record<string, unknown>;
    needsSamplePdf: boolean;
    failureText: string;
  }

  const benchmarkCases: BenchmarkCase[] = [
    {
      // Metadata and page count only
      name: 'Get Metadata & Page Count',
      args: {
        sources: [{ path: SAMPLE_PDF_PATH }],
        include_metadata: true,
        include_page_count: true,
        include_full_text: false,
      },
      needsSamplePdf: true,
      failureText: 'failed',
    },
    {
      // Full text of the document
      name: 'Get Full Text',
      args: {
        sources: [{ path: SAMPLE_PDF_PATH }],
        include_metadata: false,
        include_page_count: false,
        include_full_text: true,
      },
      needsSamplePdf: true,
      failureText: 'failed',
    },
    {
      // A single specific page
      name: 'Get Specific Page (Page 1)',
      args: {
        sources: [{ path: SAMPLE_PDF_PATH, pages: [1] }],
        include_metadata: false,
        include_page_count: false,
        include_full_text: false, // Should be ignored when pages is set
      },
      needsSamplePdf: true,
      failureText: 'failed',
    },
    {
      // Several specific pages (assumes sample.pdf has at least 2 pages)
      name: 'Get Specific Pages (Pages 1 & 2)',
      args: {
        sources: [{ path: SAMPLE_PDF_PATH, pages: [1, 2] }],
        include_metadata: false,
        include_page_count: false,
      },
      needsSamplePdf: true,
      failureText: 'failed',
    },
    {
      // Error path: the file does not exist, so this runs even without the sample PDF
      name: 'Handle Non-Existent File',
      args: {
        sources: [{ path: 'non/existent/file.pdf' }],
        include_metadata: true,
        include_page_count: true,
      },
      needsSamplePdf: false,
      failureText: 'unexpectedly failed internally',
    },
  ];

  for (const { name, args, needsSamplePdf, failureText } of benchmarkCases) {
    bench(
      name,
      async () => {
        if (needsSamplePdf && !pdfExists) return; // Skip if PDF doesn't exist
        try {
          await handleReadPdfFunc(args);
        } catch (error: unknown) {
          const details = error instanceof Error ? error.message : String(error);
          console.warn(`Benchmark '${name}' ${failureText}: ${details}`);
        }
      },
      { time: 1000 } // Run for 1 second
    );
  }

  // Add more benchmarks as needed (e.g., larger PDFs, URL sources if feasible in benchmark)
});
|
||||
BIN
pdf-reader-mcp/test/fixtures/sample.pdf
vendored
Normal file
BIN
pdf-reader-mcp/test/fixtures/sample.pdf
vendored
Normal file
Binary file not shown.
753
pdf-reader-mcp/test/handlers/readPdf.test.ts
Normal file
753
pdf-reader-mcp/test/handlers/readPdf.test.ts
Normal file
@@ -0,0 +1,753 @@
|
||||
import { describe, it, expect, vi, beforeEach, beforeAll } from 'vitest';
|
||||
import { McpError, ErrorCode } from '@modelcontextprotocol/sdk/types.js';
|
||||
import { resolvePath } from '../../src/utils/pathUtils.js';
|
||||
import * as pathUtils from '../../src/utils/pathUtils.js'; // Import the module itself for spying
|
||||
|
||||
// Define a type for the expected structure after JSON.parse
|
||||
/** Shape the tests expect after `JSON.parse`-ing the text payload returned by the handler. */
interface ExpectedResultType {
  /** One entry per requested source. */
  results: {
    source: string;
    success: boolean;
    data?: object;
    error?: string;
  }[];
}
|
||||
|
||||
// --- Mocking pdfjs-dist ---
|
||||
// Shared mock functions; their behaviour is (re)configured in `beforeEach`.
const mockGetMetadata = vi.fn();
const mockGetPage = vi.fn();
const mockGetDocument = vi.fn();
const mockReadFile = vi.fn();

// Replace the pdfjs entry point so `getDocument` resolves to the mock above.
vi.doMock('pdfjs-dist/legacy/build/pdf.mjs', () => ({
  getDocument: mockGetDocument,
}));

// Replace `node:fs/promises`, exposing `readFile` as both a named export and on the default export.
vi.doMock('node:fs/promises', () => ({
  __esModule: true,
  default: { readFile: mockReadFile },
  readFile: mockReadFile,
}));
|
||||
|
||||
// Dynamically import the handler *once* after mocks are defined
|
||||
// Define a more specific type for the handler's return value content
|
||||
/** A single content item in the handler's return value, as consumed by these tests. */
interface HandlerResultContent {
  /** Content kind; the tests assert the value 'text'. */
  type: string;
  /** Payload string; the tests parse it with `JSON.parse`. */
  text: string;
}
|
||||
let handler: (args: unknown) => Promise<{ content: HandlerResultContent[] }>;
|
||||
|
||||
beforeAll(async () => {
  // Import dynamically so the module is loaded after the `vi.doMock` registrations above.
  const readPdfModule = await import('../../src/handlers/readPdf.js');
  handler = readPdfModule.readPdfToolDefinition.handler;
});
|
||||
|
||||
// Renamed describe block as it now only tests the handler
|
||||
describe('handleReadPdfFunc Integration Tests', () => {
|
||||
beforeEach(() => {
  vi.resetAllMocks();

  // `resolvePath` is stubbed to return its argument unchanged.
  vi.spyOn(pathUtils, 'resolvePath').mockImplementation((p) => p);

  // Every local read yields the same placeholder bytes.
  mockReadFile.mockResolvedValue(Buffer.from('mock pdf content'));

  // Default document: three pages, backed by the shared metadata/page mocks.
  const mockDocumentAPI = {
    numPages: 3,
    getMetadata: mockGetMetadata,
    getPage: mockGetPage,
  };
  mockGetDocument.mockReturnValue({ promise: Promise.resolve(mockDocumentAPI) });

  // Default metadata: an `info` record plus a small metadata object backed by a Map.
  mockGetMetadata.mockResolvedValue({
    info: { PDFFormatVersion: '1.7', Title: 'Mock PDF' },
    metadata: {
      _metadataMap: new Map([['dc:format', 'application/pdf']]),
      get(key: string) {
        return this._metadataMap.get(key);
      },
      has(key: string) {
        return this._metadataMap.has(key);
      },
      getAll() {
        return Object.fromEntries(this._metadataMap);
      },
    },
  });

  // Default pages: pages 1..numPages return text, anything else throws synchronously.
  mockGetPage.mockImplementation((pageNum: number) => {
    const inRange = pageNum > 0 && pageNum <= mockDocumentAPI.numPages;
    if (!inRange) {
      throw new Error(`Mock getPage error: Invalid page number ${String(pageNum)}`);
    }
    const getTextContent = vi
      .fn()
      .mockResolvedValueOnce({ items: [{ str: `Mock page text ${String(pageNum)}` }] });
    return { getTextContent };
  });
});
|
||||
|
||||
// Removed unit tests for parsePageRanges
|
||||
|
||||
// --- Integration Tests for handleReadPdfFunc ---
|
||||
|
||||
it('should successfully read full text, metadata, and page count for a local file', async () => {
  // Ask for every optional section of a single local source.
  const result = await handler({
    sources: [{ path: 'test.pdf' }],
    include_full_text: true,
    include_metadata: true,
    include_page_count: true,
  });

  const expectedPayload = {
    results: [
      {
        source: 'test.pdf',
        success: true,
        data: {
          info: { PDFFormatVersion: '1.7', Title: 'Mock PDF' },
          metadata: { 'dc:format': 'application/pdf' },
          num_pages: 3,
          full_text: 'Mock page text 1\n\nMock page text 2\n\nMock page text 3',
        },
      },
    ],
  };

  // The file is read, its bytes are handed to pdfjs, and all three mocked pages are fetched.
  expect(mockReadFile).toHaveBeenCalledWith(resolvePath('test.pdf'));
  expect(mockGetDocument).toHaveBeenCalledWith(Buffer.from('mock pdf content'));
  expect(mockGetMetadata).toHaveBeenCalled();
  expect(mockGetPage).toHaveBeenCalledTimes(3);

  expect(result.content).toBeDefined();
  expect(result.content.length).toBeGreaterThan(0);
  // eslint-disable-next-line @typescript-eslint/no-unnecessary-condition
  const firstEntry = result.content?.[0];
  if (firstEntry) {
    expect(firstEntry.type).toBe('text');
    expect(JSON.parse(firstEntry.text) as ExpectedResultType).toEqual(expectedPayload);
  } else {
    expect.fail('result.content[0] was undefined');
  }
});
|
||||
|
||||
it('should successfully read specific pages for a local file', async () => {
  // Pages 1 and 3 only, with metadata switched off.
  const result = await handler({
    sources: [{ path: 'test.pdf', pages: [1, 3] }],
    include_metadata: false,
    include_page_count: true,
  });

  const expectedPayload = {
    results: [
      {
        source: 'test.pdf',
        success: true,
        data: {
          num_pages: 3,
          page_texts: [
            { page: 1, text: 'Mock page text 1' },
            { page: 3, text: 'Mock page text 3' },
          ],
        },
      },
    ],
  };

  // Exactly the two requested pages are fetched; metadata is never requested.
  expect(mockGetPage).toHaveBeenCalledTimes(2);
  expect(mockGetPage).toHaveBeenCalledWith(1);
  expect(mockGetPage).toHaveBeenCalledWith(3);
  expect(mockReadFile).toHaveBeenCalledWith(resolvePath('test.pdf'));
  expect(mockGetDocument).toHaveBeenCalledWith(Buffer.from('mock pdf content'));
  expect(mockGetMetadata).not.toHaveBeenCalled();

  expect(result.content).toBeDefined();
  expect(result.content.length).toBeGreaterThan(0);
  // eslint-disable-next-line @typescript-eslint/no-unnecessary-condition
  const firstEntry = result.content?.[0];
  if (firstEntry) {
    expect(firstEntry.type).toBe('text');
    expect(JSON.parse(firstEntry.text) as ExpectedResultType).toEqual(expectedPayload);
  } else {
    expect.fail('result.content[0] was undefined');
  }
});
|
||||
|
||||
it('should successfully read specific pages using string range', async () => {
  // The page selection is given as a string: single page 1 plus the range 3-3.
  const result = await handler({
    sources: [{ path: 'test.pdf', pages: '1,3-3' }],
    include_page_count: true,
  });

  const expectedPayload = {
    results: [
      {
        source: 'test.pdf',
        success: true,
        data: {
          info: { PDFFormatVersion: '1.7', Title: 'Mock PDF' },
          metadata: { 'dc:format': 'application/pdf' },
          num_pages: 3,
          page_texts: [
            { page: 1, text: 'Mock page text 1' },
            { page: 3, text: 'Mock page text 3' },
          ],
        },
      },
    ],
  };

  expect(result.content).toBeDefined();
  expect(result.content.length).toBeGreaterThan(0);
  // eslint-disable-next-line @typescript-eslint/no-unnecessary-condition
  const firstEntry = result.content?.[0];
  if (firstEntry) {
    expect(JSON.parse(firstEntry.text) as ExpectedResultType).toEqual(expectedPayload);
  } else {
    expect.fail('result.content[0] was undefined');
  }
});
|
||||
|
||||
it('should successfully read metadata only for a URL', async () => {
  const testUrl = 'http://example.com/test.pdf';

  // Only metadata is requested for a URL source.
  const result = await handler({
    sources: [{ url: testUrl }],
    include_full_text: false,
    include_metadata: true,
    include_page_count: false,
  });

  const expectedPayload = {
    results: [
      {
        source: testUrl,
        success: true,
        data: {
          info: { PDFFormatVersion: '1.7', Title: 'Mock PDF' },
          metadata: { 'dc:format': 'application/pdf' },
        },
      },
    ],
  };

  // No local read happens; pdfjs receives the URL and no page is fetched.
  expect(mockReadFile).not.toHaveBeenCalled();
  expect(mockGetDocument).toHaveBeenCalledWith({ url: testUrl });
  expect(mockGetMetadata).toHaveBeenCalled();
  expect(mockGetPage).not.toHaveBeenCalled();

  expect(result.content).toBeDefined();
  expect(result.content.length).toBeGreaterThan(0);
  // eslint-disable-next-line @typescript-eslint/no-unnecessary-condition
  const firstEntry = result.content?.[0];
  if (firstEntry) {
    expect(firstEntry.type).toBe('text');
    expect(JSON.parse(firstEntry.text) as ExpectedResultType).toEqual(expectedPayload);
  } else {
    expect.fail('result.content[0] was undefined');
  }
});
|
||||
|
||||
it('should handle multiple sources with different options', async () => {
  const urlSource = 'http://example.com/another.pdf';

  // Second (URL) document: two pages with their own text and metadata mocks.
  const urlPageTexts = new Map<number, string>([
    [1, 'URL Mock page text 1'],
    [2, 'URL Mock page text 2'],
  ]);
  const secondMockGetPage = vi.fn().mockImplementation((pageNum: number) => {
    const text = urlPageTexts.get(pageNum);
    if (text === undefined) {
      throw new Error(`Mock getPage error: Invalid page number ${String(pageNum)}`);
    }
    return { getTextContent: vi.fn().mockResolvedValue({ items: [{ str: text }] }) };
  });
  const secondMockGetMetadata = vi.fn().mockResolvedValue({
    info: { Title: 'URL PDF' },
    metadata: { getAll: () => ({ 'dc:creator': 'URL Author' }) },
  });
  const secondLoadingTaskAPI = {
    promise: Promise.resolve({
      numPages: 2,
      getMetadata: secondMockGetMetadata,
      getPage: secondMockGetPage,
    }),
  };

  // Route getDocument by input: the matching URL gets the second document,
  // everything else gets a three-page document using the shared mocks.
  mockGetDocument.mockReset();
  mockGetDocument.mockImplementation((source: Buffer | { url: string }) => {
    const isUrlRequest =
      typeof source === 'object' && !Buffer.isBuffer(source) && source.url === urlSource;
    if (isUrlRequest) {
      return secondLoadingTaskAPI;
    }
    return {
      promise: Promise.resolve({
        numPages: 3,
        getMetadata: mockGetMetadata,
        getPage: mockGetPage,
      }),
    };
  });

  const result = await handler({
    sources: [{ path: 'local.pdf', pages: [1] }, { url: urlSource }],
    include_full_text: true,
    include_metadata: true,
    include_page_count: true,
  });

  const expectedPayload = {
    results: [
      {
        source: 'local.pdf',
        success: true,
        data: {
          info: { PDFFormatVersion: '1.7', Title: 'Mock PDF' },
          metadata: { 'dc:format': 'application/pdf' },
          num_pages: 3,
          page_texts: [{ page: 1, text: 'Mock page text 1' }],
        },
      },
      {
        source: urlSource,
        success: true,
        data: {
          info: { Title: 'URL PDF' },
          metadata: { 'dc:creator': 'URL Author' },
          num_pages: 2,
          full_text: 'URL Mock page text 1\n\nURL Mock page text 2',
        },
      },
    ],
  };

  // One local read, two documents opened, one page from the local file, two from the URL.
  expect(mockReadFile).toHaveBeenCalledOnce();
  expect(mockReadFile).toHaveBeenCalledWith(resolvePath('local.pdf'));
  expect(mockGetDocument).toHaveBeenCalledTimes(2);
  expect(mockGetDocument).toHaveBeenCalledWith(Buffer.from('mock pdf content'));
  expect(mockGetDocument).toHaveBeenCalledWith({ url: urlSource });
  expect(mockGetPage).toHaveBeenCalledTimes(1);
  expect(secondMockGetPage).toHaveBeenCalledTimes(2);

  expect(result.content).toBeDefined();
  expect(result.content.length).toBeGreaterThan(0);
  // eslint-disable-next-line @typescript-eslint/no-unnecessary-condition
  const firstEntry = result.content?.[0];
  if (firstEntry) {
    expect(JSON.parse(firstEntry.text) as ExpectedResultType).toEqual(expectedPayload);
  } else {
    expect.fail('result.content[0] was undefined');
  }
});
|
||||
|
||||
// --- Error Handling Tests ---
|
||||
|
||||
it('should return error if local file not found', async () => {
  // Make the file read reject with an ENOENT-coded error.
  const notFound: NodeJS.ErrnoException = Object.assign(new Error('Mock ENOENT'), {
    code: 'ENOENT',
  });
  mockReadFile.mockRejectedValue(notFound);

  const result = await handler({ sources: [{ path: 'nonexistent.pdf' }] });

  const expectedPayload = {
    results: [
      {
        source: 'nonexistent.pdf',
        success: false,
        error: `MCP error -32600: File not found at 'nonexistent.pdf'.`,
      },
    ],
  };

  expect(result.content).toBeDefined();
  expect(result.content.length).toBeGreaterThan(0);
  // eslint-disable-next-line @typescript-eslint/no-unnecessary-condition
  const firstEntry = result.content?.[0];
  if (firstEntry) {
    expect(JSON.parse(firstEntry.text) as ExpectedResultType).toEqual(expectedPayload);
  } else {
    expect.fail('result.content[0] was undefined');
  }
});
|
||||
|
||||
it('should return error if pdfjs fails to load document', async () => {
  // The loading task's promise rejects, simulating a pdfjs load failure.
  const loadError = new Error('Mock PDF loading failed');
  mockGetDocument.mockReturnValue({ promise: Promise.reject(loadError) });

  const result = await handler({ sources: [{ path: 'bad.pdf' }] });

  expect(result.content).toBeDefined();
  expect(result.content.length).toBeGreaterThan(0);
  // eslint-disable-next-line @typescript-eslint/no-unnecessary-condition
  const firstEntry = result.content?.[0];
  if (firstEntry) {
    const parsed = JSON.parse(firstEntry.text) as ExpectedResultType;
    const outcome = parsed.results[0];
    expect(outcome).toBeDefined();
    if (outcome) {
      expect(outcome.success).toBe(false);
      // The reported message names the source and carries the underlying reason.
      expect(outcome.error).toBe(
        `MCP error -32600: Failed to load PDF document from bad.pdf. Reason: ${loadError.message}`
      );
    }
  } else {
    expect.fail('result.content[0] was undefined');
  }
});
|
||||
|
||||
it('should throw McpError for invalid input arguments (Zod error)', async () => {
  // `include_full_text` is a string here; the expected message says a boolean was expected.
  const badArgs = { sources: [{ path: 'test.pdf' }], include_full_text: 'yes' };
  const invoke = () => handler(badArgs);

  await expect(invoke()).rejects.toThrow(McpError);
  await expect(invoke()).rejects.toThrow(
    /Invalid arguments: include_full_text \(Expected boolean, received string\)/
  );
  await expect(invoke()).rejects.toHaveProperty('code', ErrorCode.InvalidParams);
});
|
||||
|
||||
// Test case for the initial Zod parse failure
|
||||
it('should throw McpError if top-level argument parsing fails', async () => {
  // The argument object lacks `sources` entirely.
  const invalidArgs = { invalid_prop: true };
  const invoke = () => handler(invalidArgs);

  await expect(invoke()).rejects.toThrow(McpError);
  await expect(invoke()).rejects.toThrow(/Invalid arguments: sources \(Required\)/);
  await expect(invoke()).rejects.toHaveProperty('code', ErrorCode.InvalidParams);
});
|
||||
|
||||
// Updated test: Expect Zod validation to throw McpError directly
|
||||
it('should throw McpError for invalid page specification string (Zod)', async () => {
  // The page string contains letters, which the expected message rejects.
  const badArgs = { sources: [{ path: 'test.pdf', pages: '1,abc,3' }] };
  const invoke = () => handler(badArgs);

  await expect(invoke()).rejects.toThrow(McpError);
  await expect(invoke()).rejects.toThrow(
    /Invalid arguments: sources.0.pages \(Page string must contain only numbers, commas, and hyphens.\)/
  );
  await expect(invoke()).rejects.toHaveProperty('code', ErrorCode.InvalidParams);
});
|
||||
|
||||
// Updated test: Expect Zod validation to throw McpError directly
|
||||
it('should throw McpError for invalid page specification array (non-positive - Zod)', async () => {
  // The second array element is 0; the expected message requires numbers greater than 0.
  const badArgs = { sources: [{ path: 'test.pdf', pages: [1, 0, 3] }] };
  const invoke = () => handler(badArgs);

  await expect(invoke()).rejects.toThrow(McpError);
  await expect(invoke()).rejects.toThrow(
    /Invalid arguments: sources.0.pages.1 \(Number must be greater than 0\)/
  );
  await expect(invoke()).rejects.toHaveProperty('code', ErrorCode.InvalidParams);
});
|
||||
|
||||
// Test case for resolvePath failure within the catch block
|
||||
it('should return error if resolvePath fails', async () => {
  // Make every `resolvePath` call throw.
  const resolveError = new Error('Mock resolvePath failed');
  vi.spyOn(pathUtils, 'resolvePath').mockImplementation(() => {
    throw resolveError;
  });

  const result = await handler({ sources: [{ path: 'some/path' }] });

  expect(result.content).toBeDefined();
  expect(result.content.length).toBeGreaterThan(0);
  // eslint-disable-next-line @typescript-eslint/no-unnecessary-condition
  const firstEntry = result.content?.[0];
  if (firstEntry) {
    const parsed = JSON.parse(firstEntry.text) as ExpectedResultType;
    const outcome = parsed.results[0];
    expect(outcome).toBeDefined();
    if (outcome) {
      expect(outcome.success).toBe(false);
      // The failure is reported per source, with the MCP code prefix and the original reason.
      expect(outcome.error).toBe(
        `MCP error -32600: Failed to prepare PDF source some/path. Reason: ${resolveError.message}`
      );
    }
  } else {
    expect.fail('result.content[0] was undefined');
  }
});
|
||||
|
||||
// Test case for the final catch block with a generic error
|
||||
it('should handle generic errors during processing', async () => {
  // The file read rejects with a plain Error that carries no error code.
  const genericError = new Error('Something unexpected happened');
  mockReadFile.mockRejectedValue(genericError);

  const result = await handler({ sources: [{ path: 'generic/error/path' }] });

  expect(result.content).toBeDefined();
  expect(result.content.length).toBeGreaterThan(0);
  // eslint-disable-next-line @typescript-eslint/no-unnecessary-condition
  const firstEntry = result.content?.[0];
  if (firstEntry) {
    const parsed = JSON.parse(firstEntry.text) as ExpectedResultType;
    const outcome = parsed.results[0];
    expect(outcome).toBeDefined();
    if (outcome) {
      expect(outcome.success).toBe(false);
      // The message carries the MCP code prefix, the source path, and the original reason.
      expect(outcome.error).toBe(
        `MCP error -32600: Failed to prepare PDF source generic/error/path. Reason: ${genericError.message}`
      );
    }
  } else {
    expect.fail('result.content[0] was undefined');
  }
});
|
||||
|
||||
// Test case for the final catch block with a non-Error object
|
||||
it('should handle non-Error exceptions during processing', async () => {
  // The file read rejects with a plain object rather than an Error instance.
  const nonError = { message: 'Just an object', code: 'UNEXPECTED' };
  mockReadFile.mockRejectedValue(nonError);

  const result = await handler({ sources: [{ path: 'non/error/path' }] });

  expect(result.content).toBeDefined();
  expect(result.content.length).toBeGreaterThan(0);
  // eslint-disable-next-line @typescript-eslint/no-unnecessary-condition
  const firstEntry = result.content?.[0];
  if (firstEntry) {
    const parsed = JSON.parse(firstEntry.text) as ExpectedResultType;
    const outcome = parsed.results[0];
    expect(outcome).toBeDefined();
    if (outcome) {
      expect(outcome.success).toBe(false);
      // The expected reason is the default string form of the object ("[object Object]"),
      // not a JSON rendering of it.
      expect(outcome.error).toBe(
        `MCP error -32600: Failed to prepare PDF source non/error/path. Reason: [object Object]`
      );
    }
  } else {
    expect.fail('result.content[0] was undefined');
  }
});
|
||||
|
||||
it('should include warnings for requested pages exceeding total pages', async () => {
  // Pages 4 and 5 lie beyond the three pages of the mocked document.
  const result = await handler({
    sources: [{ path: 'test.pdf', pages: [1, 4, 5] }],
    include_page_count: true,
  });

  const expectedPayload = {
    results: [
      {
        source: 'test.pdf',
        success: true,
        data: {
          info: { PDFFormatVersion: '1.7', Title: 'Mock PDF' },
          metadata: { 'dc:format': 'application/pdf' },
          num_pages: 3,
          page_texts: [{ page: 1, text: 'Mock page text 1' }],
          warnings: ['Requested page numbers 4, 5 exceed total pages (3).'],
        },
      },
    ],
  };

  // Only the in-range page is fetched.
  expect(mockGetPage).toHaveBeenCalledTimes(1);
  expect(mockGetPage).toHaveBeenCalledWith(1);

  expect(result.content).toBeDefined();
  expect(result.content.length).toBeGreaterThan(0);
  // eslint-disable-next-line @typescript-eslint/no-unnecessary-condition
  const firstEntry = result.content?.[0];
  if (firstEntry) {
    expect(JSON.parse(firstEntry.text) as ExpectedResultType).toEqual(expectedPayload);
  } else {
    expect.fail('result.content[0] was undefined');
  }
});
|
||||
|
||||
it('should handle errors during page processing gracefully when specific pages are requested', async () => {
  // Pages 1 and 3 yield text; fetching page 2 throws.
  mockGetPage.mockImplementation((pageNum: number) => {
    if (pageNum === 1)
      return {
        getTextContent: vi.fn().mockResolvedValueOnce({ items: [{ str: `Mock page text 1` }] }),
      };
    if (pageNum === 2) throw new Error('Failed to get page 2');
    if (pageNum === 3)
      return {
        getTextContent: vi.fn().mockResolvedValueOnce({ items: [{ str: `Mock page text 3` }] }),
      };
    throw new Error(`Mock getPage error: Invalid page number ${String(pageNum)}`);
  });
  const args = {
    sources: [{ path: 'test.pdf', pages: [1, 2, 3] }],
  };
  const result = await handler(args);
  // The failing page is reported inline as an error text; the source as a whole still succeeds.
  const expectedData = {
    results: [
      {
        source: 'test.pdf',
        success: true,
        data: {
          info: { PDFFormatVersion: '1.7', Title: 'Mock PDF' },
          metadata: { 'dc:format': 'application/pdf' },
          num_pages: 3,
          page_texts: [
            { page: 1, text: 'Mock page text 1' },
            { page: 2, text: 'Error processing page: Failed to get page 2' },
            { page: 3, text: 'Mock page text 3' },
          ],
        },
      },
    ],
  };
  expect(mockGetPage).toHaveBeenCalledTimes(3);
  expect(result.content).toBeDefined();
  expect(result.content.length).toBeGreaterThan(0);
  // eslint-disable-next-line @typescript-eslint/no-unnecessary-condition
  if (result.content?.[0]) {
    expect(JSON.parse(result.content[0].text) as ExpectedResultType).toEqual(expectedData);
  } else {
    expect.fail('result.content[0] was undefined');
  }
});

// This test used to be declared inside the `else` branch of the previous test, so it was
// never registered or executed. It is now a sibling test.
// NOTE(review): the local-path counterpart expects "Failed to load PDF document from <source>.";
// confirm the expected message below against the handler now that this test actually runs.
it('should return error if pdfjs fails to load document from URL', async () => {
  const testUrl = 'http://example.com/bad-url.pdf';
  const loadError = new Error('Mock URL PDF loading failed');
  const failingLoadingTask = { promise: Promise.reject(loadError) };

  // Route getDocument: only a `{ url: testUrl }` source gets the failing loading task.
  mockGetDocument.mockReset();
  mockGetDocument.mockImplementation((source: unknown) => {
    if (
      typeof source === 'object' &&
      source !== null &&
      Object.prototype.hasOwnProperty.call(source, 'url') &&
      typeof (source as { url?: unknown }).url === 'string' &&
      (source as { url: string }).url === testUrl
    ) {
      return failingLoadingTask;
    }
    // Any other source resolves to a minimal one-page document.
    const mockDocumentAPI = { numPages: 1, getMetadata: vi.fn(), getPage: vi.fn() };
    return { promise: Promise.resolve(mockDocumentAPI) };
  });

  const args = { sources: [{ url: testUrl }] };
  const result = await handler(args);

  // eslint-disable-next-line @typescript-eslint/no-unnecessary-condition
  if (result.content?.[0]) {
    const parsedResult = JSON.parse(result.content[0].text) as ExpectedResultType;
    expect(parsedResult.results[0]).toBeDefined();
    if (parsedResult.results[0]) {
      // The result entry is labelled with the URL it was requested from.
      expect(parsedResult.results[0].source).toBe(testUrl);
      expect(parsedResult.results[0].success).toBe(false);
      expect(parsedResult.results[0].error).toBe(
        `MCP error -32600: Failed to load PDF document. Reason: ${loadError.message}`
      );
    }
  } else {
    expect.fail('result.content[0] was undefined');
  }
});
|
||||
|
||||
// --- Additional Coverage Tests ---
|
||||
|
||||
it('should not include page count when include_page_count is false', async () => {
  // Every optional section is switched off explicitly.
  const args = {
    sources: [{ path: 'test.pdf' }],
    include_page_count: false,
    include_metadata: false,
    include_full_text: false,
  };
  const result = await handler(args);
  // eslint-disable-next-line @typescript-eslint/no-unnecessary-condition
  if (result.content?.[0]) {
    const parsedResult = JSON.parse(result.content[0].text) as ExpectedResultType;
    const entry = parsedResult.results[0];
    expect(entry).toBeDefined();
    if (entry) {
      // Assert success unconditionally: previously every assertion was guarded by the presence
      // of `data`, so a result without `data` (including a failure) passed without any check.
      expect(entry.success).toBe(true);
      // Fall back to an empty object so the property checks still run when `data` is absent.
      const data = entry.data ?? {};
      expect(data).not.toHaveProperty('num_pages');
      expect(data).not.toHaveProperty('metadata');
      expect(data).not.toHaveProperty('info');
    }
  } else {
    expect.fail('result.content[0] was undefined');
  }
  // Metadata must not be fetched because include_metadata is false.
  expect(mockGetMetadata).not.toHaveBeenCalled();
});
|
||||
|
||||
it('should handle ENOENT error where resolvePath also fails in catch block', async () => {
  const targetPath = 'enoent/and/resolve/fails.pdf';
  const enoentError: NodeJS.ErrnoException = Object.assign(new Error('Mock ENOENT'), {
    code: 'ENOENT',
  });
  const resolveError = new Error('Mock resolvePath failed in catch');

  // First resolvePath call returns its argument; a second call, if one were made, would throw.
  vi.spyOn(pathUtils, 'resolvePath')
    .mockImplementationOnce((p) => p)
    .mockImplementationOnce(() => {
      throw resolveError;
    });
  mockReadFile.mockRejectedValue(enoentError);

  const result = await handler({ sources: [{ path: targetPath }] });

  // eslint-disable-next-line @typescript-eslint/no-unnecessary-condition
  const firstEntry = result.content?.[0];
  if (firstEntry) {
    const parsed = JSON.parse(firstEntry.text) as ExpectedResultType;
    const outcome = parsed.results[0];
    expect(outcome).toBeDefined();
    if (outcome) {
      expect(outcome.success).toBe(false);
      // The reported message is the plain "file not found" text for the requested path.
      expect(outcome.error).toBe(`MCP error -32600: File not found at '${targetPath}'.`);
    }
  } else {
    expect.fail('result.content[0] was undefined');
  }

  // readFile received the path returned by the first resolvePath call.
  expect(mockReadFile).toHaveBeenCalledWith(targetPath);
  // resolvePath ran exactly once (before the read attempt), so the throwing second
  // implementation was never consumed.
  expect(pathUtils.resolvePath).toHaveBeenCalledTimes(1);
});
|
||||
|
||||
// --- Additional Error Coverage Tests ---
|
||||
|
||||
it('should return error for invalid page range string (e.g., 5-3)', async () => {
  // The range "5-3" is reversed; the call resolves and the problem is reported per source.
  const result = await handler({ sources: [{ path: 'test.pdf', pages: '1,5-3,7' }] });

  // eslint-disable-next-line @typescript-eslint/no-unnecessary-condition
  const firstEntry = result.content?.[0];
  if (firstEntry) {
    const parsed = JSON.parse(firstEntry.text) as ExpectedResultType;
    const outcome = parsed.results[0];
    expect(outcome).toBeDefined();
    if (outcome) {
      expect(outcome.success).toBe(false);
      // Only the message content is matched, not the leading MCP code prefix.
      expect(outcome.error).toMatch(
        /Invalid page specification for source test.pdf: Invalid page range values: 5-3/
      );
    }
  } else {
    expect.fail('result.content[0] was undefined');
  }
});
|
||||
|
||||
it('should throw McpError for invalid page number string (e.g., 1,a,3)', async () => {
  // A letter inside the page string makes the call reject outright with a validation error.
  const badArgs = { sources: [{ path: 'test.pdf', pages: '1,a,3' }] };
  const invoke = () => handler(badArgs);

  await expect(invoke()).rejects.toThrow(McpError);
  await expect(invoke()).rejects.toThrow(
    /Invalid arguments: sources.0.pages \(Page string must contain only numbers, commas, and hyphens.\)/
  );
  await expect(invoke()).rejects.toHaveProperty('code', ErrorCode.InvalidParams);
});
|
||||
|
||||
// Test Zod refinement for path/url exclusivity
|
||||
it('should throw McpError if source has both path and url', async () => {
  // A single source entry carries both `path` and `url`.
  const badArgs = { sources: [{ path: 'test.pdf', url: 'http://example.com' }] };
  const invoke = () => handler(badArgs);

  await expect(invoke()).rejects.toThrow(McpError);
  await expect(invoke()).rejects.toThrow(
    /Invalid arguments: sources.0 \(Each source must have either 'path' or 'url', but not both.\)/
  );
  await expect(invoke()).rejects.toHaveProperty('code', ErrorCode.InvalidParams);
});
|
||||
|
||||
it('should throw McpError if source has neither path nor url', async () => {
  // The source entry specifies pages but no `path` or `url`.
  const badArgs = { sources: [{ pages: [1] }] };
  const invoke = () => handler(badArgs);

  await expect(invoke()).rejects.toThrow(McpError);
  await expect(invoke()).rejects.toThrow(
    /Invalid arguments: sources.0 \(Each source must have either 'path' or 'url', but not both.\)/
  );
  await expect(invoke()).rejects.toHaveProperty('code', ErrorCode.InvalidParams);
});
|
||||
}); // End top-level describe
|
||||
108
pdf-reader-mcp/test/pathUtils.test.ts
Normal file
108
pdf-reader-mcp/test/pathUtils.test.ts
Normal file
@@ -0,0 +1,108 @@
|
||||
import { describe, it, expect } from 'vitest'; // Removed beforeEach, vi
|
||||
import path from 'path';
|
||||
import { resolvePath, PROJECT_ROOT } from '../src/utils/pathUtils.js'; // Add .js extension
|
||||
import { McpError, ErrorCode } from '@modelcontextprotocol/sdk/types.js';
|
||||
|
||||
// Mock PROJECT_ROOT for consistent testing if needed, or use the actual one
|
||||
// For this test, using the actual PROJECT_ROOT derived from process.cwd() is likely fine,
|
||||
// but be aware it depends on where the test runner executes.
|
||||
// If consistency across environments is critical, mocking might be better.
|
||||
// vi.mock('../src/utils/pathUtils', async (importOriginal) => {
|
||||
// const original = await importOriginal();
|
||||
// return {
|
||||
// ...original,
|
||||
// PROJECT_ROOT: '/mock/project/root', // Example mock path
|
||||
// };
|
||||
// });
|
||||
|
||||
describe('resolvePath Utility', () => {
|
||||
it('should resolve a valid relative path correctly', () => {
  // A plain relative path should resolve to the same location as joining it onto PROJECT_ROOT.
  const relativeInput = 'some/file.txt';
  const resolved = resolvePath(relativeInput);

  expect(resolved).toBe(path.resolve(PROJECT_ROOT, relativeInput));
});
|
||||
|
||||
it('should resolve paths with "." correctly', () => {
  // "." segments are no-ops, so the result must equal the path with them removed.
  const dottedInput = './some/./other/file.txt';
  const resolved = resolvePath(dottedInput);

  expect(resolved).toBe(path.resolve(PROJECT_ROOT, 'some/other/file.txt'));
});
|
||||
|
||||
it('should resolve paths with ".." correctly within the project root', () => {
  // The ".." cancels out "folder", so the resolved path stays inside PROJECT_ROOT.
  const inputWithParentSegment = 'some/folder/../other/file.txt';
  const resolved = resolvePath(inputWithParentSegment);

  expect(resolved).toBe(path.resolve(PROJECT_ROOT, 'some/other/file.txt'));
});
|
||||
|
||||
it('should throw McpError for path traversal attempts', () => {
  // A leading ".." points one level above PROJECT_ROOT.
  const userPath = '../outside/secret.txt';

  expect(() => resolvePath(userPath)).toThrow(McpError);
  expect(() => resolvePath(userPath)).toThrow('Path traversal detected. Access denied.');

  // Capture the error first and assert afterwards, so the error-code check always runs
  // instead of being silently skipped when nothing is thrown.
  let caught: unknown;
  try {
    resolvePath(userPath);
  } catch (e) {
    caught = e;
  }
  expect(caught).toBeInstanceOf(McpError);
  expect((caught as McpError).code).toBe(ErrorCode.InvalidRequest);
});
|
||||
|
||||
it('should throw McpError for path traversal attempts even if seemingly valid', () => {
  // Build a relative path with more ".." segments than PROJECT_ROOT has components,
  // so it is guaranteed to climb past the project root.
  const levelsUp = PROJECT_ROOT.split(path.sep).filter(Boolean).length + 2;
  const parentSegments = Array.from({ length: levelsUp }, () => '..');
  const userPath = path.join(...parentSegments, 'secret.txt');

  expect(() => resolvePath(userPath)).toThrow(McpError);
  expect(() => resolvePath(userPath)).toThrow('Path traversal detected. Access denied.');

  // Capture the error first and assert afterwards, so the error-code check always runs
  // instead of being silently skipped when nothing is thrown.
  let caught: unknown;
  try {
    resolvePath(userPath);
  } catch (e) {
    caught = e;
  }
  expect(caught).toBeInstanceOf(McpError);
  expect((caught as McpError).code).toBe(ErrorCode.InvalidRequest);
});
|
||||
|
||||
it('should throw McpError for absolute paths', () => {
  // An absolute path that happens to point inside the project must still be rejected.
  const userPath = path.resolve(PROJECT_ROOT, 'absolute/file.txt');
  // Only the absolute-path style native to the current platform is exercised:
  // POSIX-style when the separator is '/', Windows-style otherwise.
  const platformAbsolutePath = path.sep === '/' ? '/absolute/file.txt' : 'C:\\absolute\\file.txt';

  expect(() => resolvePath(userPath)).toThrow(McpError);
  expect(() => resolvePath(userPath)).toThrow('Absolute paths are not allowed.');

  expect(() => resolvePath(platformAbsolutePath)).toThrow(McpError);
  expect(() => resolvePath(platformAbsolutePath)).toThrow('Absolute paths are not allowed.');

  // Capture the error first and assert afterwards, so the error-code check always runs
  // instead of being silently skipped when nothing is thrown.
  let caught: unknown;
  try {
    resolvePath(userPath);
  } catch (e) {
    caught = e;
  }
  expect(caught).toBeInstanceOf(McpError);
  expect((caught as McpError).code).toBe(ErrorCode.InvalidParams);
});
|
||||
|
||||
it('should throw McpError for non-string input', () => {
  // Deliberately bypass the type system to feed a number where a string is required.
  const userPath = 123 as unknown as string;

  expect(() => resolvePath(userPath)).toThrow(McpError);
  expect(() => resolvePath(userPath)).toThrow('Path must be a string.');

  // Capture the error first and assert afterwards, so the error-code check always runs
  // instead of being silently skipped when nothing is thrown.
  let caught: unknown;
  try {
    resolvePath(userPath);
  } catch (e) {
    caught = e;
  }
  expect(caught).toBeInstanceOf(McpError);
  expect((caught as McpError).code).toBe(ErrorCode.InvalidParams);
});
|
||||
|
||||
it('should handle empty string input', () => {
  // An empty path is expected to resolve to the project root itself.
  const emptyInput = '';
  const resolved = resolvePath(emptyInput);

  expect(resolved).toBe(path.resolve(PROJECT_ROOT, ''));
});
|
||||
});
|
||||
21
pdf-reader-mcp/tsconfig.eslint.json
Normal file
21
pdf-reader-mcp/tsconfig.eslint.json
Normal file
@@ -0,0 +1,21 @@
|
||||
{
|
||||
// Extend the main tsconfig.json
|
||||
"extends": "./tsconfig.json",
|
||||
// Include source files AND test files for ESLint
|
||||
"include": [
|
||||
"src/**/*.ts",
|
||||
"test/**/*.ts",
|
||||
"eslint.config.js", // Include ESLint config itself if needed
|
||||
"vitest.config.ts",
|
||||
"commitlint.config.cjs",
|
||||
".prettierrc.cjs"
|
||||
// Add other JS/TS config files if necessary
|
||||
],
|
||||
// Exclude the same files as the main config, plus potentially others
|
||||
"exclude": [
|
||||
"node_modules",
|
||||
"dist",
|
||||
"coverage"
|
||||
// No need to exclude test files here as we want to lint them
|
||||
]
|
||||
}
|
||||
Reference in New Issue
Block a user