Friday, March 30, 2012

Ajax-crawling approach to make a GWT app crawlable works on the default GWT example, but not on my app

Hi all,
I was reading https://developers.google.com/webmasters/ajax-crawling/
on how to make AJAX apps (and consequently GWT apps) crawlable.
I took the code from Google (summarized in point 3 of "How to create
an HTML snapshot?") to create a filter that returns the rendered HTML
for an AJAX page using HtmlUnit, and changed the web.xml accordingly.
I created a new GWT project with the example code, applied the filter
and the web.xml there, and it worked right away.
However, I did exactly the same on the GWT app I want to make
searchable, and there it doesn't work. For some reason, the only
requests the filter ever sees are the ones for the RPC calls.
I think I must be missing a terribly simple detail, but I'm a bit lost
on where to go from here.
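
Just to make sure I understand the scheme correctly: the crawler is
supposed to replace the #! fragment with _escaped_fragment_, and the
filter turns that back into the #! URL before fetching the page with
HtmlUnit. A tiny standalone sketch of that mapping (the "home" history
token is only an illustrative example, not from my real app):

import java.net.URLDecoder;

public class EscapedFragmentDemo {
  public static void main(String[] args) throws Exception {
    // What the crawler is supposed to request instead of #!home:
    String queryString = "_escaped_fragment_=home";
    int i = queryString.indexOf("_escaped_fragment_");
    // What the filter rewrites it back to before fetching with HtmlUnit
    // ("_escaped_fragment_=" is 19 characters long):
    String fragment = URLDecoder.decode(queryString.substring(i + 19), "UTF-8");
    System.out.println("#!" + fragment); // prints "#!home"
  }
}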


Below you can see the code for the filter (CrawlServlet) and the
web.xml.

package crawltest.server;

import com.gargoylesoftware.htmlunit.BrowserVersion;
import com.gargoylesoftware.htmlunit.WebClient;
import com.gargoylesoftware.htmlunit.html.HtmlPage;

import java.io.IOException;
import java.io.PrintWriter;
import java.io.UnsupportedEncodingException;
import java.net.URLDecoder;
import java.util.logging.Logger;

import javax.servlet.Filter;
import javax.servlet.FilterChain;
import javax.servlet.FilterConfig;
import javax.servlet.ServletException;
import javax.servlet.ServletRequest;
import javax.servlet.ServletResponse;
import javax.servlet.http.HttpServletRequest;
import javax.servlet.http.HttpServletResponse;

/**
 * Servlet that makes this application crawlable
 */
public final class CrawlServlet implements Filter {

  private static final Logger logger =
      Logger.getLogger(CrawlServlet.class.getName());

  private static String rewriteQueryString(String queryString)
      throws UnsupportedEncodingException {
    StringBuilder queryStringSb = new StringBuilder(queryString);

    int i = queryStringSb.indexOf("&_escaped_fragment_");
    if (i != -1) {
      StringBuilder tmpSb = new StringBuilder(queryStringSb.substring(0, i));
      tmpSb.append("#!");
      tmpSb.append(URLDecoder.decode(
          queryStringSb.substring(i + 20, queryStringSb.length()), "UTF-8"));
      queryStringSb = tmpSb;
    }

    i = queryStringSb.indexOf("_escaped_fragment_");
    if (i != -1) {
      StringBuilder tmpSb = new StringBuilder(queryStringSb.substring(0, i));
      tmpSb.append("#!");
      tmpSb.append(URLDecoder.decode(
          queryStringSb.substring(i + 19, queryStringSb.length()), "UTF-8"));
      queryStringSb = tmpSb;
    }

    if (queryStringSb.indexOf("#!") != 0) {
      queryStringSb.insert(0, '?');
    }
    queryString = queryStringSb.toString();

    return queryString;
  }

  private FilterConfig filterConfig = null;

  /**
   * Destroys the filter configuration
   */
  public void destroy() {
    this.filterConfig = null;
  }

  /**
   * Filters all requests and invokes headless browser if necessary
   */
  public void doFilter(ServletRequest request, ServletResponse response,
      FilterChain chain) throws IOException {
    System.out.println("crawl");
    if (filterConfig == null) {
      return;
    }
    System.out.println("crawl");

    HttpServletRequest req = (HttpServletRequest) request;
    HttpServletResponse res = (HttpServletResponse) response;
    String queryString = req.getQueryString();
    System.out.println("query:" + queryString);
    System.out.println("param:" + req.getParameterMap().toString());
    System.out.println("req:" + req);

    if ((queryString != null) && (queryString.contains("_escaped_fragment_"))) {
      System.out.println("in!!");
      StringBuilder pageNameSb = new StringBuilder("http://");
      pageNameSb.append(req.getServerName());
      if (req.getServerPort() != 0) {
        pageNameSb.append(":");
        pageNameSb.append(req.getServerPort());
      }
      pageNameSb.append(req.getRequestURI());
      queryString = rewriteQueryString(queryString);
      pageNameSb.append(queryString);

      final WebClient webClient = new WebClient(BrowserVersion.FIREFOX_3);
      webClient.setJavaScriptEnabled(true);
      String pageName = pageNameSb.toString();
      HtmlPage page = webClient.getPage(pageName);
      webClient.waitForBackgroundJavaScriptStartingBefore(2000);

      res.setContentType("text/html;charset=UTF-8");
      PrintWriter out = res.getWriter();
      out.println("<hr>");
      out.println("<center><h3>You are viewing a non-interactive page that is "
          + "intended for the crawler. You probably want to see this page: <a href=\""
          + pageName + "\">" + pageName + "</a></h3></center>");
      out.println("<hr>");

      out.println(page.asXml());
      webClient.closeAllWindows();
      out.close();

    } else {
      try {
        chain.doFilter(request, response);
      } catch (ServletException e) {
        e.printStackTrace();
      }
    }
  }

  /**
   * Initializes the filter configuration
   */
  public void init(FilterConfig filterConfig) {
    this.filterConfig = filterConfig;
  }

}


web.xml:

<?xml version="1.0" encoding="UTF-8"?>
<web-app xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
    xsi:schemaLocation="http://java.sun.com/xml/ns/javaee
                        http://java.sun.com/xml/ns/javaee/web-app_2_5.xsd"
    version="2.5" xmlns="http://java.sun.com/xml/ns/javaee">

  <filter>
    <filter-name>CrawlServlet</filter-name>
    <filter-class>crawltest.server.CrawlServlet</filter-class>
  </filter>

  <filter-mapping>
    <filter-name>CrawlServlet</filter-name>
    <url-pattern>/*</url-pattern>
  </filter-mapping>

  <!-- Servlets -->

  <!-- Default page to serve -->
  <welcome-file-list>
    <welcome-file>CrawlTest.html</welcome-file>
  </welcome-file-list>

</web-app>
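
For debugging, I figured a standalone HtmlUnit check like the one below
should at least isolate the snapshot generation from the filter mapping.
It uses the same HtmlUnit calls as the filter, just driven from main();
the localhost URL, port and the "home" token are only placeholders from
my dev-mode setup, not the real app:

import com.gargoylesoftware.htmlunit.BrowserVersion;
import com.gargoylesoftware.htmlunit.WebClient;
import com.gargoylesoftware.htmlunit.html.HtmlPage;

public class SnapshotCheck {
  public static void main(String[] args) throws Exception {
    // Same setup as in CrawlServlet.doFilter():
    WebClient webClient = new WebClient(BrowserVersion.FIREFOX_3);
    webClient.setJavaScriptEnabled(true);
    // Placeholder dev-mode URL and history token; adjust as needed.
    HtmlPage page = webClient.getPage("http://localhost:8888/CrawlTest.html#!home");
    webClient.waitForBackgroundJavaScriptStartingBefore(2000);
    System.out.println(page.asXml());
    webClient.closeAllWindows();
  }
}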
